In [198]:
import codecs
import json
import queue

import networkx as nx
import pandas as pd
import zstandard as zstd

### Channels that we consider

In [227]:
# Load the crawler's channel table into a DataFrame.
# NOTE(review): the "Unnamed: 0" column in the preview is presumably a row index
# that was saved with the CSV — confirm before relying on it.
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")

In [228]:
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31


In [229]:
# The channel id is the last path segment of the channel URL stored in `link`.
# Read `link` from this same frame: `channels_wanted` is not defined anywhere in
# the notebook, so the cell only worked with leftover kernel state.
channelcrawler['channel_id'] = channelcrawler['link'].str.split('/').str[-1]

In [230]:
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [231]:
channelcrawler[channelcrawler['channel_id'] == 'UCzzzrOhp92PkGrIwGH3_EEg']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
94338,Gaming,2018-04-22,http://www.youtube.com/channel/UCzzzrOhp92PkGr...,Holdik Stream,142000,35,UCzzzrOhp92PkGrIwGH3_EEg


In [233]:
# The distinct channel ids of the crawler table. Despite the name, `.unique()`
# returns an array rather than a Python set; it is used further down as the
# node list of the graph.
set_channelcrawler = channelcrawler['channel_id'].unique()

In [234]:
# Dictionary mapping the video_id to the channel_id.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a
# trusted location.
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

### Reading the comments file

In [183]:
class Zreader:
    '''Stream the lines of a zstandard-compressed text file.

    The file is decompressed chunk by chunk, so it never has to fit in memory.
    The object can be used as a context manager to make sure the underlying
    file handle is closed:

        with Zreader(path) as reader:
            for line in reader.readlines():
                ...
    '''

    def __init__(self, file, chunk_size=16384):
        '''Open `file` for streaming decompression.

        file: path of the compressed file.
        chunk_size: number of decompressed bytes requested per read.
        '''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''

    def close(self):
        '''Close the underlying file handle.'''
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def readlines(self):
        '''Generator that yields each line of the file, without its newline.

        Bytes that are not valid UTF-8 are dropped. A final line that is not
        terminated by a newline is yielded as well.
        '''
        # An incremental decoder keeps the bytes of a multi-byte character that
        # is split across two chunks instead of discarding them.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")
        while True:
            raw = self.reader.read(self.chunk_size)
            # Test the raw bytes: a non-empty chunk may legitimately decode to
            # an empty string (e.g. only the first byte of a character).
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            # The last piece is an incomplete line; keep it for the next chunk.
            self.buffer = lines[-1]

        # Flush whatever is left once the stream is exhausted.
        tail = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        if tail:
            yield tail


Initialise a graph with one node per channel (for example using networkx), then read each individual user trajectory out of the large file as a sequence of channels (for example, a user that visited channel A, then channel B, then channel C can be summarized as a trajectory A->B->C). For the simple approach, we consider each sequential pair of channels that a user visited, here (A,B) and (B,C), and add a directed edge with weight 1 between them: A->B, B->C (or add 1 to the weight of an already existing edge).

In [261]:
def initialize_graph(channels=None):
    '''Initialize the graph.

    Builds a directed graph that contains one node per channel and no edges.

    channels: iterable of channel ids to add as nodes. Defaults to the
        notebook-level `set_channelcrawler`.
    Returns the new `nx.DiGraph`.
    '''
    if channels is None:
        channels = set_channelcrawler
    graph = nx.DiGraph()
    graph.add_nodes_from(channels)
    return graph


graph = initialize_graph()

In [262]:
def add_edge(graph, user_edge):
    '''Count one more transition along `user_edge` in `graph`.

    graph: graph object updated in place.
    user_edge: pair (source, destination) of node ids.

    A missing edge is created with weight 1; an edge that is already present
    gets its 'weight' attribute increased by 1.
    '''
    source, dest = user_edge[0], user_edge[1]
    if not graph.has_edge(*user_edge):
        graph.add_edge(source, dest, weight=1)
        return
    graph[source][dest]['weight'] += 1

In [263]:
# Debug limit: stop once this many complete user trajectories have been read.
# Set to None to process the whole comments file.
MAX_USERS = 2

# Adjust chunk_size as necessary -- defaults to 16,384 if not specified
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)
lines = reader.readlines()

# The first line is the header naming the columns; show it and move on.
header = next(lines, None)
if header is not None:
    print(header.replace('"', '').split(','))

user = None          # author whose trajectory is currently being read
prev_channel = None  # channel of that author's previous (mapped) comment
users_done = 0       # number of trajectories fully read so far

try:
    # Read each line from the reader
    for line in lines:
        line_split = line.replace('"', '').split(',')
        if len(line_split) < 3:
            continue  # blank or truncated line: no video_id column

        # Comments on videos that have no known channel are ignored.
        channel = vid_to_channels.get(line_split[2])
        if channel is None:
            continue

        if line_split[0] != user:
            # A new author starts: the previous trajectory is complete.
            if user is not None:
                users_done += 1
            user = line_split[0]
            prev_channel = None
            if MAX_USERS is not None and users_done >= MAX_USERS:
                break

        # Each pair of consecutive channels of one author is one transition.
        if prev_channel is not None:
            add_edge(graph, (prev_channel, channel))
        prev_channel = channel
finally:
    # Release the file handle even if the loop stops early or fails.
    reader.fh.close()

['author_id', 'id', 'video_id', 'parent_id', 'crawled_at', 'likes', 'replies', 'author', 'content']


In [260]:
list(graph.edges())

[('UCBXNpF6k2n8dsI6nBH8q4sQ', 'UCpB959t8iPrxQWj7G6n0ctQ'),
 ('UCM2ERkgV3P1_6MAyxa51rxA', 'UCtwD0AlYSlAYv7eXu8UxtEg'),
 ('UCpB959t8iPrxQWj7G6n0ctQ', 'UCpB959t8iPrxQWj7G6n0ctQ'),
 ('UCpB959t8iPrxQWj7G6n0ctQ', 'UCM2ERkgV3P1_6MAyxa51rxA')]

### Second version

In [None]:
set_videos = set()
video_ids = set()

# Debug limit: stop once this many complete user trajectories have been printed.
# Set to None to process the whole comments file.
MAX_USERS = 2

# Adjust chunk_size as necessary -- defaults to 16,384 if not specified
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)
lines = reader.readlines()

# The first line is the header naming the columns; show it and move on.
header = next(lines, None)
if header is not None:
    print(header.replace('"', '').split(','))

user = None      # author whose trajectory is currently being read
user_data = []   # channels commented on by that author, in file order
users_done = 0   # number of trajectories fully read so far

try:
    # Read each line from the reader
    for line in lines:
        line_split = line.replace('"', '').split(',')
        if len(line_split) < 3:
            continue  # blank or truncated line: no video_id column

        # Comments on videos that have no known channel are ignored.
        channel = vid_to_channels.get(line_split[2])
        if channel is None:
            continue

        if line_split[0] != user:
            # A new author starts: the previous trajectory is complete.
            if user is not None:
                print(user_data)
                users_done += 1
            user = line_split[0]
            user_data = []
            if MAX_USERS is not None and users_done >= MAX_USERS:
                break

        # Also records the first channel of a new author, which used to be
        # dropped when the trajectory was reset.
        user_data.append(channel)
    else:
        # The file ended without hitting the limit: emit the last trajectory.
        if user is not None:
            print(user_data)
finally:
    # Release the file handle even if the loop stops early or fails.
    reader.fh.close()