In [None]:
import codecs
import gzip
import json
import pickle
import queue
import sys
import time

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse
import zstandard as zstd
from scipy.sparse import dok_matrix

In [None]:
# Dictionary mapping the video_id to the channel_id.
# It is queried with `.get(video_id)` while the comments are processed further down.
# NOTE: unpickling can execute arbitrary code -- only load files coming from a trusted source.
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [None]:
# Channels that are in the set_crawler dataset and whose language is English.
# NOTE: pickle.load can execute arbitrary code -- only load files coming from a trusted source.
# The `with` statement closes the file on exit, so no explicit close() is needed afterwards.
with open('../../dlabdata1/youtube_large/olam/channels_id.pickle', 'rb') as f:
    channels_id = pickle.load(f)

In [None]:
# Dictionary mapping each channel id to an integer: the row/column of that channel in the sparse matrix.
# The integer is the position of the channel when iterating over channels_id.
# NOTE(review): if channels_id is a set of strings, its iteration order may differ between Python
# processes, which would make matrices produced by different runs incompatible -- confirm it is ordered.
channel_dict = {channel_id: ind for ind, channel_id in enumerate(channels_id)}

In [None]:
class Zreader:
    '''
    Line-by-line reader for a zstandard-compressed text file.

    The file is decompressed as a stream, chunk_size bytes at a time, so it never
    has to fit in memory.
    '''

    def __init__(self, file, chunk_size=16384):
        '''
        Open the compressed file and prepare the streaming decompressor.
            PARAMETERS:
                - file: path of the .zst file to read
                - chunk_size: number of decompressed bytes requested per read
        '''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        # Text of the last, still incomplete, line of the chunks read so far
        self.buffer = ''

    def close(self):
        '''Close the underlying file handle (safe to call several times).'''
        fh = getattr(self, 'fh', None)
        if fh is not None:
            fh.close()

    def readlines(self):
        '''
        Generator method yielding the lines of the file one by one, without the
        trailing newline. The file handle is closed once the stream is exhausted.
        '''
        # An incremental decoder keeps the bytes of a multi-byte character that is
        # cut by a chunk boundary until the next chunk completes it. Decoding every
        # chunk on its own with errors="ignore" would silently drop such characters.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

        while True:
            raw = self.reader.read(self.chunk_size)
            # Test the raw bytes: a non-empty chunk may legitimately decode to ''
            # when it only holds the beginning of a multi-byte character.
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            # The last piece has no newline yet: keep it for the next chunk
            self.buffer = lines[-1]

        # End of stream: flush the decoder and emit the last line when the file
        # does not end with a newline (it would otherwise be lost).
        last_line = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        self.close()
        if last_line:
            yield last_line


In [None]:
# Number of rows and of columns of the (square) sparse matrix: one per entry of channels_id
matrix_len = len(channels_id)

In [None]:
def add_edge(graph_dict, user_edge_channel_id):
    '''
    Count one more occurrence of an edge between two channels.
        PARAMETERS:
            - graph_dict: dictionary mapping an edge (tuple of channel indices: source, destination)
                          to the weight of that edge; it is modified in place
            - user_edge_channel_id: indexable pair of channel ids (source first, destination second);
                                    both ids must be keys of the global channel_dict
    '''
    source_index = channel_dict[user_edge_channel_id[0]]
    dest_index = channel_dict[user_edge_channel_id[1]]
    edge_key = (source_index, dest_index)

    # First occurrence of the edge starts at 1, later ones increment the stored weight
    current_weight = graph_dict.get(edge_key)
    graph_dict[edge_key] = 1 if current_weight is None else current_weight + 1
        
# Adjust chunk_size as necessary -- defaults to 16,384 if not specified
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# Every LINES_PER_MATRIX input lines, the accumulated edge weights are written to disk as one
# sparse matrix and the in-memory dictionary is emptied to release memory.
LINES_PER_MATRIX = 100000000
MATRIX_PATH_PREFIX = '../../../dlabdata1/youtube_large/jouven/matrices_t/matrice'
# np.uint8 cannot represent a weight above 255: larger counts wrap around or raise, depending
# on the NumPy version. uint32 holds any count reachable within one batch of LINES_PER_MATRIX lines.
EDGE_WEIGHT_DTYPE = np.uint32


def save_graph_matrix(graph_dict, nb):
    '''
    Save the edge weights of graph_dict to disk as a sparse matrix in CSR format.
        PARAMETERS:
            - graph_dict: dictionary mapping an edge (tuple of channel indices) to its weight
            - nb: number appended to MATRIX_PATH_PREFIX to build the file name
        RETURNS:
            - the number of edges stored in the saved matrix
    '''
    nb_edges = len(graph_dict)
    rows = np.fromiter((edge[0] for edge in graph_dict), dtype=np.int64, count=nb_edges)
    cols = np.fromiter((edge[1] for edge in graph_dict), dtype=np.int64, count=nb_edges)
    weights = np.fromiter(graph_dict.values(), dtype=EDGE_WEIGHT_DTYPE, count=nb_edges)
    # The CSR matrix is built straight from the (row, column, weight) triplets. Filling a
    # dok_matrix with dict.update() relied on dok_matrix being a dict subclass, an
    # implementation detail that newer SciPy releases dropped.
    graph_matrix = scipy.sparse.csr_matrix((weights, (rows, cols)), shape=(matrix_len, matrix_len))
    scipy.sparse.save_npz(MATRIX_PATH_PREFIX + str(nb) + '.npz', graph_matrix)
    return graph_matrix.nnz


# Parameters
graph_dict = {}
# NOTE(review): the merge cell further down reads matrices_t/matrice79 to matrice90, so nb
# presumably did not start at 1 for that run -- confirm before re-running.
nb = 1
idx = 1
previous_channel = None  # channel of the previous kept comment of the current user
user = 'author_id'       # value of the first field on the header line
begin_time = time.time()

# Read each line from the reader
for line in reader.readlines():
    # NOTE(review): despite the .ndjson file name, the lines are parsed as comma-separated
    # values; lines that do not split into exactly 9 fields are skipped.
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        if idx == 1:
            print(line_split)

        else:
            # Field 2 is looked up as a video id; keep the comment only when the video belongs
            # to a known channel. channel_dict has one key per entry of channels_id, so this is
            # the same membership test with a constant-time lookup.
            corr_channel = vid_to_channels.get(line_split[2])
            if corr_channel in channel_dict:
                if line_split[0] == user:
                    # Same author as the previous kept comment: link the two channels
                    if previous_channel is not None:
                        add_edge(graph_dict, (previous_channel, corr_channel))
                else:
                    # New author: start a new chain of comments
                    user = line_split[0]
                previous_channel = corr_channel

    idx += 1
    if idx % LINES_PER_MATRIX == 0:
        nb_edges = save_graph_matrix(graph_dict, nb)
        print('Number of edges saved: ' + str(nb_edges))
        graph_dict = {}
        nb += 1
        print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
        begin_time = time.time()


# Save the edges accumulated since the last full batch
save_graph_matrix(graph_dict, nb)
graph_dict = {}

In [None]:
# Load every partial sparse matrix and add them up into a single matrix of edge weights.

# The first load used '../../' while every other read and write of these matrices uses
# '../../../'. The two only resolve to the same place when the notebook sits at most two
# directories below the filesystem root, so a single prefix is used for all files.
MATRICES_ROOT = '../../../dlabdata1/youtube_large/jouven/'

# (sub-directory, first matrix number, last matrix number + 1)
MATRIX_BATCHES = [
    ('matrices', 1, 79),
    ('matrices_t', 79, 91),
    ('matrices_s', 91, 104),
]

graph = None
for folder, first_nb, end_nb in MATRIX_BATCHES:
    for i in range(first_nb, end_nb):
        partial = scipy.sparse.load_npz(MATRICES_ROOT + folder + '/matrice' + str(i) + '.npz')
        # Accumulate in int64: adding the matrices in their stored (narrow) dtype can overflow
        # once the weights of the same edge are summed over many files.
        partial = partial.astype(np.int64)
        graph = partial if graph is None else graph + partial

In [None]:
# Save the final sparse matrix (the sum of all the partial matrices loaded above) in .npz format
scipy.sparse.save_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrix_graph.npz', graph)