In [1]:
import json
import queue
import time
import pickle
import scipy.sparse
import sys
import random
import itertools
import os

import zstandard as zstd
import pandas as pd
import numpy as np

from scipy.sparse import dok_matrix

In [2]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [3]:
# Channels that are in set_crawler dataset and also in which the language is in english
with open('../../../dlabdata1/youtube_large/olam/channels_id.pickle', 'rb') as f:
    channels_id = pickle.load(f)
f.close()

In [4]:
# Dictionnary to map the channel id to an integer corresponding to the column/row of the sparse matrix.
channel_dict = {}
for ind, channel_id in enumerate(channels_id):
    channel_dict[channel_id] = ind

In [5]:
channel_dict

{'UCUd_ZdE1xZnw3ZokDNVQRfA': 0,
 'UCBx85zuCdOQLOJEYpXgxKWg': 1,
 'UCAWUxHxJimnDkIYc8bWytFA': 2,
 'UCVXv8_dfJjqWluEFktmv12A': 3,
 'UCvBxvcqGSQkbs1BRafBIYSg': 4,
 'UCVtVt9DiH_fi2MMEfoCp3vQ': 5,
 'UCn409uMO5y-DttgR6lLJ1uQ': 6,
 'UCT5kUJ_k4gGDC6wxHFbkFMg': 7,
 'UCvTflPL_YBQdrHcKFI5xdYg': 8,
 'UCMHXMAeKkI6HXlPfLiYvo9g': 9,
 'UCOJEt1dxJYH1N6jnLfB6SeA': 10,
 'UCDD7W0KTFFXB9xbgjkt4yog': 11,
 'UCtOtfVsLUae569f6VUlPp0A': 12,
 'UCLaw3lSosehevwAzy6DtULQ': 13,
 'UCq6YOKEm3iw-kUev-ndCgaw': 14,
 'UCQmUR3tO2FLpCoVKuAHzxiQ': 15,
 'UCGVLFaQ8KvBZee2WPzYnVfw': 16,
 'UCbLQvBCqdpYpddMcdgIWGVg': 17,
 'UC__r9g8m5h4STrbaCjQQDbw': 18,
 'UC0SdCZAvbNsuc-0l7puyKuQ': 19,
 'UC6u4spnoeIUHtfUZxhG2RfQ': 20,
 'UCXfDZgMCqcyVcGCD6aZSjxA': 21,
 'UC9t_mns3yRuuEWAwtzQeGqA': 22,
 'UCEPsNDUhUm-7yZhUjQQNqwQ': 23,
 'UCTUsCFCgSjtlkmjQjbpMkNw': 24,
 'UC0w4rj4DdKmJbSFbAjJPRFw': 25,
 'UCHMKHitDB2Ij49zzQXKvAxw': 26,
 'UCc3OMr14nJEOi0B-dsA5Wnw': 27,
 'UCyQ2JR_JKR6QdBLuLF4FWIg': 28,
 'UC1nGy8IXLKqir7q4STHOdLA': 29,
 'UCtykdsdm9cBfh5JM8

In [6]:
class Zreader:

    def __init__(self, file, chunk_size=16384):
        '''Init method'''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''

    def readlines(self):
        '''Generator method that creates an iterator for each line of JSON'''
        while True:
            chunk = self.reader.read(self.chunk_size).decode("utf-8", errors="ignore")
            if not chunk:
                break
            lines = (self.buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]


In [7]:
# Row and columns length of the sparse matrix
matrix_len = len(channels_id)

In the goal of evaluating the the similarity between channels we created what we call a proximity graph.
As the dataset is sorted according to each user we sequentially consider each user:
For each user:
	We select the channels this user commented in.
		
And then, since our comments don’t have a time reference when it was posted we decided to construct edges of the graph by doing all the possible combinations out of the selected channels.

In [8]:
'''
Function to add new edge
    PARAMETERS:
        - graph_dict: dictionnary mapping the edge (tuple of channel indices) with the weight of that edge
        - user_edge_channel_id: new edge to be added in graph_dict
'''

def create_edges(graph_dict, user_edge):
    
    user_edge = list(set(user_edge))
    sorted_user_edge = sorted(user_edge)
    if len(user_edge) >= 2:
        for comb in itertools.combinations(sorted_user_edge, 2):
            user_edge = (comb[0], comb[1])
            if user_edge in graph_dict:
                graph_dict[user_edge] += 1
            else:
                graph_dict[user_edge] = 1
            
def create_edges_limited(graph_dict, user_edge):
    
    user_edge = list(set(user_edge))
    if len(user_edge) > 100:
        user_edge = random.sample(user_edge, 100)
    
        
    sorted_user_edge = sorted(user_edge)
    if len(user_edge) >= 2:
        for comb in itertools.combinations(sorted_user_edge, 2):
            user_edge = (comb[0], comb[1])

            if user_edge in graph_dict:
                graph_dict[user_edge] += 1
            else:
                graph_dict[user_edge] = 1
        
            

In [9]:
def create_edges_normalized(graph_dict, user_edge):
    
    nb_comments = len(user_edge)
    user_edge = list(set(user_edge))
    sorted_user_edge = sorted(user_edge)
    
    if len(user_edge) >= 2:
        for comb in itertools.combinations(sorted_user_edge, 2):

            user_edge = (comb[0], comb[1])
            if user_edge in graph_dict:
                graph_dict[user_edge] += 1/(math.log(nb_comments, 2))
            else:
                graph_dict[user_edge] = 1/(math.log(nb_comments, 2))

            
def create_edges_limited_normalized(graph_dict, user_edge):
    
    nb_comments = len(user_edge)
    user_edge = list(set(user_edge))
    if len(user_edge) > 100:
        user_edge = random.sample(user_edge, 100)
    
    if len(user_edge) >= 2:
        sorted_user_edge = sorted(user_edge)
        for comb in itertools.combinations(sorted_user_edge, 2):
            user_edge = (comb[0], comb[1])
            if user_edge in graph_dict:
                graph_dict[user_edge] += 1/(math.log(nb_comments, 2))
            else:
                graph_dict[user_edge] = 1/(math.log(nb_comments, 2))

In [10]:
def check_directory(dir_1):
    
    if not os.path.exists(dir_1): 
        os.makedirs(dir_1)

In [11]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# parameters
graph_dict_limited = {}
nb = 1
idx = 1
user_channels = []

user = ''
begin_time = time.time()

dir_1 = '../../../dlabdata1/youtube_large/jouven/sparse_matrices_limited/'
check_directory(dir_1)

# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        if vid_to_channels.get(line_split[2]) in channels_id:
            corr_channel = vid_to_channels[line_split[2]]
            if line_split[0] == user:
                user_channels.append(channel_dict[corr_channel])
            else:
                if len(user_channels) >= 2:
                    create_edges_limited(graph_dict_limited, user_channels)
                #print(graph_dict_limited)
                if len(graph_dict_limited) >= 100000000:
                    # For space requirements every 100 millions line create a dok matrix and
                    # update it with the graph_dict dictionnary and then save it into csr format and then release memory
                    graph_matrix = dok_matrix((matrix_len, matrix_len), dtype=np.uint8)
                    dict.update(graph_matrix, graph_dict_limited)
                    print('Size of matrix limited dok: ' + str(sys.getsizeof(graph_matrix)))
                    graph_dict_limited = {}
                    # Save sparse matrix
                    scipy.sparse.save_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrices_limited/matrice' + str(nb) + '.npz', graph_matrix.tocsr())

                    graph_matrix = []
                    nb += 1
                    print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
                    begin_time = time.time()
                
                user_channels = []
                user_channels.append(channel_dict[corr_channel])
                
    user = line_split[0]
    idx += 1
    

graph_matrix = dok_matrix((matrix_len, matrix_len), dtype=np.uint8)
dict.update(graph_matrix, graph_dict_limited)
print('Size of matrix limited dok: ' + str(sys.getsizeof(graph_matrix)))
graph_dict_limited = {}
# Save sparse matrix
scipy.sparse.save_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrices_limited/matrice' + str(nb) + '.npz', graph_matrix.tocsr())

graph_matrix = []


Size of matrix limited dok: 5368709240
line number: 42373928 time: 649.4229190349579
Size of matrix limited dok: 5368709240
line number: 84222810 time: 638.347460269928
Size of matrix limited dok: 5368709240
line number: 126241157 time: 660.987512588501
Size of matrix limited dok: 5368709240
line number: 168634588 time: 639.1796877384186
Size of matrix limited dok: 5368709240
line number: 210868843 time: 637.085568189621
Size of matrix limited dok: 5368709240
line number: 253257273 time: 833.4123694896698
Size of matrix limited dok: 5368709240
line number: 295565431 time: 914.4731848239899
Size of matrix limited dok: 5368709240
line number: 337691000 time: 747.7046039104462
Size of matrix limited dok: 5368709240
line number: 379928793 time: 725.7866387367249
Size of matrix limited dok: 5368709240
line number: 422012910 time: 727.3327612876892
Size of matrix limited dok: 5368709240
line number: 464260783 time: 735.3986999988556
Size of matrix limited dok: 5368709240
line number: 5062355

line number: 4061908297 time: 656.2832770347595
Size of matrix limited dok: 5368709240
line number: 4104131594 time: 678.5997030735016
Size of matrix limited dok: 5368709240
line number: 4146544696 time: 679.0403592586517
Size of matrix limited dok: 5368709240
line number: 4188605242 time: 644.819947719574
Size of matrix limited dok: 5368709240
line number: 4231118871 time: 650.9345409870148
Size of matrix limited dok: 5368709240
line number: 4273731921 time: 773.60125041008
Size of matrix limited dok: 5368709240
line number: 4316390536 time: 683.6960422992706
Size of matrix limited dok: 5368709240
line number: 4358375022 time: 674.8007941246033
Size of matrix limited dok: 5368709240
line number: 4400667307 time: 688.8127021789551
Size of matrix limited dok: 5368709240
line number: 4442823538 time: 667.1166181564331
Size of matrix limited dok: 5368709240
line number: 4485274348 time: 671.2076098918915
Size of matrix limited dok: 5368709240
line number: 4527553972 time: 666.040714979171

Size of matrix limited dok: 5368709240
line number: 8078583923 time: 661.0506911277771
Size of matrix limited dok: 5368709240
line number: 8120727699 time: 671.163868188858
Size of matrix limited dok: 5368709240
line number: 8162949931 time: 664.881097316742
Size of matrix limited dok: 5368709240
line number: 8205013649 time: 667.5764775276184
Size of matrix limited dok: 5368709240
line number: 8247477846 time: 688.8311886787415
Size of matrix limited dok: 5368709240
line number: 8289563495 time: 687.283887386322
Size of matrix limited dok: 5368709240
line number: 8332100393 time: 685.3922543525696
Size of matrix limited dok: 5368709240
line number: 8374567617 time: 676.7331066131592
Size of matrix limited dok: 5368709240
line number: 8417352635 time: 681.7582993507385
Size of matrix limited dok: 5368709240
line number: 8459746254 time: 668.1266622543335
Size of matrix limited dok: 5368709240
line number: 8502150146 time: 663.111524105072
Size of matrix limited dok: 5368709240
line num

In [2]:
'''
Enable to load every sparse matrix to form only one by adding the weights of all intermediate sparse matrix
'''
graph = scipy.sparse.load_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrices_limited/matrice' + str(1) + '.npz')
for i in range(2, 247):
    graph += scipy.sparse.load_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrices_limited/matrice' + str(i) + '.npz')

In [3]:
# Save the final sparse matrix
scipy.sparse.save_npz('../../../dlabdata1/youtube_large/jouven/sparse_matrix_limited.npz', graph)