In [1]:
import json
import time
import pickle
import scipy.sparse
import sys
import random
import itertools
import os
import math
import glob

import zstandard as zstd
import pandas as pd
import numpy as np

from scipy.sparse import dok_matrix

In [2]:
# Dictionary mapping the video_id to the channel_id.
# NOTE: read_pickle unpickles the file, which can execute arbitrary code -- only load trusted files.
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [3]:
# Channels that are in the set_crawler dataset and whose language is English.
# NOTE: pickle can execute arbitrary code on load -- only read trusted files.
with open('/dlabdata1/youtube_large/olam/channels_id_filtered.pickle', 'rb') as f:
    # The `with` block closes the file on exit; no explicit close() is needed.
    channels_id = pickle.load(f)

# Dictionary mapping each channel id to an integer: the column/row of that channel in the sparse matrix.
# The ids are sorted first so that the numbering does not depend on the order stored in the pickle.
channel_dict = {channel_id: ind for ind, channel_id in enumerate(sorted(channels_id))}

# Keep the ids as a set for fast membership tests while scanning the comments.
channels_id = set(channels_id)

In [4]:
class Zreader:
    '''Stream the text lines of a zstandard-compressed file chunk by chunk.'''

    def __init__(self, file, chunk_size=16384):
        '''
        Open `file` for streaming decompression.
            PARAMETERS:
                - file: path of the compressed file to read
                - chunk_size: number of decompressed bytes requested on each read
        '''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''

    def __enter__(self):
        '''Allow use as a context manager so the file is closed deterministically.'''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''Close the underlying streams when leaving the `with` block.'''
        self.close()

    def close(self):
        '''Close the decompression stream and the underlying file handle.'''
        self.reader.close()
        self.fh.close()

    def readlines(self):
        '''
        Generator yielding the decompressed content one line at a time (without the trailing newline).
        Bytes that are not valid UTF-8 are dropped.
        '''
        import codecs

        # An incremental decoder keeps the bytes of a multi-byte character that is split
        # across two chunks instead of discarding them, as a per-chunk decode would.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")
        while True:
            raw = self.reader.read(self.chunk_size)
            # Test the raw bytes for end of stream: a non-empty chunk may legitimately
            # decode to an empty string (e.g. the first half of a multi-byte character).
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            # The last piece may be an incomplete line; keep it for the next chunk.
            self.buffer = lines[-1]

        # Emit the final line when the stream does not end with a newline.
        last_line = self.buffer + decoder.decode(b"", final=True)
        self.buffer = ''
        if last_line:
            yield last_line


In [5]:
# Number of rows and of columns of the (square) sparse matrix: one per channel id.
matrix_len = len(channels_id)

To evaluate the similarity between channels, we build what we call a proximity graph.
Because the dataset is sorted by user, we process the users one after another:
For each user:
	We select the channels this user commented in.
		
Then, since the comments carry no timestamp telling when they were posted, we build the edges of the graph from every possible pair (combination of two) of the selected channels.

In [6]:
def create_edges(graph_dict, user_edge):
    '''
    Add to the graph one edge per pair of distinct channels found in a user's channel list.
        PARAMETERS:
            - graph_dict: dictionary mapping an edge (tuple of two channel indices, smaller one first)
                          to the weight of that edge; updated in place
            - user_edge: channel indices the user commented in (duplicates are ignored)
    '''
    distinct_channels = sorted(set(user_edge))
    # A single channel (or none) cannot form an edge.
    if len(distinct_channels) < 2:
        return

    for first, second in itertools.combinations(distinct_channels, 2):
        edge = (first, second)
        # Each user contributes exactly 1 to every edge they generate.
        graph_dict[edge] = graph_dict.get(edge, 0) + 1
            
def create_edges_limited(graph_dict, user_edge, max_channels=100):
    '''
    Same as create_edges, but a user contributes at most `max_channels` channels.
    When the user has more distinct channels than that, a uniform random sample of
    `max_channels` of them is used, which bounds the number of edges one user can add.
        PARAMETERS:
            - graph_dict: dictionary mapping an edge (tuple of two channel indices, smaller one first)
                          to the weight of that edge; updated in place
            - user_edge: channel indices the user commented in (duplicates are ignored)
            - max_channels: maximum number of distinct channels kept for one user (default 100)
    '''
    distinct_channels = list(set(user_edge))
    if len(distinct_channels) > max_channels:
        # random.sample draws without replacement, so the kept channels stay distinct.
        distinct_channels = random.sample(distinct_channels, max_channels)

    # combinations() yields nothing for fewer than two channels, so no length guard is needed.
    for edge in itertools.combinations(sorted(distinct_channels), 2):
        graph_dict[edge] = graph_dict.get(edge, 0) + 1
        
            

In [7]:
def create_edges_normalized(graph_dict, user_edge):
    '''
    Add to the graph one edge per pair of distinct channels of a user, weighted by the user's activity.
    Every edge receives 1 / log2(number of entries in user_edge), duplicates included.
        PARAMETERS:
            - graph_dict: dictionary mapping an edge (tuple of two channel indices, smaller one first)
                          to the weight of that edge; updated in place
            - user_edge: channel indices the user commented in, one entry per comment
    '''
    # Count the comments before duplicates are removed.
    nb_comments = len(user_edge)
    distinct_channels = sorted(set(user_edge))
    if len(distinct_channels) < 2:
        return

    # Two distinct channels imply nb_comments >= 2, hence a logarithm of at least 1.
    weight = 1/(math.log(nb_comments, 2))
    for edge in itertools.combinations(distinct_channels, 2):
        graph_dict[edge] = graph_dict.get(edge, 0) + weight

            
def create_edges_limited_normalized(graph_dict, user_edge, max_channels=50):
    '''
    Same as create_edges_normalized, but a user contributes at most `max_channels` channels.
    When the user has more distinct channels than that, a uniform random sample of
    `max_channels` of them is used. The previous version only sampled (down to 50) above
    100 distinct channels, so users with 51 to 100 channels were not limited at all;
    the threshold and the sample size are now the same value.
        PARAMETERS:
            - graph_dict: dictionary mapping an edge (tuple of two channel indices, smaller one first)
                          to the weight of that edge; updated in place
            - user_edge: channel indices the user commented in, one entry per comment
            - max_channels: maximum number of distinct channels kept for one user (default 50)
    '''
    # The weight uses the total number of comments, counted before de-duplication and sampling.
    nb_comments = len(user_edge)
    distinct_channels = list(set(user_edge))
    if len(distinct_channels) > max_channels:
        distinct_channels = random.sample(distinct_channels, max_channels)

    if len(distinct_channels) < 2:
        return

    # Two distinct channels imply nb_comments >= 2, hence a logarithm of at least 1.
    weight = 1/(math.log(nb_comments, 2))
    for edge in itertools.combinations(sorted(distinct_channels), 2):
        graph_dict[edge] = graph_dict.get(edge, 0) + weight

In [8]:
def check_directory(dir_1):
    '''
    Make sure the directory `dir_1` exists, creating it (and any missing parent) when needed.
        PARAMETERS:
            - dir_1: path of the directory
    '''
    # exist_ok=True makes the call safe to repeat and avoids the window between a separate
    # existence check and the creation, during which another process could create the directory.
    os.makedirs(dir_1, exist_ok=True)

In [9]:
# Load the collection of user indices stored in occurent_users.pkl.
# NOTE: pickle can execute arbitrary code on load -- only read trusted files.
with open("/dlabdata1/youtube_large/jouven/occurent_users.pkl",'rb') as f:
    # The `with` block closes the file on exit; no explicit close() is needed.
    set_occurent_users = pickle.load(f)

In [10]:
# Counter, starting at 0, for every user index in set_occurent_users.
dict_user_occurence = dict.fromkeys(set_occurent_users, 0)
# Keep the indices themselves as a set.
set_occurent_users = set(set_occurent_users)

In [None]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specified
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# parameters
graph_dict_limited = {}   # edge (tuple of channel indices) -> weight, for the matrix being built
nb = 1                    # number used in the name of the next intermediate matrix file
idx = 1                   # 1-based number of the line being processed
user_channels = []        # channel indices collected for the user being processed
user_idx = -1             # running user number, compared with the keys of dict_user_occurence
user_bool = True          # False while the lines read belong to a user that must be skipped
# To bound memory use, the edge dictionary is written to disk and emptied
# every time it reaches this number of edges.
max_edges_in_memory = 75000000

user = ''
begin_time = time.time()

dir_1 = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_not_normalized/'
check_directory(dir_1)


def save_edges_as_csr(graph_dict, file_path):
    '''
    Save an edge dictionary as a (matrix_len x matrix_len) CSR matrix in npz format.
        PARAMETERS:
            - graph_dict: dictionary mapping an edge (tuple of two channel indices) to its weight
            - file_path: path of the .npz file to write
        RETURNS:
            - the number of non-zero entries of the saved matrix
    '''
    nb_edges = len(graph_dict)
    # Build the matrix through the public COO constructor. The previous
    # dict.update(dok_matrix, ...) shortcut only works while dok_matrix keeps its
    # entries in the dict it inherits from, which is an implementation detail of SciPy.
    rows = np.fromiter((edge[0] for edge in graph_dict), dtype=np.int64, count=nb_edges)
    cols = np.fromiter((edge[1] for edge in graph_dict), dtype=np.int64, count=nb_edges)
    # float64 is the dtype a default dok_matrix would have produced.
    weights = np.fromiter(graph_dict.values(), dtype=np.float64, count=nb_edges)
    graph_matrix = scipy.sparse.coo_matrix((weights, (rows, cols)), shape=(matrix_len, matrix_len)).tocsr()
    scipy.sparse.save_npz(file_path, graph_matrix)
    return graph_matrix.count_nonzero()


# Read each line from the reader
# NOTE(review): `user` is refreshed on every line, whereas `user_idx` only advances on lines
# that pass both filters below -- confirm that this matches how the indices stored in
# dict_user_occurence were numbered.
# NOTE(review): dict_user_occurence is modified by this loop; rebuild it before re-running the cell.
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        if vid_to_channels.get(line_split[2]) in channels_id:
            corr_channel = vid_to_channels[line_split[2]]
            if user_idx in dict_user_occurence:
                dict_user_occurence[user_idx] += 1
                if dict_user_occurence[user_idx] > 1:
                    user_bool = False
            else:
                user_bool = True

            if user_bool:
                if line_split[0] == user:
                    user_channels.append(channel_dict[corr_channel])
                else:
                    # A new user starts: turn the channels of the previous one into edges.
                    if len(user_channels) >= 2:
                        create_edges(graph_dict_limited, user_channels)
                    if len(graph_dict_limited) >= max_edges_in_memory:
                        # Both numbers printed should be equal: edges in memory, then non-zeros saved.
                        print(len(graph_dict_limited))
                        print(save_edges_as_csr(graph_dict_limited, dir_1 + 'matrice' + str(nb) + '.npz'))
                        # Release the memory held by the edges just saved.
                        graph_dict_limited = {}
                        # Record the number of the last line covered by the saved matrices.
                        with open("/dlabdata1/youtube_large/jouven/sparse_matrix_construction/idx.pkl",'wb') as f:
                            pickle.dump([idx], f)
                        nb += 1
                        print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
                        begin_time = time.time()

                    user_channels = []
                    user_channels.append(channel_dict[corr_channel])
                    user_idx += 1
            else:
                if line_split[0] != user:
                    user_idx += 1
    user = line_split[0]
    idx += 1

# Edges are only created when the next user starts, so the channels of the
# last user of the file still have to be turned into edges here.
if len(user_channels) >= 2:
    create_edges(graph_dict_limited, user_channels)

# Save the remaining edges as the last intermediate sparse matrix
print('Non-zero entries of the last matrix: ' + str(save_edges_as_csr(graph_dict_limited, dir_1 + 'matrice' + str(nb) + '.npz')))
graph_dict_limited = {}


75003284
75003284
line number: 11610351 time: 457.410835981369
75043406
75043406
line number: 22631427 time: 438.7094466686249
75031026
75031026
line number: 34181573 time: 450.247287273407
75431530
75431530
line number: 45289236 time: 428.6098692417145
75002842
75002842
line number: 56874227 time: 469.23018431663513
75003890
75003890
line number: 68749190 time: 496.36649656295776
75000022
75000022
line number: 80107999 time: 483.4092221260071
75347892
75347892
line number: 90798800 time: 473.2226197719574
75000060
75000060
line number: 102582969 time: 430.93677973747253
75012359
75012359
line number: 114022771 time: 510.14119267463684
75043388
75043388
line number: 125422838 time: 417.6785657405853
75010113
75010113
line number: 136176314 time: 449.5446572303772
75000987
75000987
line number: 147673788 time: 420.4718449115753
75194874
75194874
line number: 159603977 time: 521.9681289196014
75001352
75001352
line number: 170869853 time: 534.2935223579407
75062682
75062682
line number: 

In [None]:
# Load every intermediate sparse matrix and add them up, so that the weights of
# all intermediate matrices end up in a single matrix.
# An empty CSR matrix of the right shape is the starting point of the sum.
graph = scipy.sparse.csr_matrix((matrix_len, matrix_len))
# NOTE(review): make sure this is the directory the construction cell wrote its matrices to.
path = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_limited_normalized_50/'
# glob returns the files in arbitrary order; sorting fixes the summation order so that
# floating-point rounding, and therefore the saved result, is reproducible.
files = sorted(glob.glob(path + '*.npz'))
for file in files:
    graph += scipy.sparse.load_npz(file)

# Save the final sparse matrix
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/final_embedding_sparse_matrix/sparse_matrix_limited_normalized_50.npz', graph)


In [53]:
# Make sure set_occurent_users is a set (it is built as a list in another cell).
set_occurent_users = set(set_occurent_users)

In [38]:
# Rebuild the list of user indices from the users_to_idx_<i>.pkl chunk files. For each
# non-empty chunk the list receives:
#   - every index from the previous chunk's maximum to this chunk's minimum (both included);
#   - every index between this chunk's minimum and maximum that the chunk does not contain.
# NOTE(review): the first range includes both end points, which are indices present in
# the chunks -- confirm this is intended.
set_occurent_users = []

path = '/dlabdata1/youtube_large/jouven/users/'
nb_chunks_seen = 0
last_max = 0
# The chunk files are numbered from 1 to 910.
for i in range(1, 911):
    # NOTE: pickle can execute arbitrary code on load -- only read trusted files.
    with open(path + 'users_to_idx_'+str(i)+'.pkl', 'rb') as f:
        # Dedicated name: reusing `channels_id` here would overwrite the set of
        # channel ids that the matrix construction cells rely on.
        users_to_idx = pickle.load(f)
    if len(users_to_idx) > 0:
        chunk_indices = set(users_to_idx.values())
        min_value = min(chunk_indices)
        max_value = max(chunk_indices)
        if nb_chunks_seen >= 1:
            set_occurent_users += list(np.arange(last_max, min_value+1))
        set_occurent_users += list(set(np.arange(min_value, max_value+1)) - chunk_indices)

        last_max = max_value
        nb_chunks_seen += 1

In [52]:
# Counter, starting at 0, for every user index in set_occurent_users.
dict_user_occurence = dict.fromkeys(set_occurent_users, 0)

In [42]:
# Save set_occurent_users so that it can be reloaded without being recomputed.
with open('/dlabdata1/youtube_large/jouven/occurent_users.pkl', 'wb') as f:
    # The `with` block closes the file on exit; no explicit close() is needed.
    pickle.dump(set_occurent_users, f)

In [14]:
# Scratch check: operands for a set difference, one of them built from np.arange.
a = set(np.arange(3, 8))
b = set([4, 5])

In [18]:
# Elements of a that are not in b.
a-b

{3, 6, 7}

In [34]:
# Scratch check: a plain list and a list built from np.arange, to be concatenated.
a = [1, 2, 3]
b = list(np.arange(5, 6))

In [35]:
# + on two lists puts the elements of b after those of a.
print(a+b)

[1, 2, 3, 5]
