In [5]:
import json
import time
import pickle
import scipy.sparse
import sys
import os
import random
import itertools
import math
import glob

import zstandard as zstd
import pandas as pd
import numpy as np

from scipy.sparse import dok_matrix

scriptpath = "../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers import *

In [6]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = video_id_to_channel_id()

In [7]:
# Channels with more than 10k comments
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [8]:
# Set of duplicate users
duplicate_users = dict_occurent_users()

In the goal of evaluating the the similarity between channels we created what we call a proximity graph.
As the dataset is sorted according to each user we sequentially consider each user:
For each user:
	We select the channels this user commented in.
		
And then, since our comments don’t have a time reference when it was posted we decided to construct edges of the graph by doing all the possible combinations out of the selected channels.

In [9]:
'''
Function to add new edge
    PARAMETERS:
        - graph_dict: dictionnary mapping the edge (tuple of channel indices) with the weight of that edge
        - user_edge_channel_id: new edge to be added in graph_dict
'''

def create_edges(graph_dict, user_edge):
    
    user_edge = list(set(user_edge))
    sorted_user_edge = sorted(user_edge)
    if len(user_edge) >= 2:
        for comb in itertools.combinations(sorted_user_edge, 2):
            user_edge = (comb[0], comb[1])
            if user_edge in graph_dict:
                graph_dict[user_edge] += 1
            else:
                graph_dict[user_edge] = 1
            
def create_edges_limited(graph_dict, user_edge):
    
    user_edge = list(set(user_edge))
    if len(user_edge) > 100:
        user_edge = random.sample(user_edge, 100)
    
        
    sorted_user_edge = sorted(user_edge)
    if len(user_edge) >= 2:
        for comb in itertools.combinations(sorted_user_edge, 2):
            user_edge = (comb[0], comb[1])

            if user_edge in graph_dict:
                graph_dict[user_edge] += 1
            else:
                graph_dict[user_edge] = 1
        
            

In [10]:
def create_edges_normalized(graph_dict, user_edge):
    
    nb_comments = len(user_edge)
    user_edge = list(set(user_edge))
    sorted_user_edge = sorted(user_edge)
    
    if len(user_edge) >= 2:
        for comb in itertools.combinations(sorted_user_edge, 2):

            user_edge = (comb[0], comb[1])
            if user_edge in graph_dict:
                graph_dict[user_edge] += 1/(math.log(nb_comments, 2))
            else:
                graph_dict[user_edge] = 1/(math.log(nb_comments, 2))

            
def create_edges_limited_normalized(graph_dict, user_edge):
    
    nb_comments = len(user_edge)
    user_edge = list(set(user_edge))
    if len(user_edge) > 100:
        user_edge = random.sample(user_edge, 50)
    
    if len(user_edge) >= 2:
        sorted_user_edge = sorted(user_edge)
        for comb in itertools.combinations(sorted_user_edge, 2):
            user_edge = (comb[0], comb[1])
            if user_edge in graph_dict:
                graph_dict[user_edge] += 1/(math.log(nb_comments, 2))
            else:
                graph_dict[user_edge] = 1/(math.log(nb_comments, 2))

In [None]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS

# Dictionnary counting the number of time (channel_idx, channel2_idx) appears
graph_dict_limited = {}
# Indices
nb = 1
idx = 1
# Channels that a user have commented
user_channels = []
# Number of channels, Row and columns length of the sparse matrix
matrix_len = len(channels_id)

user = ''
begin_time = time.time()

dir_1 = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_normalized/'
check_directory(dir_1)

# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        author_id = line_split[0]
        if vid_to_channels.get(line_split[2]) in channels_id:
            corr_channel = dict_channel_ind[vid_to_channels[line_split[2]]]
            if author_id == user:
                # if user is a duplicate user
                if author_id in duplicate_users:
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
            else:
                if len(user_channels) >= 2:
                    create_edges_normalized(graph_dict_limited, user_channels)
                user_channels = []
                
                
                if len(graph_dict_limited) >= 75000000:
                    # For space requirements every 75 millions line create a dok matrix and
                    # update it with the graph_dict dictionnary and then save it into csr format and then release memory
                    graph_matrix = dok_matrix((matrix_len, matrix_len))
                    dict.update(graph_matrix, graph_dict_limited)
                    graph_dict_limited = {}
                    # Save sparse matrix
                    scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_normalized/matrice' + str(nb) + '.npz', graph_matrix.tocsr())
                    with open("/dlabdata1/youtube_large/jouven/sparse_matrix_construction/idx.pkl",'wb') as f:
                         pickle.dump([idx], f)
                    f.close()
                    graph_matrix = []
                    nb += 1
                    print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
                    begin_time = time.time()
                        
                # If user is a duplicate user
                if author_id in duplicate_users:
                    duplicate_users[author_id] += 1
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
           
    user = line_split[0]
    idx += 1
    

# Store the graph sparse matrix
graph_matrix = dok_matrix((matrix_len, matrix_len))
dict.update(graph_matrix, graph_dict_limited)
graph_dict_limited = {}
# Save sparse matrix
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_normalized/matrice' + str(nb) + '.npz', graph_matrix.tocsr())

graph_matrix = []


line number: 16391975 time: 353.99390864372253
line number: 32611555 time: 361.09104561805725
line number: 48851453 time: 355.9911952018738
line number: 65571372 time: 361.7573125362396
line number: 82285178 time: 372.3876950740814
line number: 98445923 time: 370.0951569080353
line number: 114797422 time: 370.25587224960327
line number: 130596556 time: 363.0002465248108
line number: 147251135 time: 373.1519811153412
line number: 164046703 time: 368.72952008247375
line number: 181193306 time: 366.0931329727173
line number: 197753733 time: 362.5507221221924
line number: 214182325 time: 355.13584661483765
line number: 231202737 time: 378.5303883552551
line number: 247794613 time: 369.0660300254822
line number: 264178397 time: 367.4828929901123
line number: 281666402 time: 372.63997507095337
line number: 298335403 time: 351.59055948257446
line number: 313865210 time: 346.08931255340576
line number: 330312904 time: 357.806275844574
line number: 346772597 time: 349.52210116386414
line number

In [None]:
'''
Enable to load every sparse matrix to form only one by adding the weights of all intermediate sparse matrix
'''
graph = dok_matrix((matrix_len, matrix_len)).tocsr()
path = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_limited_normalized_50/'
files = glob.glob(path + '*.npz')
for file in files: 
    graph += scipy.sparse.load_npz(file)
    
# Save the final sparse matrix
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/final_embedding_sparse_matrix/sparse_matrix_limited_normalized_50.npz', graph)
