## NOTEBOOK DESCRIPTION:

This notebook construct the bipartite graph between the channels and the users with tf-idf occurences using sparse matrices.

In [1]:
import time
import pickle
import math
import scipy
import glob
import sys
import os

import zstandard as zstd
import pandas as pd
import numpy as np

from scipy.sparse import dok_matrix

scriptpath = "/home/jouven/youtube_projects/"
sys.path.append(os.path.abspath(scriptpath))

from helpers.helpers_channels_more_300 import *

In [2]:
# Set of duplicate users
duplicate_users = dict_occurent_users()

#### Helpers dictionarries

In [3]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = video_id_to_channel_id()

In [2]:
# Channels that are in set_crawler dataset and also in which the language is in english
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [5]:
# Dictionnary mapping the channel index to the number of users that commented in this channel
with open("/dlabdata1/youtube_large/jouven/channels_more_10k/nb_users_per_channel_more_10k.pkl",'rb') as f:
     nb_users_per_channel = pickle.load(f)
f.close()

#### Create the bipartite graph between the users and the channels using sparse matrices

In [6]:
'''
Compute the tf-idf weight between the channels that user_idx has commented and user_idx.

PAREMETERS:
    - graph_dict: dictionnary containing the edges already computed {channel index -> user index}
    - user_idx: the user index that we consider
    - user_channels_occurences: Dictionnary mapping the channels where user_idx has commented to, to 
    the number of comments user_idx did on this channel
    - nb_users: Total number of users
    
'''
def tf_idf_weight_occurences(graph_dict, user_idx, user_channels_occurences, nb_users):
    max_freq = max(user_channels_occurences.values())
    for channel in set(user_channels_occurences.keys()):
        tf = user_channels_occurences[channel] / max_freq
        idf = math.log(nb_users / nb_users_per_channel[channel])
        graph_dict[(channel, user_idx)] = tf * idf
        
        
'''
Update the dictionnary `user_channels_occurences` keeping track of the occurences of the selected channels which are
the channels that a user has commented.

PARAMETERS:
    - user_channels_occurences: Dictionnary: channel index -> # of times this channel has been called
    - channel_idx: The channel index that we want to insert into user_channels_occurences
'''
def update_user_channel_occurences(user_channels_occurences, channel_idx):
    # If the channel already exists in the dictionnary: update the # of occurence
    if channel_idx in user_channels_occurences:
        user_channels_occurences[channel_idx] += 1
    else:
        user_channels_occurences[channel_idx] = 1

In [None]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS

# Dictionnary containing channel, user tuple
graph_dict = {}
# Dictionarray counting the numbers of comments of a given user on channels
user_channels_occurences = {}

user = ''
begin_time = time.time()
# Numbers of channels
nb_channels = len(channels_id)
# Numbers of users
nb_users = 406925230
# Users idx
user_idx = -1
# Indices used for prints
idx = 1
nb = 1

dir_1 = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/channels_more_10k/sparse_matrices_bipartite_channel_more_10k'
check_directory(dir_1)

# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        author_id = line_split[0]
        if vid_to_channels.get(line_split[2]) in channels_id:
            channel_idx = dict_channel_ind[vid_to_channels[line_split[2]]]
            
            # If an edge from the same author is processed.
            if author_id == user:
                
                if author_id in duplicate_users:
                    # If the author is part of the set of duplicate users then we have to make sure
                    # that it is the first time that we see that user 
                    if duplicate_users[author_id] <= 1:
                        update_user_channel_occurences(user_channels_occurences, channel_idx)
                else:
                    update_user_channel_occurences(user_channels_occurences, channel_idx)
            else:
                if len(user_channels_occurences) > 0:
                    tf_idf_weight_occurences(graph_dict, user_idx, user_channels_occurences, nb_users)
                
                user_channels_occurences = {}
                
                # If the author is part of the set of duplicate users
                if author_id in duplicate_users:
                    # Add one to this user to indicate that this duplicate user has already been processed
                    duplicate_users[author_id] += 1
                    # we have to make sure that it is the first time that we see that user 
                    if duplicate_users[author_id] <= 1:
                        user_idx += 1
                        update_user_channel_occurences(user_channels_occurences, channel_idx)
                        
                else:
                    user_idx += 1
                    update_user_channel_occurences(user_channels_occurences, channel_idx)
                    
                    

    if len(graph_dict) >= 100000000:
        # For space requirements every 100 millions line create a dok matrix and
        # update it with the graph_dict dictionnary and then save it into csr format and then release memory
        graph_matrix = dok_matrix((nb_channels, nb_users))
        dict.update(graph_matrix, graph_dict)
        graph_dict = {}
        # Save sparse matrix
        scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/sparse_matrix_construction/channels_more_10k/sparse_matrices_bipartite_channel_more_10k/matrice' + str(nb) + '.npz', graph_matrix.tocsr())
        with open("/dlabdata1/youtube_large/jouven/sparse_matrix_construction/idx_bip.pkl",'wb') as f:
             pickle.dump([idx], f)
        f.close()
        graph_matrix = []
        nb += 1
        print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
        begin_time = time.time()

    user = line_split[0]
    idx += 1
    
    

graph_matrix = dok_matrix((nb_channels, nb_users))
dict.update(graph_matrix, graph_dict)
graph_dict = {}
# Save sparse matrix
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/sparse_matrix_construction/channels_more_10k/sparse_matrices_bipartite_channel_more_10k/matrice' + str(nb) + '.npz', graph_matrix.tocsc())
graph_matrix = []

line number: 365366768 time: 1870.8561975955963
line number: 730660217 time: 1863.5573751926422


Since we splitted the data corresponding to the bipartite graph in many small sparse matrices, we have to sum the sparse matrices to create a single one having all the data

In [4]:
nb_channels = len(channels_id)
nb_users = 406925230

graph = dok_matrix((nb_channels, nb_users)).tocsr()
path = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/channels_more_10k/sparse_matrices_bipartite_channel_more_10k/'
files = glob.glob(path + '*.npz')
for file in files: 
    graph += scipy.sparse.load_npz(file)

In [5]:
# Save the final sparse matrix
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/final_embedding_sparse_matrix/channels_more_10k/sparse_matrix_bipartite_channels_more_10k.npz', graph)