In [2]:
import time
import pickle
import math
import scipy
import glob
import sys
import os

import zstandard as zstd
import pandas as pd
import numpy as np

from scipy.sparse import dok_matrix

scriptpath = "../"
sys.path.append(os.path.abspath(scriptpath))
from helpers import *

In [11]:
# Set of channels that we consider.
# NOTE(review): `channels_id` is bound again further down by
# `filtered_channels_index_id_mapping()`; which value is live depends on the
# order in which the cells are executed.
with open("/dlabdata1/youtube_large/jouven/channels_more_10k.pkl", 'rb') as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    channels_id = pickle.load(f)

In [None]:
# Users that appear more than once (duplicates) in the comment dump.
# NOTE(review): `dict_occurent_users` is not defined in this notebook, so it
# presumably comes from `from helpers import *`. The construction loop below
# looks authors up with `in`, reads `duplicate_users[author_id]` and increments
# it, so the result is presumably a dict author_id -> integer counter starting
# at 0 -- TODO confirm against helpers.
duplicate_users = dict_occurent_users()

#### Helper dictionaries

In [3]:
# Dictionary mapping the video_id to the channel_id.
# Loaded with pandas' pickle reader; the construction loop below only uses
# `.get(video_id)` and `[video_id]` on it.
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [3]:
# Channels that are in the set_crawler dataset and whose language is English.
# Going by the names, this yields a channel_id -> index mapping, the reverse
# index -> channel_id mapping, and the collection of channel ids -- TODO confirm
# against helpers.
# NOTE(review): this rebinds `channels_id`, which is also loaded from
# channels_more_10k.pkl in an earlier cell; the execution counts of the two
# cells are out of order, so check which binding is the intended one.
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [5]:
# Dictionary mapping the channel index to the total number of comments in this channel.
# It is read as a module-level global by `tf_idf_weight_occurences`.
with open("/dlabdata1/youtube_large/jouven/comments_per_channel_id.pkl", 'rb') as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    channels_idx_to_nb_comments = pickle.load(f)

#### Create the bipartite graph between the users and the channels using sparse matrices

In [6]:
def tf_idf_weight_occurences(graph_dict, user_idx, user_channels_occurences, nb_users,
                             channel_comment_counts=None):
    """Store the TF-IDF style weights of one user into ``graph_dict``.

    For every channel the user commented on, the weight is ``tf * idf`` with

    * ``tf``  = comments of the user on the channel / the user's largest
      per-channel comment count,
    * ``idf`` = ``log(nb_users / channel_comment_counts[channel])``.

    Parameters
    ----------
    graph_dict : dict
        Updated in place; receives one ``(channel, user_idx) -> weight`` entry
        per channel of ``user_channels_occurences``.
    user_idx : int
        Index of the user the counts belong to.
    user_channels_occurences : dict
        Maps a channel index to the number of comments of this user on it.
    nb_users : int
        Numerator of the idf term.
    channel_comment_counts : mapping, optional
        Maps a channel index to the count used as idf denominator. Defaults to
        the module-level ``channels_idx_to_nb_comments``.

    Returns
    -------
    None
    """
    # Nothing to weight; max() would raise ValueError on an empty dict.
    if not user_channels_occurences:
        return
    if channel_comment_counts is None:
        channel_comment_counts = channels_idx_to_nb_comments

    max_freq = max(user_channels_occurences.values())
    for channel, nb_comments in user_channels_occurences.items():
        tf = nb_comments / max_freq
        idf = math.log(nb_users / channel_comment_counts[channel])
        graph_dict[(channel, user_idx)] = tf * idf
        
def update_user_channel_occurences(users_channels, channel_idx):
    """Count one more comment on ``channel_idx`` in ``users_channels`` (in place).

    A channel that is not yet a key of the dictionary starts at 1.
    """
    users_channels[channel_idx] = users_channels.get(channel_idx, 0) + 1

In [None]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specified
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS

# Dictionary mapping (channel_idx, user_idx) -> TF-IDF weight, for the current chunk only
graph_dict = {}
# Dictionary counting the number of comments of the current user on each channel
user_channels_occurences = {}

# Author id of the previously read line (updated on every line)
user = ''
# Author id whose comments are currently accumulated in user_channels_occurences
active_user = None
begin_time = time.time()
# Number of channels (rows of the matrix)
nb_channels = len(channels_id)
# Number of users (columns of the matrix)
nb_users = 406925230
# Index of the current user (column of the matrix)
user_idx = -1
# Line counter, used for chunking and for the progress prints
idx = 1
# Number of the next chunk file
nb = 1
# A chunk is written to disk every `lines_per_chunk` input lines to bound memory usage
lines_per_chunk = 75000000

dir_1 = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_bipartite'
check_directory(dir_1)


def _save_graph_chunk(chunk_dict, chunk_nb):
    """Write ``chunk_dict`` ({(channel_idx, user_idx): weight}) to ``matrice<chunk_nb>.npz`` as CSR."""
    nb_entries = len(chunk_dict)
    rows = np.fromiter((key[0] for key in chunk_dict), dtype=np.int64, count=nb_entries)
    cols = np.fromiter((key[1] for key in chunk_dict), dtype=np.int64, count=nb_entries)
    # The weights are floating-point TF-IDF values; an integer dtype such as
    # uint16 would truncate them when the matrix is converted.
    weights = np.fromiter(chunk_dict.values(), dtype=np.float32, count=nb_entries)
    # Build the matrix from coordinate arrays rather than dict.update() on a
    # dok_matrix: that trick relies on dok_matrix being a dict subclass, which
    # newer SciPy releases no longer guarantee.
    chunk_matrix = scipy.sparse.coo_matrix((weights, (rows, cols)), shape=(nb_channels, nb_users))
    scipy.sparse.save_npz(os.path.join(dir_1, 'matrice' + str(chunk_nb) + '.npz'), chunk_matrix.tocsr())


# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    # Only lines that split into exactly 9 fields are used; field 0 is read as
    # the author id and field 2 as the video id.
    if len(line_split) == 9:
        author_id = line_split[0]
        channel_id = vid_to_channels.get(line_split[2])
        if channel_id in channels_id:
            channel_idx = dict_channel_ind[channel_id]

            # A new block starts when the author changed since the previous line
            # OR when no block was opened for this author yet. The second case
            # happens when the author's first lines were on channels that are not
            # considered; without it the comment would be added to the previous
            # user's counts.
            if author_id != user or author_id != active_user:
                # Flush the previous user's counts into the graph
                if len(user_channels_occurences) > 0:
                    tf_idf_weight_occurences(graph_dict, user_idx, user_channels_occurences, nb_users)

                user_channels_occurences = {}
                active_user = author_id

                if author_id in duplicate_users:
                    duplicate_users[author_id] += 1
                    # Only the first block of a duplicate user gets a user index
                    if duplicate_users[author_id] <= 1:
                        user_idx += 1
                else:
                    user_idx += 1

            # Comments of a duplicate user are counted only within its first block
            if author_id not in duplicate_users or duplicate_users[author_id] <= 1:
                update_user_channel_occurences(user_channels_occurences, channel_idx)

    if idx % lines_per_chunk == 0:
        # For space requirements, every `lines_per_chunk` lines the content of
        # graph_dict is saved as a CSR matrix and the dictionary is released.
        # Counts of the user being processed stay in user_channels_occurences
        # and are flushed into a later chunk.
        _save_graph_chunk(graph_dict, nb)
        graph_dict = {}
        # Record the last processed line number
        with open("/dlabdata1/youtube_large/jouven/sparse_matrix_construction/idx_bip.pkl", 'wb') as f:
            pickle.dump([idx], f)
        nb += 1
        print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
        begin_time = time.time()

    user = line_split[0]
    idx += 1

# Flush the last user: inside the loop a user is only flushed when the next
# user's block starts, so the final one would otherwise be lost.
if len(user_channels_occurences) > 0:
    tf_idf_weight_occurences(graph_dict, user_idx, user_channels_occurences, nb_users)
    user_channels_occurences = {}

# Save the remaining entries (CSR, like every other chunk)
_save_graph_chunk(graph_dict, nb)
graph_dict = {}

len(graph_dict)  32234357
32234357
32234357
line number: 100000000 time: 492.20108222961426
len(graph_dict)  32221994
32221994
32221994
line number: 200000000 time: 504.52183866500854


In [5]:
# Sum all the per-chunk matrices into a single (nb_channels, nb_users) CSR matrix.
# NOTE(review): the chunks are read from '.../sparse_matrices_bipartite_correct/'
# whereas the construction cell writes them to '.../sparse_matrices_bipartite';
# confirm that this is the intended directory.
path = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_bipartite_correct/'
# Sorted so that the summation order does not depend on the directory listing order
files = sorted(glob.glob(path + '*.npz'))
if not files:
    # Without this check an all-zero matrix would silently be produced and saved
    raise FileNotFoundError('No .npz chunk found in ' + path)

graph = scipy.sparse.csr_matrix((nb_channels, nb_users))
for file in files:
    graph = graph + scipy.sparse.load_npz(file)

In [6]:
# Save the final sparse matrix: `graph` is the (nb_channels, nb_users) sum of
# all the chunk matrices computed in the previous cell.
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/final_embedding_sparse_matrix/sparse_matrix_bipartite.npz', graph)