In [2]:
import time
import pickle
import math
import scipy
import glob

import zstandard as zstd
import pandas as pd
import numpy as np

from scipy.sparse import dok_matrix
from helpers import *

#### Helpers dictionarries

In [3]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [3]:
# Channels that are in set_crawler dataset and also in which the language is in english
with open('/dlabdata1/youtube_large/olam/channels_id_filtered.pickle', 'rb') as f:
    channels_id = pickle.load(f)
f.close()
channels_id = sorted(channels_id)
# Dictionnary mapping the channel id to an integer corresponding to the row of the sparse matrix.
channel_dict = {}
for ind, channel_id in enumerate(channels_id):
    channel_dict[channel_id] = ind
channels_id = set(channels_id)

In [5]:
channel_dict

{'UC--24Q3_ZQeFmgJE-Um5QZQ': 0,
 'UC--2EbuIuBOGgGqPiJxi3rg': 1,
 'UC--3c8RqSfAqYBdDjIG3UNA': 2,
 'UC--3ozenTzry39-xMqAPQlw': 3,
 'UC--6E6EpybocdaNlPhvK_9A': 4,
 'UC--70ql_IxJmhmqXqrkJrWQ': 5,
 'UC--7oyGW0N7fMf164-ZXljQ': 6,
 'UC--94OGFAzYDOJb0eXgMjew': 7,
 'UC--BMyA2X4a9PGAo3lTuopg': 8,
 'UC--EwQJeJ6SKlJdswehQH4g': 9,
 'UC--HPaiKtsr5Ox2Bd7vJMKA': 10,
 'UC--LAVm36WiCmqQbekV17qQ': 11,
 'UC--MxpGXJ3LVD8KvlNzRlcA': 12,
 'UC--QN7syH6Tj_9jVWhLshzQ': 13,
 'UC--SaadDxRXz729rtbQqyBA': 14,
 'UC--TqCe_AxlW6MNGHwBWhng': 15,
 'UC--WESWtg3ttfb06iXfcmIA': 16,
 'UC--Wn0KWXRaChNrKChh9kYg': 17,
 'UC--ZshDJixmdaEZys42EXkg': 18,
 'UC--aOG_a2afAna8t3xvRkuw': 19,
 'UC--b720KocplK9zh0peruqg': 20,
 'UC--bRDxn5YWEiqs3p8zzvlw': 21,
 'UC--cWkBN0GxGcYs4IzQUxsA': 22,
 'UC--e1G28Uhx1XL4na8uxIhA': 23,
 'UC--g_wJaqDjFx57RPrtSpow': 24,
 'UC--i2rV5NCxiEIPefr3l-zQ': 25,
 'UC--j-yuiKFQhNAUYLt3v4fA': 26,
 'UC--jxlaAvQBLXKkvxecF8qQ': 27,
 'UC--kKSOGQLiREhjjagHz4-g': 28,
 'UC--lqu1v7hxajuC1TlmtT0Q': 29,
 'UC--mO_MwJryZfcxea

In [5]:
# Dictionnary mapping the channel index to the total number of comments in this channel
with open("/dlabdata1/youtube_large/jouven/channels_idx_to_nb_comments.pkl", 'rb') as f:
    channels_idx_to_nb_comments = pickle.load(f)
f.close()

#### Create the bipartite graph between the users and the channels using sparse matrices

In [6]:
def tf_idf_weight_occurences(graph_dict, user_idx, user_channels_occurences, nb_users):
    max_freq = max(user_channels_occurences.values())
    for channel in set(user_channels_occurences.keys()):
        tf = user_channels_occurences[channel] / max_freq
        idf = math.log(nb_users / channels_idx_to_nb_comments[channel])
        graph_dict[(channel, user_idx)] = tf * idf
        
def update_user_channel_occurences(users_channels, channel_idx):
    if channel_idx in users_channels:
        users_channels[channel_idx] += 1
    else:
        users_channels[channel_idx] = 1

In [12]:
with open("/dlabdata1/youtube_large/jouven/nb_users.pkl",'rb') as f:
     nb_users = pickle.load(f)
f.close()
print(nb_users)

[477003118]


In [None]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS
# Dictionnary containing edges of the bipartite graph
graph_dict = {}
# Dicionarray counting the numbers of comments of a given user on channels
user_channels_occurences = {}
idx = 1
user = ''
begin_time = time.time()
nb_users = 436488207
nb_channels = len(channel_dict)
nb = 1
user_idx = -1

dir_1 = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_bipartite_correct'
check_directory(dir_1)

# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        if vid_to_channels.get(line_split[2]) in channels_id:
            channel_idx = channel_dict[vid_to_channels[line_split[2]]]
            if line_split[0] == user:
                update_user_channel_occurences(user_channels_occurences, channel_idx)

            else:
                if len(user_channels_occurences) > 0:
                    tf_idf_weight_occurences(graph_dict, user_idx, user_channels_occurences, nb_users)
                user_idx += 1

                user_channels_occurences = {}
                update_user_channel_occurences(user_channels_occurences, channel_idx)
                
    if idx % 100000000 == 0:
        # For space requirements every 100 millions line create a dok matrix and
        # update it with the graph_dict dictionnary and then save it into csr format and then release memory
        graph_matrix = dok_matrix((nb_channels, nb_users))
        dict.update(graph_matrix, graph_dict)
        print('len(graph_dict) ', len(graph_dict))
        print(graph_matrix.count_nonzero())
        print(graph_matrix.tocsr().count_nonzero())
        graph_dict = {}
        # Save sparse matrix
        scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_bipartite_correct/matrice' + str(nb) + '.npz', graph_matrix.tocsr())
        with open("/dlabdata1/youtube_large/jouven/sparse_matrix_construction/idx_bip.pkl",'wb') as f:
             pickle.dump([idx], f)
        f.close()
        graph_matrix = []
        nb += 1
        print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
        begin_time = time.time()

    user = line_split[0]
    idx += 1
    

graph_matrix = dok_matrix((nb_channels, nb_users))
dict.update(graph_matrix, graph_dict)
graph_dict = {}
# Save sparse matrix
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_bipartite_correct/matrice' + str(nb) + '.npz', graph_matrix.tocsc())

graph_matrix = []


len(graph_dict)  32234357
32234357
32234357
line number: 100000000 time: 492.20108222961426
len(graph_dict)  32221994
32221994
32221994
line number: 200000000 time: 504.52183866500854


In [18]:
channels_idx_to_nb_comments[150726]

KeyError: 150726

In [4]:
nb_users = 436488207
nb_channels = len(channel_dict)

In [5]:
graph = dok_matrix((nb_channels, nb_users)).tocsr()
path = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/sparse_matrices_bipartite_correct/'
files = glob.glob(path + '*.npz')
for file in files: 
    graph += scipy.sparse.load_npz(file)

In [6]:
# Save the final sparse matrix
scipy.sparse.save_npz('/dlabdata1/youtube_large/jouven/final_embedding_sparse_matrix/sparse_matrix_bipartite.npz', graph)