## NOTEBOOK DESCRIPTION:

This notebook creates a sparse matrix of shape (#channels, #users).
The value at position (channel_i, user_i) is the number of times user_i commented on channel_i.
It also writes every (channel, user) pair to a training file for word2vecf.

In [1]:
import time
import pickle
import scipy
import glob
import sys
import os

import zstandard as zstd
import numpy as np

from scipy.sparse import dok_matrix

scriptpath = "../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_10k import *

In [2]:
# Users flagged as duplicates by the helper. Used further down as a mapping
# author_id -> counter: the counter is incremented every time a new run of
# comments by that author starts, and comments are only counted while the
# counter is <= 1.
duplicate_users = dict_occurent_users()

In [9]:
# Number of users: first element of the pickled sequence. It is used as the
# size of the user axis of the sparse matrices.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("/dlabdata1/youtube_large/jouven/channels_more_10k/nb_users.pkl", 'rb') as f:
    nb_users = pickle.load(f)[0]

#### Helper dictionaries

In [4]:
# Dictionary mapping a video_id to the channel_id of the channel that published it
vid_to_channels = video_id_to_channel_id()

In [5]:
# Selected channels and the channel id <-> index mappings
# (dict_channel_ind is looked up with a channel id to get its index;
# dict_ind_channel is presumably the inverse mapping -- confirm in the helper).
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

#### Compute the number of times each (channel, user) pair appears

In [6]:
def update_user_channel_occurences(user_channels_occurences, channel_idx, user_idx):
    '''
    Record one more comment of a user on a channel.

    The counter stored under the key (channel_idx, user_idx) is incremented by
    one; the key is created with the value 1 when it is not present yet.
    The dictionary is modified in place and nothing is returned.

    PARAMETER:
        - user_channels_occurences: dictionary: {(channel index, user index): number of comments, ...}
        - channel_idx: the index of the channel processed
        - user_idx: the index of the user processed
    '''
    pair = (channel_idx, user_idx)
    user_channels_occurences[pair] = user_channels_occurences.get(pair, 0) + 1

In [None]:
# Stream the compressed comment dump once and produce:
#   * partial sparse matrices (#channels x #users) holding the number of
#     comments of each user on each selected channel, one file per chunk of
#     lines read (the chunks have to be summed afterwards);
#   * a training file with one "<channel index> <user index>" line per counted comment.

# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS

# Number of training pairs after which the small string buffer is merged into the big one
TRAIN_MERGE_EVERY = 10000
# Number of training pairs after which the big string buffer is appended to the training file
TRAIN_WRITE_EVERY = 75000000
# Number of input lines after which the pending counts are saved as a partial matrix
MATRIX_SAVE_EVERY = 75000000

# Pending {(channel index, user index): number of comments} entries not yet saved to disk
graph_dict = {}
# Same kind of mapping, restricted to the run of comments of the user currently being read
user_channels_occurences = {}

# Author of the previous counted line; a different author_id starts a new run
user = ''
begin_time = time.time()
# Numbers of channels
nb_channels = len(channels_id)
# Users idx (incremented each time a new user is accepted, so the first one gets 0)
user_idx = -1
# Number of the line being read (1-based), used for chunking and prints
idx = 1
# Number of the next partial matrix file
nb = 1

dir_1 = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_10k'
check_directory(dir_1)
dir_2 = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/channels_more_10k/sparse_matrices_for_word2vecf'
check_directory(dir_2)
dir_3 = '/dlabdata1/youtube_large/jouven/channels_more_10k/users_index_mapping'
check_directory(dir_3)

training_data_path = dir_1 + '/training_data'


def _pair_counts_to_csr(pair_counts, shape):
    '''
    Build a CSR matrix of dtype uint32 from a {(row, col): count} dictionary.

    The matrix is assembled in COO form from the keys and values of the
    dictionary instead of calling dict.update on a dok_matrix, which depends
    on how SciPy stores DOK matrices internally.
    '''
    nb_pairs = len(pair_counts)
    rows = np.fromiter((pair[0] for pair in pair_counts), dtype=np.int64, count=nb_pairs)
    cols = np.fromiter((pair[1] for pair in pair_counts), dtype=np.int64, count=nb_pairs)
    counts = np.fromiter(pair_counts.values(), dtype=np.uint32, count=nb_pairs)
    return scipy.sparse.coo_matrix((counts, (rows, cols)), shape=shape, dtype=np.uint32).tocsr()


# Create (or empty) the training file where each line corresponds to a
# (word, context) = (channel, user) pair for word2vecf
with open(training_data_path, "w") as f:
    pass
# train is the big buffer written to disk, train_temp the small one that is
# appended to line by line (keeps the per-line string concatenation cheap)
train = ""
train_temp = ""
training_idx = 1


# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) >= 9:
        author_id = line_split[0]
        channel_id = vid_to_channels.get(line_split[2])
        if channel_id in channels_id:
            channel_idx = dict_channel_ind[channel_id]

            if author_id != user:
                # A new run starts: hand over the counts of the previous user
                graph_dict.update(user_channels_occurences)
                user_channels_occurences = {}

                if author_id in duplicate_users:
                    duplicate_users[author_id] += 1
                    if duplicate_users[author_id] <= 1:
                        user_idx += 1
                else:
                    user_idx += 1

            # Comments of a duplicate user are only counted while its counter is <= 1
            if author_id not in duplicate_users or duplicate_users[author_id] <= 1:
                update_user_channel_occurences(user_channels_occurences, channel_idx, user_idx)
                train_temp += str(channel_idx) + " " + str(user_idx) + "\n"
                training_idx += 1

            user = author_id

    if training_idx % TRAIN_MERGE_EVERY == 0:
        # For speed purpose
        train = train + train_temp
        train_temp = ""

    if training_idx % TRAIN_WRITE_EVERY == 0:
        with open(training_data_path, "a") as f:
            f.write(train)
        train = ""
        train_temp = ""

    if idx % MATRIX_SAVE_EVERY == 0:
        # For space requirements, regularly turn the pending counts into a
        # sparse matrix, save it in CSR format and release the memory
        graph_matrix = _pair_counts_to_csr(graph_dict, (nb_channels, nb_users))
        graph_dict = {}
        # Save sparse matrix
        scipy.sparse.save_npz(dir_2 + '/matrice' + str(nb) + '.npz', graph_matrix)
        # Remember the last line number for which a partial matrix was saved
        with open("/dlabdata1/youtube_large/jouven/sparse_matrix_construction/idx_bip.pkl", 'wb') as f:
            pickle.dump([idx], f)
        # Drop the reference so the matrix can be garbage collected
        graph_matrix = []
        nb += 1
        print('line number: ' + str(idx) + ' time: ' + str(time.time() - begin_time))
        begin_time = time.time()

    idx += 1


# The counts of the last user are only handed over when another user starts,
# so they have to be flushed explicitly once the input is exhausted
graph_dict.update(user_channels_occurences)
user_channels_occurences = {}

# Store graph (same CSR format as the other partial matrices)
graph_matrix = _pair_counts_to_csr(graph_dict, (nb_channels, nb_users))
graph_dict = {}
scipy.sparse.save_npz(dir_2 + '/matrice' + str(nb) + '.npz', graph_matrix)
graph_matrix = []

# Store the remaining (channel, user) data into the training file; train_temp
# still holds the pairs recorded since the last merge into train
with open(training_data_path, "a") as f:
    f.write(train + train_temp)

line number: 75000000 time: 541.7293963432312
line number: 150000000 time: 767.2881526947021


Many small sparse matrices have been produced because of memory constraints. The final sparse matrix is obtained by summing all the partial sparse matrices constructed above.

In [None]:
# Sum all the partial (#channels x #users) matrices into the final sparse
# matrix and save it.

# Numbers of channels
nb_channels = len(channels_id)

# Directory the partial matrices ("matrice<n>.npz") are saved to by the cell that builds them
path = '/dlabdata1/youtube_large/jouven/sparse_matrix_construction/channels_more_10k/sparse_matrices_for_word2vecf/'
# Sorted so that the summation order does not depend on the file system
files = sorted(glob.glob(path + '*.npz'))
if not files:
    # Without this check an all-zero matrix would silently be saved
    raise FileNotFoundError('No partial sparse matrix (*.npz) found in ' + path)

graph = scipy.sparse.csr_matrix((nb_channels, nb_users), dtype=np.uint32)
for file in files:
    graph = graph + scipy.sparse.load_npz(file).astype(np.uint32)

# Save the final sparse matrix
output_dir = '/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_10k/word2vecf'
os.makedirs(output_dir, exist_ok=True)
scipy.sparse.save_npz(output_dir + '/sparse_matrix_for_word2vec.npz', graph)