## NOTEBOOK DESCRIPTION:

This notebook creates the input files needed to run the word2vecf source code.

- training_data: Provide all pairs of channel-users present in the `comments_dataset`. For example, if user $user_i$ commented on $channel_i$ 3 times, the pair $(channel_i, user_i)$ would appear 3 times in the training data,
- word_vocabulary: Text file. For each channel, provide the number of comments posted on that channel,
- context_vocabulary: Text file. For each user, provide the number of comments that user has posted.

In [1]:
import scipy.sparse
import sys
import os

import numpy as np

scriptpath = "../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_10k import *

In [2]:
# Maximum number of users (columns of the channel-user matrix) to keep when filtering.
# A cap is needed because of a requirement from the word2vecf source code.
NB_SELECTED_USERS = 30000000

In [3]:
def filter_users_by_number_of_comments_in_different_channels(
        matrix_path='/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_10k/word2vecf/sparse_matrix_for_word2vec.npz',
        nb_selected_users=None):
    '''
    Load and filter the data
    Select (at most) the nb_selected_users users having the most comments on
    different channels, i.e. the columns with the most stored entries

    PARAMETERS:
        - matrix_path: .npz file holding the channel-user sparse matrix
          (one column per user)
        - nb_selected_users: maximum number of users to keep, NB_SELECTED_USERS
          when left to None

    RETURN:
        - Sparse matrix (CSC) containing only the selected users
        - The indices of the selected users, sorted by increasing number of
          stored entries
    '''
    if nb_selected_users is None:
        nb_selected_users = NB_SELECTED_USERS

    # Load the channel tuple sparse matrix. Convert to CSC so that column
    # indexing works whatever format the file was saved in (COO matrices are
    # not subscriptable).
    S = scipy.sparse.load_npz(matrix_path).tocsc()
    comments_on_different_channels = S.getnnz(axis=0)

    # Clamp the start of the slice at 0: a negative start would be read as an
    # offset from the end and would silently drop users whenever the cap is
    # larger than the number of users.
    first_kept = max(len(comments_on_different_channels) - nb_selected_users, 0)
    selected_users = comments_on_different_channels.argsort()[first_kept:]
    return S[:, selected_users], selected_users

def filter_users_by_number_of_comments(
        matrix_path='/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_10k/word2vecf/sparse_matrix_for_word2vec.npz',
        nb_selected_users=None):
    '''
    Load and filter the data
    Select (at most) the nb_selected_users users having the most comments,
    i.e. the columns with the largest sums

    PARAMETERS:
        - matrix_path: .npz file holding the channel-user sparse matrix
          (one column per user)
        - nb_selected_users: maximum number of users to keep, NB_SELECTED_USERS
          when left to None

    RETURN:
        - Sparse matrix (CSC) containing only the selected users
        - The indices of the selected users, sorted by increasing number of
          comments
    '''
    if nb_selected_users is None:
        nb_selected_users = NB_SELECTED_USERS

    # Load the channel tuple sparse matrix
    S = scipy.sparse.load_npz(matrix_path).tocsc()
    # Sum the columns; np.asarray(...).ravel() gives a flat array whether the
    # sum comes back as a (1, n) matrix or as a 1-D array.
    users_occurences = np.asarray(S.sum(axis=0)).ravel()

    # Keep the tail of the ascending sort (the most active users). The start is
    # clamped at 0 so that a cap larger than the number of users keeps everyone.
    first_kept = max(len(users_occurences) - nb_selected_users, 0)
    selected_users = users_occurences.argsort()[first_kept:]
    return S[:, selected_users], selected_users

def filter_users_by_number_comments_and_number_of_comments_in_different_channels(
        matrix_path='/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_10k/word2vecf/sparse_matrix_for_word2vec.npz',
        nb_selected_users=None,
        threshold=20):
    '''
    Load and filter the data
    Among the users having at least `threshold` stored entries in their column
    (comments on different channels), select (at most) the nb_selected_users
    users having the most comments

    PARAMETERS:
        - matrix_path: .npz file holding the channel-user sparse matrix
          (one column per user)
        - nb_selected_users: maximum number of users to keep, NB_SELECTED_USERS
          when left to None
        - threshold: minimum number of stored entries a user column must have

    RETURN:
        - Sparse matrix (CSC) containing only the selected users
        - The indices of the selected users (numpy array), sorted by increasing
          number of comments
    '''
    if nb_selected_users is None:
        nb_selected_users = NB_SELECTED_USERS

    # Load the channel tuple sparse matrix
    S = scipy.sparse.load_npz(matrix_path).tocsc()

    channels_per_user = S.getnnz(axis=0)
    comments_per_user = np.asarray(S.sum(axis=0)).ravel()  # sum the columns

    # Rank every user by number of comments, then drop the ones below the
    # threshold while preserving that ranking.
    ranking = comments_per_user.argsort()
    eligible_users = ranking[channels_per_user[ranking] >= threshold]

    # Keep the most active eligible users; the start is clamped at 0 so that a
    # cap larger than the number of eligible users keeps all of them.
    first_kept = max(len(eligible_users) - nb_selected_users, 0)
    selected_users = eligible_users[first_kept:]

    # Index with an integer array: a Python set is not a valid sparse index.
    return S[:, selected_users], selected_users
    

In [4]:
def create_word_vocabulary(S, SAVING_PATH):
    '''
    Create a text file named word_vocabulary, the word here corresponds to each channel
    Each line corresponds to a (channel, # occurences) pair and written as
    "channel_index #occurences" in the file
    Lines are written by increasing number of occurences

    PARAMETERS:
        - S: matrix with one row per channel; the occurences of a channel are
          the sum of its row
        - SAVING_PATH: path of the text file to (over)write
    '''
    # Sum the rows; np.asarray(...).ravel() gives a flat array whether the sum
    # comes back as an (n, 1) matrix or as a 1-D array.
    occurences_tab = np.asarray(S.sum(axis=1)).ravel()
    occurences_tab_ind = occurences_tab.argsort()

    # Convert once to plain Python scalars so that formatting is cheap and
    # independent of the numpy dtype.
    channel_indices = occurences_tab_ind.tolist()
    channel_occurences = occurences_tab[occurences_tab_ind].tolist()

    print('Writing into word_vocabulary...')
    # The context manager closes the file even if a write fails; lines are
    # streamed instead of being concatenated into one big string first.
    with open(SAVING_PATH, "w") as f:
        f.writelines(
            str(ind) + " " + str(count) + "\n"
            for ind, count in zip(channel_indices, channel_occurences)
        )
        
    

In [5]:
def create_context_vocabulary(S, SAVING_PATH):
    '''
    Create a text file named context_vocabulary, the context here is the user
    Each line corresponds to a (user, # occurences) pair and written as
    "user_index #occurences" in the file
    Lines are written by increasing number of occurences

    PARAMETERS:
        - S: matrix with one column per user; the occurences of a user are
          the sum of its column
        - SAVING_PATH: path of the text file to (over)write
    '''
    # NOTE(review): the index written for each user is its column position in
    # the S passed in. If S was obtained by column-filtering a larger matrix,
    # these positions are not the original user indices - confirm this matches
    # the user ids used in the training data.

    # Sum the columns; np.asarray(...).ravel() gives a flat array whether the
    # sum comes back as a (1, n) matrix or as a 1-D array.
    occurences_tab = np.asarray(S.sum(axis=0)).ravel()
    occurences_tab_ind = occurences_tab.argsort()

    # Convert once to plain Python scalars so that formatting is cheap and
    # independent of the numpy dtype.
    user_indices = occurences_tab_ind.tolist()
    user_occurences = occurences_tab[occurences_tab_ind].tolist()

    print('Writing into context_vocabulary...')
    # Stream the lines to disk: no manual chunked string buffers are needed and
    # the context manager closes the file even if a write fails.
    with open(SAVING_PATH, "w") as f:
        f.writelines(
            str(ind) + " " + str(count) + "\n"
            for ind, count in zip(user_indices, user_occurences)
        )
    

In [6]:
def create_channels_users_pairs(
        selected_users,
        input_path="/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_10k/training_data",
        output_path="/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_10k/size_200_sub_0043_neg_35_threshold_20/training_data"):
    '''
    From the already existing text file of channel-user pairs, we select only lines containing
    users in selected_users to create a new training_file for word2vecf
    Each input line is expected to be "channel_index user_index"

    PARAMETERS:
        - selected_users: iterable of the user indices to keep
        - input_path: text file containing all the channel-user pairs
        - output_path: text file to (over)write with the selected pairs
    '''
    # Make selected_users a set for constant-time membership tests
    selected_users = set(selected_users)

    # Stream line by line: every matching pair is written immediately, so no
    # pair can be left behind in an unflushed buffer at the end of the file and
    # memory usage stays flat. Both files are closed even if a line is malformed.
    with open(input_path, 'r') as source, open(output_path, "w") as destination:
        for i, line in enumerate(source, start=1):
            # Select the channels-users pairs where the user is in the set of selected users
            # (int() ignores the trailing newline of the user field)
            if int(line.split(' ')[1]) in selected_users:
                destination.write(line)

            if i % 100000000 == 0:
                print(str(100000000) + ' lines have been processed')

In [None]:
# Build the filtered channel-user matrix together with the indices of the users that were kept.
S, selected_users = filter_users_by_number_comments_and_number_of_comments_in_different_channels()

In [10]:
# Write one "user_index #occurences" line per column of the filtered matrix.
create_context_vocabulary(S, "/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_10k/size_200_sub_0043_neg_35_threshold_20/context_vocabulary")


Writing into context_vocabulary...


In [11]:
# Write one "channel_index #occurences" line per row of the filtered matrix.
create_word_vocabulary(S, "/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_10k/size_200_sub_0043_neg_35_threshold_20/word_vocabulary")


Writing into word_vocabulary...


In [12]:
# Drop the reference to the sparse matrix (presumably to free memory) before the
# long pass over the training data, which only needs the selected user indices.
S = []
create_channels_users_pairs(selected_users)

100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been pr