In [1]:
import scipy.sparse
import sys
import os

import numpy as np

scriptpath = "../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

In [2]:
# Number of users kept by the top-N user filters defined further down.
NB_SELECTED_USERS = 30_000_000

In [None]:
# Load the (channels, users) sparse matrix from disk; rows are indexed by channel
# and columns by user in the cells that follow.
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_for_word2vec.npz')

In [None]:
# Load the iterable of (ref, sec) channel-index pairs.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_user_walk.pkl", 'rb') as f:
    channels_tuple = pickle.load(f)

In [None]:
# Sanity check: stop at the first (ref, sec) channel pair that has no user in common.
for ref, sec in channels_tuple:
    pair_rows = S[[ref, sec]]
    # A column holds 2 stored entries only when the user appears in both rows.
    # The previous `>= 0` comparison was always true, so the check could never fire.
    # NOTE(review): assumes "entry for the tuple" means a user shared by both channels.
    if not np.any(pair_rows.getnnz(axis=0) >= 2):
        print('No entry for the given tuple')
        break

In [3]:
def filter_users_by_number_of_comments_in_different_channels(matrix_path=None, nb_selected_users=None):
    '''
    Load the data
    Select the users that commented on the largest number of different channels
    PARAMETERS:
        - matrix_path: path of the .npz (channels, users) sparse matrix,
          defaults to the word2vec sparse matrix of the project
        - nb_selected_users: number of users to keep, defaults to NB_SELECTED_USERS
    RETURN:
        - Sparse matrix containing only the selected users
        - The indices of the selected users (ascending number of distinct channels)
    '''
    if matrix_path is None:
        matrix_path = '/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_for_word2vec.npz'
    if nb_selected_users is None:
        nb_selected_users = NB_SELECTED_USERS

    # Load the channel tuple sparse matrix
    S = scipy.sparse.load_npz(matrix_path)
    # Stored entries per column = number of distinct channels per user
    comments_on_different_channels = S.getnnz(axis = 0)
    # Clamp at 0: a negative start would be read as "count from the end" and
    # silently keep too few users when more users are requested than exist.
    start = max(len(comments_on_different_channels) - nb_selected_users, 0)
    selected_users = comments_on_different_channels.argsort()[start:]
    return S[:, selected_users], selected_users

def filter_users_by_number_of_comments(matrix_path=None, nb_selected_users=None):
    '''
    Load the data
    Select the users having the most comments
    PARAMETERS:
        - matrix_path: path of the .npz (channels, users) sparse matrix,
          defaults to the word2vec sparse matrix of the project
        - nb_selected_users: number of users to keep, defaults to NB_SELECTED_USERS
    RETURN:
        - Sparse matrix (CSC) containing only the selected users
        - The indices of the selected users (ascending number of comments)
    '''
    if matrix_path is None:
        matrix_path = '/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_for_word2vec.npz'
    if nb_selected_users is None:
        nb_selected_users = NB_SELECTED_USERS

    # Load the channel tuple sparse matrix; CSC makes column selection cheap
    S = scipy.sparse.load_npz(matrix_path).tocsc()
    # Sum the columns: total number of comments per user
    users_occurences = np.asarray(S.sum(axis = 0)).ravel()
    # Keep the last nb_selected_users entries of the ascending sort. The former
    # slice [len - 2*NB : NB] did not select the top users (and was usually empty).
    start = max(len(users_occurences) - nb_selected_users, 0)
    selected_users = users_occurences.argsort()[start:]
    return S[:, selected_users], selected_users

def filter_users_by_number_comments_and_number_of_comments_in_different_channels(threshold=20, matrix_path=None, max_users=None):
    '''
    Load the data
    Keep the users that commented on at least `threshold` different channels,
    ranked by their total number of comments (most active first)
    PARAMETERS:
        - threshold: minimum number of distinct channels a user must have commented on
        - matrix_path: path of the .npz (channels, users) sparse matrix,
          defaults to the word2vec sparse matrix of the project
        - max_users: optional cap on the number of users returned (None = no cap)
    RETURN:
        - Sparse matrix (CSC) containing only the selected users
        - The indices of the selected users, by decreasing number of comments
    '''
    if matrix_path is None:
        matrix_path = '/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_for_word2vec.npz'

    # Load the channel tuple sparse matrix
    S = scipy.sparse.load_npz(matrix_path)

    # Stored entries per column = number of distinct channels per user
    distinct_channels = S.getnnz(axis = 0)
    # Sum the columns: total number of comments per user
    total_comments = np.asarray(S.sum(axis = 0)).ravel()

    # Rank users by decreasing number of comments, then drop those under the
    # threshold. The previous version built this ranking in a Python loop but
    # never used it, and indexed the matrix with a set, which is not a valid
    # sparse index.
    ranked_users = total_comments.argsort()[::-1]
    selected_users = ranked_users[distinct_channels[ranked_users] >= threshold]
    if max_users is not None:
        selected_users = selected_users[:max_users]

    return S.tocsc()[:, selected_users], selected_users
    

In [4]:
def create_word_vocabulary(S, SAVING_PATH):
    '''
    Create the word_vocabulary text file; a "word" here is a channel (a row of S)
    Each line corresponds to a (channel, # occurrences) pair written as
    "channel_index #occurrences", ordered by increasing number of occurrences
    PARAMETERS:
        - S: sparse (channels, users) matrix
        - SAVING_PATH: path of the text file to (over)write
    '''
    # Sum the rows: total number of occurrences per channel
    occurences_tab = np.asarray(S.sum(axis = 1)).ravel()
    occurences_tab_ind = occurences_tab.argsort()
    print('Writing into word_vocabulary...')
    # Stream the lines straight to the file instead of building one big string;
    # the with-block guarantees the file is closed even if a write fails.
    with open(SAVING_PATH, "w") as f:
        # ind represents the channel index
        f.writelines(str(ind) + " " + str(occurences_tab[ind]) + "\n" for ind in occurences_tab_ind)
        
    

In [5]:
def create_context_vocabulary(S, SAVING_PATH):
    '''
    Create the context_vocabulary text file; a "context" here is a user (a column of S)
    Each line corresponds to a (user, # occurrences) pair written as
    "user_index #occurrences", ordered by increasing number of occurrences
    PARAMETERS:
        - S: sparse (channels, users) matrix
        - SAVING_PATH: path of the text file to (over)write
    '''
    # Sum the columns: total number of occurrences per user
    occurences_tab = np.asarray(S.sum(axis = 0)).ravel()
    occurences_tab_ind = occurences_tab.argsort()
    print('Writing into context_vocabulary...')
    # Stream the lines to the file: no manual string buffering is needed and the
    # with-block guarantees the file is closed even if a write fails.
    with open(SAVING_PATH, "w") as f:
        # ind represents the user index
        f.writelines(str(ind) + " " + str(occurences_tab[ind]) + "\n" for ind in occurences_tab_ind)
    

In [6]:
def create_channels_users_pairs(selected_users, source_path=None, target_path=None):
    '''
    From the already existing text file of channel-user pairs, keep only the lines
    whose user is in selected_users to create a new training file for word2vecf
    PARAMETERS:
        - selected_users: iterable of the selected user indices
        - source_path: existing training file, one "channel user" pair per line,
          defaults to the full training_data file of the project
        - target_path: filtered training file to (over)write,
          defaults to the threshold_20 training_data file of the project
    '''
    if source_path is None:
        source_path = "/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/training_data"
    if target_path is None:
        target_path = "/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_20/training_data"

    # A set gives constant-time membership tests
    selected_users = set(selected_users)

    # The target is opened (and truncated) first, as before. Both files are
    # streamed; the file objects do the buffering, and the with-block closes
    # them even if a line fails to parse.
    with open(target_path, "w") as filtered, open(source_path, 'r') as pairs:
        for i, line in enumerate(pairs, start=1):
            # Second space-separated field is the user index; int() ignores the trailing newline
            if int(line.split(' ')[1]) in selected_users:
                filtered.write(line)

            if i % 100000000 == 0:
                print(str(100000000) + ' lines have been processed')

In [7]:
# Load the (channels, users) sparse matrix that the threshold filtering and
# vocabulary cells below operate on.
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_for_word2vec.npz')


In [8]:
# Keep the users (columns of S) with at least THRESHOLD stored entries,
# i.e. users that commented on at least THRESHOLD different channels.
THRESHOLD = 20
users_occurences = S.getnnz(axis=0)
selected_users = np.flatnonzero(users_occurences >= THRESHOLD)

In [9]:
# Restrict S to the selected user columns: convert to CSC, transpose so users
# become rows, pick the selected rows, then transpose back.
S = S.tocsc().T[selected_users].T

In [None]:
#S, selected_users = filter_users_by_number_comments_and_number_of_comments_in_different_channels()

In [10]:
# Write the context (user) vocabulary computed from the current matrix S.
create_context_vocabulary(S, "/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_20/context_vocabulary")


Writing into context_vocabulary...


In [11]:
# Write the word (channel) vocabulary computed from the current matrix S.
create_word_vocabulary(S, "/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_20/word_vocabulary")


Writing into word_vocabulary...


In [12]:
# Rebind S to an empty list so this name no longer references the sparse matrix
# (presumably to release memory before the long file scan; TODO confirm).
S = []
# Rewrite the training pairs, keeping only the lines whose user is in selected_users.
create_channels_users_pairs(selected_users)

100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been processed
100000000 lines have been pr

In [26]:
# Print the first 20 lines of a word_vocabulary file.
# NOTE(review): this reads the threshold_10 output, while the cells above write
# to the threshold_20 directory; confirm this is the file you mean to inspect.
count = 0
with open("/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_10/word_vocabulary") as fp:
    for count in range(1, 21):
        line = fp.readline()
        print(line)
  

0 0

86418 0

86417 0

86416 0

86415 0

86414 0

86413 0

86412 0

86411 0

86410 0

86409 0

86408 0

86407 0

86406 0

86405 0

86404 0

86403 0

86402 0

86401 0

86400 0



In [13]:
# Scratch value; not referenced by the pipeline cells visible in this notebook.
haha = [1, 2]

In [15]:
# Display the scratch list.
haha

[1, 2]

In [12]:
# Scratch value (same assignment as in an earlier cell).
haha = [1, 2]

In [29]:
# First toy list for the set-intersection experiment.
ha = [4, 2, 3, 1, 5]

In [30]:
# Second toy list for the set-intersection experiment.
hi = [4, 1, 5, 8]

In [31]:
# Values present in both toy lists, as a numpy array.
np.array(list(set(ha) & set(hi)))

array([1, 4, 5])

In [15]:
# Number of selected users.
# NOTE(review): execution counts in this notebook are out of order, so the
# recorded output may come from an earlier definition of selected_users;
# re-run to confirm.
len(selected_users)

30000000

In [16]:
# Persist the selected user indices for later notebooks.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("/dlabdata1/youtube_large/jouven/channels_more_300/selected_users_word2vecf.pkl", 'wb') as f:
    pickle.dump(selected_users, f)

In [19]:
# Display the selected user indices.
# NOTE(review): this renders the whole collection; consider showing only a
# slice, since the full output is very large.
selected_users

[204653196,
 204653197,
 70435470,
 338870929,
 338870931,
 70435476,
 70435481,
 204653211,
 70435484,
 204653214,
 204653223,
 338870952,
 204653225,
 204653226,
 338870953,
 70435500,
 204653228,
 338870954,
 338870956,
 338870960,
 338870961,
 204653237,
 204653243,
 204653246,
 338870975,
 204653248,
 338870976,
 204653254,
 70435531,
 70435533,
 338870993,
 338871000,
 204653273,
 338871004,
 204653279,
 70435553,
 338871011,
 338871012,
 204653286,
 338871015,
 70435560,
 338871016,
 70435565,
 338871022,
 338871023,
 204653298,
 204653301,
 338871033,
 70435578,
 204653306,
 204653307,
 204653310,
 338871046,
 204653319,
 338871048,
 338871050,
 338871051,
 70435596,
 204653325,
 204653329,
 204653335,
 70435610,
 338871066,
 70435612,
 70435613,
 70435614,
 204653343,
 338871074,
 70435620,
 204653352,
 70435625,
 338871082,
 338871084,
 338871091,
 70435636,
 338871093,
 70435639,
 338871097,
 338871098,
 204653373,
 70435646,
 204653374,
 204653376,
 70435649,
 70435650,
 70