## NOTEBOOK DESCRIPTION:

This notebook creates new data/files according to the THRESHOLD value, which is the minimum number of comments per channel.
This THRESHOLD value can be modified in `/helpers/config_threshold_value.py`.
The steps performed and files created are:
- Filter the video-channel relationship dataframe with the selected channels
- The set of users that appeared multiple times in the `comments_dataset` file
- Compute the channel-user sparse matrix for the given THRESHOLD
- Compute the pairs of channels for the user and random jumper

In [1]:
import time
import pickle
import os
import sys
import scipy.sparse

import numpy as np
import pandas as pd

from config_threshold_value import *

In [2]:
def filtered_channels_index_id_mapping(channels_path=None):
    '''
    Create a mapping between the selected channels id and their corresponding index.

    The channel ids are unpickled, sorted, and numbered 0..n-1 in sorted order,
    so the index of a channel is its rank among the selected channel ids.

    PARAMETERS:
        - channels_path: path of the pickle file holding the selected channels id.
          When None (default), "channels_more_<THRESHOLD_NAME>.pkl" inside
          COMMON_PATH is used.

    RETURN:
        - dict_channel_ind: dictionary mapping the channels id to the channels index
        - dict_ind_channel: dictionary mapping the channels index to the channels id
        - channels_id: set of selected channels id
    '''
    if channels_path is None:
        channels_path = os.path.join(COMMON_PATH, "channels_more_" + THRESHOLD_NAME + ".pkl")

    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The `with` block closes the file, no explicit close() is needed.
    with open(channels_path, 'rb') as f:
        channels_id = sorted(pickle.load(f))

    # The index of a channel is the row it occupies in the sparse matrix.
    dict_ind_channel = dict(enumerate(channels_id))
    dict_channel_ind = {channel_id: ind for ind, channel_id in dict_ind_channel.items()}
    return dict_channel_ind, dict_ind_channel, set(channels_id)

### Video channel mapping filtered

From the original dictionary holding all the video-channel relationships, we only keep the entries corresponding to the selected channels.

In [3]:
# Selected channels and id-index mapping:
#   dict_channel_ind: channel id -> index, dict_ind_channel: index -> channel id,
#   channels_id: set of the selected channel ids
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [4]:
# Load the video id -> channel id mapping stored under the channels_more_300 folder.
# It is used as a dict-like object later in the notebook (.items() / .get()).
# NOTE(review): the path is hard-coded and does not depend on COMMON_PATH/THRESHOLD — confirm this is intended.
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/jouven/channels_more_300/video_to_channel_mapping_filtered.pkl")

In [5]:
# Number of video -> channel entries before filtering on the selected channels
print('Original length of the relationship dataframe ', len(vid_to_channels))

Original length of the relationship dataframe  70982015


In [6]:
# Keep only the videos whose channel belongs to the selected channels.
# `channels_id` contains exactly the keys of `dict_channel_ind` (both are built
# from the same list by filtered_channels_index_id_mapping), so a single set
# membership test is sufficient.
vid_to_channels_filtered = {
    vid: channel
    for vid, channel in vid_to_channels.items()
    if channel in channels_id
}

In [7]:
# Number of video -> channel entries left after filtering on the selected channels
print('Filtered length of the relationship dataframe ', len(vid_to_channels_filtered))

Filtered length of the relationship dataframe  25108956


In [8]:
# Store the filtered video id -> channel id mapping.
# The `with` block closes the file, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "video_to_channel_mapping_filtered.pkl"), 'wb') as f:
    pickle.dump(vid_to_channels_filtered, f)

In [9]:
# Release memory: rebinding `vid_to_channels` drops the reference to the
# unfiltered mapping. The temporary name is deleted (instead of being set to 0)
# so that accidentally re-running this cell, or the cell storing the mapping,
# raises a NameError rather than silently replacing the mapping with 0.
vid_to_channels = vid_to_channels_filtered
del vid_to_channels_filtered

### Compute duplicate users and compute the occurrences of each channel

Some users appear to have duplicate rows and the goal is to find these duplicate rows to delete them when reading the `comments_dataset` file.

In order to find these duplicate users, we first read the whole `comments_dataset` file and retrieve each user for each block of comments. Since this file is ordered by user, each time we encounter a different user we put it in the created dataframe.

In [None]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
# NOTE(review): `Zreader` is not imported explicitly in this notebook; presumably
# it is provided by the star import of config_threshold_value -- confirm.
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=160384)

USERS_CSV_PATH = os.path.join(COMMON_PATH, "users.csv.gz")
IDX_CHECKPOINT_PATH = "/dlabdata1/youtube_large/jouven/idx.pkl"
# Number of buffered users after which the buffer is written to disk
USERS_FLUSH_SIZE = 50000000


def _flush_users(users, idx, first_chunk):
    """Checkpoint the line counter and write the buffered users to the gzip CSV.

    PARAMETERS:
        - users: list of user ids to write
        - idx: current value of the line counter, pickled as a checkpoint
        - first_chunk: True to create the CSV (with header), False to append to it
    """
    print(str(idx) + ' line have been processed')
    with open(IDX_CHECKPOINT_PATH, 'wb') as f:
        pickle.dump([idx], f)
    df = pd.DataFrame({'users': users})
    if first_chunk:
        df.to_csv(USERS_CSV_PATH, compression='gzip', index=False)
    else:
        df.to_csv(USERS_CSV_PATH, compression='gzip', mode='a', index=False, header=False)


# parameters
idx = 1                    # 1-based index of the line being read (line count + 1 after the loop)
comments_per_channel = {}  # not used in this cell
begin_time = time.time()   # not used in this cell
user = ''                  # author of the block of comments currently being read
user_recorded = False      # whether `user` was already appended for the current block
# Users having commented
users = []
nb = 0                     # number of chunks already written to the CSV

# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    # Malformed (too short) lines are counted but otherwise ignored
    if len(line_split) >= 9:
        author_id = line_split[0]
        if author_id != user:
            # A new block of comments starts
            user = author_id
            user_recorded = False
        # Record the user once per block, on the first comment posted on a
        # selected channel. Comparing only with the author of the previous line
        # would miss users whose block starts with a comment on another channel.
        if not user_recorded and vid_to_channels.get(line_split[2]) in channels_id:
            users.append(author_id)
            user_recorded = True

    if len(users) >= USERS_FLUSH_SIZE:
        _flush_users(users, idx, first_chunk=(nb == 0))
        nb += 1
        users = []

    idx += 1

# Write the users still in the buffer
_flush_users(users, idx, first_chunk=(nb == 0))
nb += 1
users = []

In [None]:
# Load the users collected from the comments file.
# `compression` is an argument of read_csv, not of os.path.join.
users = pd.read_csv(os.path.join(COMMON_PATH, "users.csv.gz"), compression='gzip')

In [None]:
# Number of distinct rows of `users`, i.e. the number of distinct user ids collected
nb_users = len(users.drop_duplicates())

In [None]:
# Report the number of distinct users
print('Number of users ', nb_users)

In [None]:
# Store the number of distinct users (as a one-element list).
# The `with` block closes the file, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "nb_users.pkl"), 'wb') as f:
    pickle.dump([nb_users], f)

In [None]:
# Users appearing in more than one row: `duplicated()` flags every repetition
# after the first occurrence, `drop_duplicates()` then keeps each such user once.
duplicate_users = users[users.duplicated()].drop_duplicates()

In [None]:
# `users` is a DataFrame, so `users - nb_users` is not a count; the number of
# users having duplicate rows is the number of rows of `duplicate_users`.
print('Number of users having duplicate rows =  ', len(duplicate_users))

In [None]:
# Store the ids of the users having duplicate rows as a plain list.
# The `with` block closes the file, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "duplicate_users.pkl"), 'wb') as f:
    pickle.dump(list(duplicate_users['users']), f)

In [None]:
# Release memory: rebinding the name drops the reference to the users DataFrame
users = 0

## Compute the channel-user sparse matrix for the given THRESHOLD 
#### ps: The THRESHOLD value has to be greater than 300

In [4]:
def channels_index_id_mapping_300(channels_path="/dlabdata1/youtube_large/jouven/channels_more_300/channels_more_300.pkl"):
    '''
    Create a mapping between the english channels id having more than 300 comments
    and their corresponding index.

    The channel ids are unpickled, sorted, and numbered 0..n-1 in sorted order.

    PARAMETERS:
        - channels_path: path of the pickle file holding the channels id
          (defaults to the channels_more_300 baseline file)

    RETURN:
        - dict_channel_ind_300: dictionary mapping the channels id to the channels index
        - dict_ind_channel_300: dictionary mapping the channels index to the channels id
        - channels_id_300: set of selected channels id
    '''
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The `with` block closes the file, no explicit close() is needed.
    with open(channels_path, 'rb') as f:
        channels_id = sorted(pickle.load(f))

    # The index of a channel is the row it occupies in the sparse matrix.
    dict_ind_channel = dict(enumerate(channels_id))
    dict_channel_ind = {channel_id: ind for ind, channel_id in dict_ind_channel.items()}
    return dict_channel_ind, dict_ind_channel, set(channels_id)

In [5]:
# Get the mapping between index/id for our baseline -> when we have more than 300 comments per channel
# (dict_channel_ind_300 gives the row of each channel in the baseline sparse matrix indexed below)
dict_channel_ind_300, dict_ind_channel_300, channels_id_300 = channels_index_id_mapping_300()

In [6]:
# Load the baseline (channels_more_300) channel-user sparse matrix.
# NOTE(review): per the file name it presumably holds only users who commented on >= 2 channels — confirm.
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_word2vec_users_commented_geq_2_channels.npz')


In [7]:
# Rows of the baseline sparse matrix that belong to the newly selected channels,
# in increasing row order (a KeyError means a channel is missing from the baseline)
selected_channels = sorted(dict_channel_ind_300[channel] for channel in channels_id)

In [8]:
# Select the channels from the sparse matrix (row selection).
# S is overwritten in place, so this cell must be run exactly once after loading S:
# running it again would index the already reduced matrix.
S = S[selected_channels]

In [9]:
# Select the users (columns) having stored entries for at least 2 of the selected
# channels: getnnz(axis=0) counts the stored entries of each column.
remaining_users = np.where(np.array(S.getnnz(axis = 0)) >= 2)[0]

In [10]:
# Keep only the columns (users) listed in `remaining_users`.
# Converting to CSC first makes the column selection efficient; the result stays in CSC format.
S = S.tocsc()[:, remaining_users]

In [11]:
# Store the reduced channel-user matrix, converted to CSR, in the COMMON_PATH folder
scipy.sparse.save_npz(os.path.join(COMMON_PATH, "sparse_matrix_word2vec_users_commented_geq_2_channels.npz"), S.tocsr())

In [12]:
# Release memory: rebinding the name drops the reference to the sparse matrix
S = 0

## Compute the pairs of channels for the user and random jumper

In [13]:
# Sample size passed to np.random.choice in the pair-sampling cells below
NB_SAMPLE = 3000

# Load the channel-user sparse matrix (the file written with save_npz above: rows = channels, columns = users)
S = scipy.sparse.load_npz(os.path.join(COMMON_PATH, "sparse_matrix_word2vec_users_commented_geq_2_channels.npz"))


In [16]:
# Display the output folder currently configured (sanity check)
COMMON_PATH

'/dlabdata1/youtube_large/jouven/channels_more_haha'

In [21]:
vocab_path = "/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_" + THRESHOLD_NAME
# exist_ok=True replaces the check-then-create pattern and is safe to re-run
os.makedirs(vocab_path, exist_ok=True)

# Create and store the channels occurrences: the row-wise sum of S gives one
# value per channel (row). The sum is a (n_channels, 1) matrix, flattened here
# into a 1-D array.
vocab_occ = np.array(S.sum(axis = 1).flatten().tolist()[0])
# The `with` block closes the file, so no explicit close() is needed.
with open(os.path.join(vocab_path, "vocab_occ.pkl"), 'wb') as f:
    pickle.dump(vocab_occ, f)

Compute the 3000 pairs of channels with the new random jumper method.
For every pair, we take 2 users at random and then for each user we select one channel at random

In [23]:
# Each pair needs 2 users, so 2 * NB_SAMPLE distinct users are sampled to
# obtain NB_SAMPLE pairs (sampling NB_SAMPLE users only yields NB_SAMPLE / 2 pairs).
selected_users = np.random.choice(np.arange(S.shape[1]), 2 * NB_SAMPLE, replace=False)
# CSC format makes the per-column (per-user) slicing below efficient
S2 = S[:, selected_users].tocsc()

# Create and store channels tuples: users are consumed two by two, and one
# channel is drawn at random among the channels of each user.
channels_tuple = []
for first_col in range(0, S2.shape[1], 2):
    pair = []
    for user_col in (first_col, first_col + 1):
        user_channels = S2[:, user_col].nonzero()[0]
        # Draw a scalar index (not a 1-element array) so the tuples have the
        # same shape as those of the other channels_tuple_* files.
        pair.append(np.random.choice(user_channels))
    channels_tuple.append(tuple(pair))

# The `with` block closes the file, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "channels_tuple_random_walk_new.pkl"), 'wb') as f:
    pickle.dump(channels_tuple, f)

# release memory
selected_users = 0
channels_tuple = 0

# release memory
selected_users = 0
channels_tuple = 0

Construction of the random pairs of channels to compute the random jumper of a model.
From the set of channels that we have, we select a pair of channels by selecting at random 2 channels.

In [25]:
# Scratch preview of the sampling call.
# NOTE(review): S.shape[1] is the column axis of S; the rows of S are the channels
# (S was built by row-selecting channels), so this draws column (user) indices — confirm the intended axis.
np.random.choice(np.arange(S.shape[1]), size = (NB_SAMPLE, 2), replace=False)

array([[ 66079680, 131009717],
       [ 60453154,  29494171],
       [ 56578769,  59018842],
       ...,
       [178349367,  21225481],
       [ 61119565,   7111425],
       [ 67415177, 163507761]])

In [27]:
# Channels are the rows of S (users are the columns), so channel indices must be
# drawn from range(S.shape[0]); S.shape[1] would produce user indices.
# replace=False over the whole (NB_SAMPLE, 2) draw means no channel is used twice,
# which requires at least 2 * NB_SAMPLE channels.
selected_channels = np.random.choice(np.arange(S.shape[0]), size=(NB_SAMPLE, 2), replace=False)
channels_tuple = [(val[0], val[1]) for val in selected_channels]
# The `with` block closes the file, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, 'channels_tuple_random_walk.pkl'), 'wb') as f:
    pickle.dump(channels_tuple, f)
# Release memory
selected_channels = 0
channels_tuple = 0

Construction of the random pairs to compute the user jumper of a model.
We select a pair of channels by selecting a user at random and then selecting 2 channels at random from this user.

In [34]:
# Sample NB_SAMPLE distinct users. The sample is stored in its own variable so
# that S is left untouched and the cell can be re-run; without this sampling the
# loop below would go through every user of S.
selected_users = np.random.choice(np.arange(S.shape[1]), NB_SAMPLE, replace=False)
# CSC format makes the per-column (per-user) slicing below efficient
S_users = S[:, selected_users].tocsc()

# Create and store channels tuples: for each sampled user, draw 2 distinct
# channels among the channels of this user.
# NOTE(review): assumes every user of S has at least 2 non-zero channels,
# otherwise np.random.choice raises a ValueError — confirm.
channels_tuple = []
for i in range(S_users.shape[1]):
    user_channels = S_users[:, i].nonzero()[0]
    selected_channels = np.random.choice(user_channels, 2, replace=False)
    channels_tuple.append((selected_channels[0], selected_channels[1]))

# The `with` block closes the file, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "channels_tuple_user_walk.pkl"), 'wb') as f:
    pickle.dump(channels_tuple, f)

