## NOTEBOOK DESCRIPTION:

This notebook builds the training data for the word2vec_pytorch implementation. 
It builds the training data by selecting ALL the channels from a user, performing all the combinations between these channels, giving a weight the pairs of channels at end randomly selecting randomly pairs from the combinations pairs according to the weighting scheme.

WARNING: Before running this notebook, `/word2vecf/config.py` needs to be filled with the wanted parameters(THRESHOLD_NAME) corresponding to the minimum number of comments per channel

In [7]:
import scipy.sparse
import sys
import os
import gzip
import random
import time
import itertools
import math

import pandas as pd
import numpy as np

from itertools import permutations, combinations
from collections import Counter

scriptpath = "/home/jouven/youtube_projects/word2vec_pytorch/"
sys.path.append(os.path.abspath(scriptpath))
from config import *

scriptpath = "/home/jouven/youtube_projects/"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

### Create training data from reading the comments dataset

In order to build the training data, the following code reads the orignal `comments_dataset`, as usual we process sequentially each user and puts the results into a Pandas DataFrame. 
The results need to have a specific format: every line of the DataFrame needs to contain pairs of channels corresponding to the (input, output) of a given user.
    
    For each user:
        - Select the channels that this user has commented
        - Perform subsampling if specified
        - Select ALL the channels this user has commented in
        - Perform the 2-combinations out of these channels
        - Gives weight = 1/log(occurences (channel_1) * occurences (channel_2)) to all pairs
        - Randomly select CONTEXT! / (2 * (CONTEXT - 2)!) pairs from all the combinations according to the weighting scheme defined above

In [8]:
COMMON_PATH = "/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_" + THRESHOLD_NAME

In [9]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = video_id_to_channel_id()

In [10]:
# Set of duplicate users
duplicate_users = dict_occurent_users()

In [11]:
# Channels with the selected comments
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [16]:
CONTEXT_SIZE = 100

In [17]:
'''
This function performs ALL the combinations from the channels that a user has commented in and then 
selects random pairs from these pairs according to a weighting scheme.

PARAMETERS:
    - data: List containing pairs of channels corresponding to the users already processed
    - user_channels: The list of channel a given user has commented in
'''

def create_pairs(data, user_channels, samples):
    occ = Counter(user_channels)
    user_channels = list(set(user_channels))
    comb = np.array(list(itertools.combinations(user_channels, 2)))
    
    if len(user_channels) > CONTEXT_SIZE:
        weights_sum = 0
        weights = []
        indices = []
        for ind, sample in enumerate(comb):
            # Weight given to the pairs
            weight = 1/math.log((1+occ[sample[0]])*(1+occ[sample[1]]))
            weights.append(weight)
            indices.append(ind)
            weights_sum += weight
        indices = np.random.choice(
                indices,
                p=np.array(weights)/weights_sum,
                size=samples,
                replace=False)
        comb = list(comb[indices])
    else:
        comb = list(itertools.combinations(user_channels, 2))
    data += comb

In [None]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS

# Dictionnary counting the number of time (channel_idx, channel2_idx) appears
data = []
# Indices
nb = 0
idx = 1
# Channels that a user have commented
user_channels = []
# Number of channels, Row and columns length of the sparse matrix
matrix_len = len(channels_id)
# Number of samples corresponding to the number of combinations for CONTEXT_SIZE channels.
samples = int(math.factorial(CONTEXT_SIZE) / (2*math.factorial(CONTEXT_SIZE-2)))
print(samples)

# Create directory if not existing
check_directory(COMMON_DLAB_PATH)

user = ''
begin_time = time.time()


if SUBSAMPLING:
    print('performing subsampling ...')
    
    with open(os.path.join(COMMON_PATH, "vocab_occ.pkl"),'rb') as f:
        vocab_occ = pickle.load(f)
    f.close()
    total = np.sum(vocab_occ) # Total number of comments
    
    selected_channels = []
    for channel in range(len(vocab_occ)):
        frac = vocab_occ[channel]/total
        prob = 1 - np.sqrt(SAMPLING_RATE/frac)

        sampling = np.random.sample()
        if (sampling >= prob):
            selected_channels.append(channel)
        selected_channels = set(selected_channels)

print('Create training set ...')
# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) >= 9:
        author_id = line_split[0]
        if vid_to_channels.get(line_split[2]) in channels_id:
            corr_channel = dict_channel_ind[vid_to_channels[line_split[2]]]
            if author_id == user:
                # if user is a duplicate user
                if author_id in duplicate_users:
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
            else:
                if SUBSAMPLING:
                    user_channels = list(set(user_channels).intersection(selected_channels))
                else:
                    user_channels = list(set(user_channels))
                # We need at list 2 channels to build a line into the training set.
                if len(user_channels) >= 2:
                    create_pairs(data, user_channels, samples)
                user_channels = []
                
                if len(data) >= 50000000:
                    df = pd.DataFrame(data)
                    if nb == 0:
                        df.to_csv(TRAINING_DATA_PATH, compression='gzip', index = False)
                    else:
                        df.to_csv(TRAINING_DATA_PATH, compression='gzip', mode='a', index = False, header = False)
                    nb += 1
                    data = []
                    df = 0
                    print('idx ' + str(idx))
                    print('nb ' + str(nb))
                    
                # If user is a duplicate user
                if author_id in duplicate_users:
                    duplicate_users[author_id] += 1
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
           
        user = author_id
    idx += 1
    
df = pd.DataFrame(data)
df.to_csv(TRAINING_DATA_PATH, compression='gzip', mode='a', index = False, header = False)
data = 0
df = 0

4950
idx 8518101
nb 1
idx 17018790
nb 2
idx 25551369
nb 3
idx 33939512
nb 4
idx 42439841
nb 5
idx 50818047
nb 6
idx 59216070
nb 7
idx 67604497
nb 8
idx 76020716
nb 9
idx 84372439
nb 10
idx 92785908
nb 11
idx 101211047
nb 12
idx 109625797
nb 13
idx 117934651
nb 14
idx 126327405
nb 15
idx 134680129
nb 16
idx 143084786
nb 17
idx 151757112
nb 18
idx 160203306
nb 19
idx 168789898
nb 20
idx 177183940
nb 21
idx 185802751
nb 22
idx 194229878
nb 23
idx 202771049
nb 24
idx 211116573
nb 25
idx 219537046
nb 26
idx 228038488
nb 27
idx 236546027
nb 28
idx 245133786
nb 29
idx 253656275
nb 30
idx 262089477
nb 31
idx 270540743
nb 32
idx 279078852
nb 33
idx 287523678
nb 34
idx 295979219
nb 35
idx 304510915
nb 36
idx 313051033
nb 37
idx 321485945
nb 38
idx 329900778
nb 39
idx 338395089
nb 40
idx 346949565
nb 41
idx 355417222
nb 42
idx 363824592
nb 43
idx 372241383
nb 44
idx 380661031
nb 45
idx 389115515
nb 46
idx 397544377
nb 47
idx 405850245
nb 48
idx 414438774
nb 49
idx 422784601
nb 50
idx 431181749
nb