## NOTEBOOK DESCRIPTION:

This notebook builds the training data for the word2vec_pytorch implementation. 
It builds the training data by selecting ALL the channels from a user, performing all the combinations between these channels and at end randomly selecting randomly pairs from the combinations pairs

WARNING: Before running this notebook, config.py needs to be filled with the wanted parameters.

In [10]:
import scipy.sparse
import sys
import os
import gzip
import random
import time
import itertools
import math

import pandas as pd
import numpy as np

from  config import *
from itertools import permutations, combinations

scriptpath = "/home/jouven/youtube_projects/"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

### Create training data from reading the comments dataset

In order to build the training data, the following code reads the orignal `comments_dataset`, as usual we process sequentially each user and puts the results into a Pandas DataFrame. 
The results need to have a specific format: every line of the DataFrame needs to contain pairs of channels corresponding to the (input, output) of a given user.
    
    For each user:
        - Select the channels that this user has commented
        - Perform subsampling if specified
        - Select ALL the channels this user has commented in
        - Perform the 2-combinations out of these channels
        - Randomly take CONTEXT! / (2 * (CONTEXT - 2)!) pairs from all the combinations

In [3]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = video_id_to_channel_id()

In [4]:
# Set of duplicate users
duplicate_users = dict_occurent_users()

In [5]:
# Channels with the selected comments
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [18]:
'''
This function performs ALL the combinations from the channels that a user has commented in and then 
selects random pairs from these pairs.

PARAMETERS:
    - data: List containing pairs of channels corresponding to the users already processed
    - user_channels: The list of channel a given user has commented in
'''

def create_pairs(data, user_channels, samples):
    if len(user_channels) > CONTEXT:
        all_comb = random.sample(list(itertools.combinations(user_channels, 2)), samples)
        for comb in all_comb:
            data.append((comb[0], comb[1]))
        all_comb = 0
    else:
        for comb in itertools.combinations(user_channels, 2):
            data.append((comb[0], comb[1]))

In [19]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS

# Dictionnary counting the number of time (channel_idx, channel2_idx) appears
data = []
# Indices
nb = 0
idx = 1
# Channels that a user have commented
user_channels = []
# Number of channels, Row and columns length of the sparse matrix
matrix_len = len(channels_id)
samples = int(math.factorial(CONTEXT) / (2*math.factorial(CONTEXT-2)))
print(samples)

user = ''
begin_time = time.time()

dir_1 = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_300'
check_directory(dir_1)

if SUBSAMPLING:
    print('performing subsampling ...')
    
    with open("/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_300/vocab_occ.pkl",'rb') as f:
        vocab_occ = pickle.load(f)
    f.close()
    total = np.sum(vocab_occ) # Total number of comments
    
    selected_channels = []
    for channel in range(len(vocab_occ)):
        frac = vocab_occ[channel]/total
        prob = 1 - np.sqrt(SAMPLING_RATE/frac)

        sampling = np.random.sample()
        if (sampling >= prob):
            selected_channels.append(channel)
        selected_channels = set(selected_channels)

print('Create training set ...')
# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) >= 9:
        author_id = line_split[0]
        if vid_to_channels.get(line_split[2]) in channels_id:
            corr_channel = dict_channel_ind[vid_to_channels[line_split[2]]]
            if author_id == user:
                # if user is a duplicate user
                if author_id in duplicate_users:
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
            else:
                if SUBSAMPLING:
                    user_channels = list(set(user_channels).intersection(selected_channels))
                else:
                    user_channels = list(set(user_channels))
                # We need at list 2 channels to build a line into the training set.
                if len(user_channels) >= 2:
                    create_pairs(data, user_channels, samples)
                user_channels = []
                
                if len(data) >= 50000000:
                    with open("/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_300/idx_td.pkl",'wb') as f:
                         pickle.dump([idx], f)
                    f.close()
                    df = pd.DataFrame(data)
                    if nb == 0:
                        df.to_csv(TRAINING_DATA_PATH, compression='gzip', index = False)
                    else:
                        df.to_csv(TRAINING_DATA_PATH, compression='gzip', mode='a', index = False, header = False)
                    nb += 1
                    data = []
                    df = 0
                    print('idx ' + str(idx))
                    print('nb ' + str(nb))
                    
                # If user is a duplicate user
                if author_id in duplicate_users:
                    duplicate_users[author_id] += 1
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
           
        user = author_id
    idx += 1
    
df = pd.DataFrame(data)
df.to_csv(TRAINING_DATA_PATH, compression='gzip', mode='a', index = False, header = False)
data = 0
df = 0

4950
idx 8518101
nb 1
idx 17018790
nb 2
idx 25551369
nb 3
idx 33939512
nb 4
idx 42439841
nb 5
idx 50818047
nb 6
idx 59216070
nb 7
idx 67604497
nb 8
idx 76020716
nb 9
idx 84372439
nb 10
idx 92785908
nb 11
idx 101211047
nb 12
idx 109625797
nb 13
idx 117934651
nb 14
idx 126327405
nb 15
idx 134680129
nb 16
idx 143084786
nb 17
idx 151757112
nb 18
idx 160203306
nb 19
idx 168789898
nb 20
idx 177183940
nb 21
idx 185802751
nb 22
idx 194229878
nb 23
idx 202771049
nb 24
idx 211116573
nb 25
idx 219537046
nb 26
idx 228038488
nb 27
idx 236546027
nb 28
idx 245133786
nb 29
idx 253656275
nb 30
idx 262089477
nb 31
idx 270540743
nb 32
idx 279078852
nb 33
idx 287523678
nb 34
idx 295979219
nb 35
idx 304510915
nb 36
idx 313051033
nb 37
idx 321485945
nb 38
idx 329900778
nb 39
idx 338395089
nb 40
idx 346949565
nb 41
idx 355417222
nb 42
idx 363824592
nb 43
idx 372241383
nb 44
idx 380661031
nb 45
idx 389115515
nb 46
idx 397544377
nb 47
idx 405850245
nb 48
idx 414438774
nb 49
idx 422784601
nb 50
idx 431181749
nb

idx 3251963700
nb 384
idx 3260331434
nb 385
idx 3268787721
nb 386
idx 3277316865
nb 387
idx 3285831497
nb 388
idx 3294337409
nb 389
idx 3302708175
nb 390
idx 3311247546
nb 391
idx 3319749399
nb 392
idx 3328120514
nb 393
idx 3336482167
nb 394
idx 3344965877
nb 395
idx 3353308154
nb 396
idx 3361723038
nb 397
idx 3370099125
nb 398
idx 3378536809
nb 399
idx 3387058340
nb 400
idx 3395532775
nb 401
idx 3404076928
nb 402
idx 3412524634
nb 403
idx 3420878302
nb 404
idx 3429451636
nb 405
idx 3437913771
nb 406
idx 3446310535
nb 407
idx 3454715838
nb 408
idx 3463232981
nb 409
idx 3471711931
nb 410
idx 3480253348
nb 411
idx 3488732709
nb 412
idx 3497235135
nb 413
idx 3505705662
nb 414
idx 3514067585
nb 415
idx 3522721467
nb 416
idx 3531034282
nb 417
idx 3539418145
nb 418
idx 3547877760
nb 419
idx 3556446317
nb 420
idx 3564855275
nb 421
idx 3573317064
nb 422
idx 3581912352
nb 423
idx 3590387941
nb 424
idx 3598733154
nb 425
idx 3607201631
nb 426
idx 3615730903
nb 427
idx 3624199179
nb 428
idx 363276

idx 6409158587
nb 757
idx 6417508616
nb 758
idx 6426060908
nb 759
idx 6434530836
nb 760
idx 6443048864
nb 761
idx 6451457909
nb 762
idx 6460065923
nb 763
idx 6468471943
nb 764
idx 6476979895
nb 765
idx 6485461143
nb 766
idx 6493963219
nb 767
idx 6502495869
nb 768
idx 6510895651
nb 769
idx 6519379175
nb 770
idx 6527699004
nb 771
idx 6536263234
nb 772
idx 6544754250
nb 773
idx 6553308073
nb 774
idx 6561767248
nb 775
idx 6570402518
nb 776
idx 6578846014
nb 777
idx 6587329836
nb 778
idx 6595685947
nb 779
idx 6604180740
nb 780
idx 6612768919
nb 781
idx 6621190852
nb 782
idx 6629821489
nb 783
idx 6638260983
nb 784
idx 6646625941
nb 785
idx 6655207698
nb 786
idx 6663675661
nb 787
idx 6671991518
nb 788
idx 6680431531
nb 789
idx 6688825414
nb 790
idx 6697248620
nb 791
idx 6705837467
nb 792
idx 6714406487
nb 793
idx 6722906349
nb 794
idx 6731379961
nb 795
idx 6739847339
nb 796
idx 6748218445
nb 797
idx 6756677600
nb 798
idx 6765155135
nb 799
idx 6773572660
nb 800
idx 6782057640
nb 801
idx 679038

idx 9517609643
nb 1124
idx 9526138688
nb 1125
idx 9534673416
nb 1126
idx 9543141581
nb 1127
idx 9551642725
nb 1128
idx 9560008150
nb 1129
idx 9568598195
nb 1130
idx 9577034912
nb 1131
idx 9585657982
nb 1132
idx 9594122432
nb 1133
idx 9602553943
nb 1134
idx 9611052747
nb 1135
idx 9619390544
nb 1136
idx 9627822386
nb 1137
idx 9636366633
nb 1138
idx 9644826985
nb 1139
idx 9653270070
nb 1140
idx 9661752458
nb 1141
idx 9670008478
nb 1142
idx 9678514554
nb 1143
idx 9686905054
nb 1144
idx 9695398701
nb 1145
idx 9703866144
nb 1146
idx 9712310100
nb 1147
idx 9720641996
nb 1148
idx 9729185666
nb 1149
idx 9737556894
nb 1150
idx 9745957421
nb 1151
idx 9754484187
nb 1152
idx 9762899082
nb 1153
idx 9771338085
nb 1154
idx 9779734341
nb 1155
idx 9788132453
nb 1156
idx 9796664579
nb 1157
idx 9805206812
nb 1158
idx 9813611810
nb 1159
idx 9822169543
nb 1160
idx 9830549706
nb 1161
idx 9839058609
nb 1162
idx 9847568148
nb 1163
idx 9856065649
nb 1164
idx 9864656990
nb 1165
idx 9872987388
nb 1166
idx 9881503