## NOTEBOOK DESCRIPTION:

This notebook builds the training data for the word2vec_pytorch implementation. 
It builds the training data by selecting at most CONTEXT channels from a user and then performing all the combinations between the selected channels

WARNING: Before running this notebook, config.py needs to be filled with the wanted parameters.

In [5]:
import scipy.sparse
import sys
import os
import gzip
import random
import time
import itertools

import pandas as pd
import numpy as np

from  config import *
from itertools import permutations, combinations

scriptpath = "/home/jouven/youtube_projects/"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

### Create training data from reader the comments dataset

In order to build the training data, the following code reads the orignal `comments_dataset`, as usual we process sequentially each user and puts the results into a Pandas DataFrame. 
The results need to have a specific format: every line of the DataFrame needs to contain pairs of channels corresponding to the (input, output) of a given user.
    
    For each user:
        - Select the channels that this user has commented
        - Perform subsampling if specified
        - Select at maximum CONTEXT channels from the set of channels this user has commented 
        - Perform the 2-combinations out of the select channels

In [2]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = video_id_to_channel_id()

In [3]:
# Set of duplicate users
duplicate_users = dict_occurent_users()

In [4]:
# Channels with the selected comments
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [7]:
'''
This function selects at maximum CONTEXT channels from the set of channels this user has commented in, performs
the 2-combinations out the selected channels and append the results to the data.

PARAMETERS:
    - data: List containing pairs of channels corresponding to the users already processed
    - user_channels: The list of channel a given user has commented in
'''
def create_pairs(data, user_channels):
    if len(user_channels) > CONTEXT:
        user_channels = random.sample(user_channels, CONTEXT)
    
    for comb in itertools.combinations(user_channels, 2):
        data.append((comb[0], comb[1]))

In [8]:
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# PARAMETERS

# Dictionnary counting the number of time (channel_idx, channel2_idx) appears
data = []
# Indices
nb = 0
idx = 1
# Channels that a user have commented
user_channels = []
# Number of channels, Row and columns length of the sparse matrix
matrix_len = len(channels_id)


user = ''
begin_time = time.time()

dir_1 = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_300'
check_directory(dir_1)

if SUBSAMPLING:
    print('performing subsampling ...')
    
    with open("/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_300/vocab_occ.pkl",'rb') as f:
        vocab_occ = pickle.load(f)
    f.close()
    total = np.sum(vocab_occ) # Total number of comments
    
    selected_channels = []
    for channel in range(len(vocab_occ)):
        frac = vocab_occ[channel]/total
        prob = 1 - np.sqrt(SAMPLING_RATE/frac)

        sampling = np.random.sample()
        if (sampling >= prob):
            selected_channels.append(channel)
        selected_channels = set(selected_channels)

print('Building vocab ...')
# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) >= 9:
        author_id = line_split[0]
        if vid_to_channels.get(line_split[2]) in channels_id:
            corr_channel = dict_channel_ind[vid_to_channels[line_split[2]]]
            if author_id == user:
                # if user is a duplicate user
                if author_id in duplicate_users:
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
            else:
                if SUBSAMPLING:
                    user_channels = list(set(user_channels).intersection(selected_channels))
                else:
                    user_channels = list(set(user_channels))
                # We need at list 2 channels to build a line into the training set.
                if len(user_channels) >= 2:
                    create_pairs(data, user_channels)
                user_channels = []
                
                # For memory purpose add results to the DataFrame
                if len(data) >= 50000000:
                    with open("/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_300/idx_td.pkl",'wb') as f:
                         pickle.dump([idx], f)
                    f.close()
                    df = pd.DataFrame(data)
                    if nb == 0:
                        df.to_csv(TRAINING_DATA_PATH, compression='gzip', index = False)
                    else:
                        df.to_csv(TRAINING_DATA_PATH, compression='gzip', mode='a', index = False, header = False)
                    nb += 1
                    data = []
                    df = 0
                    print('idx ' + str(idx))
                    print('nb ' + str(nb))
                    
                # If user is a duplicate user
                if author_id in duplicate_users:
                    duplicate_users[author_id] += 1
                    if duplicate_users[author_id] <= 1:
                        user_channels.append(corr_channel)
                else:
                    user_channels.append(corr_channel)
           
        user = author_id
    idx += 1
    
df = pd.DataFrame(data)
df.to_csv(TRAINING_DATA_PATH, compression='gzip', mode='a', index = False, header = False)
data = 0
df = 0

idx 15807501
nb 1
idx 31615495
nb 2
idx 47312225
nb 3
idx 63008350
nb 4
idx 78701083
nb 5
idx 94199660
nb 6
idx 109877182
nb 7
idx 125382526
nb 8
idx 140940324
nb 9
idx 156808438
nb 10
idx 172667287
nb 11
idx 188538126
nb 12
idx 204227010
nb 13
idx 219813183
nb 14
idx 235696847
nb 15
idx 251521792
nb 16
idx 267150174
nb 17
idx 282908463
nb 18
idx 298640862
nb 19
idx 314544957
nb 20
idx 330222074
nb 21
idx 346041874
nb 22
idx 361767557
nb 23
idx 377494676
nb 24
idx 393098809
nb 25
idx 408660532
nb 26
idx 424319330
nb 27
idx 439973877
nb 28
idx 455780636
nb 29
idx 471402748
nb 30
idx 487059754
nb 31
idx 502643632
nb 32
idx 518385775
nb 33
idx 534248333
nb 34
idx 549841652
nb 35
idx 565777958
nb 36
idx 581493724
nb 37
idx 597146513
nb 38
idx 612699445
nb 39
idx 628513216
nb 40
idx 643908297
nb 41
idx 659571874
nb 42
idx 675241371
nb 43
idx 691014118
nb 44
idx 706911095
nb 45
idx 722877986
nb 46
idx 738458209
nb 47
idx 754267836
nb 48
idx 769975780
nb 49
idx 785614289
nb 50
idx 801126440
n

idx 6016803896
nb 383
idx 6032513657
nb 384
idx 6048241554
nb 385
idx 6064127467
nb 386
idx 6079825530
nb 387
idx 6095542687
nb 388
idx 6111152281
nb 389
idx 6126825347
nb 390
idx 6142409232
nb 391
idx 6158251769
nb 392
idx 6173809011
nb 393
idx 6189519925
nb 394
idx 6205086130
nb 395
idx 6220667271
nb 396
idx 6236338418
nb 397
idx 6252025223
nb 398
idx 6268013231
nb 399
idx 6283855442
nb 400
idx 6299613334
nb 401
idx 6315327163
nb 402
idx 6330809564
nb 403
idx 6346421877
nb 404
idx 6362204405
nb 405
idx 6377868704
nb 406
idx 6393618442
nb 407
idx 6409577631
nb 408
idx 6425367104
nb 409
idx 6441152430
nb 410
idx 6456863074
nb 411
idx 6472487855
nb 412
idx 6488345505
nb 413
idx 6504153589
nb 414
idx 6519856165
nb 415
idx 6535536965
nb 416
idx 6551356320
nb 417
idx 6567145051
nb 418
idx 6582825254
nb 419
idx 6598433141
nb 420
idx 6614326375
nb 421
idx 6630064853
nb 422
idx 6645706602
nb 423
idx 6661492320
nb 424
idx 6677089940
nb 425
idx 6692708880
nb 426
idx 6708517858
nb 427
idx 672431