In [1]:
from __future__ import print_function
from tqdm import tqdm
# from tqdm import tqdm_gui

import numpy as np
import pandas as pd
import sys, os, pickle, shutil

import torch
import torch.optim as optim
import torch.nn as nn

import time

# Getting a PyTorch build whose TensorBoard integration imports cleanly can be tricky.
# If SummaryWriter can be imported from torch.utils.tensorboard, this script records
# summaries; otherwise it runs without them.
try:
    from torch.utils.tensorboard import SummaryWriter
    write_summary = True
except Exception:
    # Best-effort: any import-time failure (missing or incompatible tensorboard install)
    # only disables summaries. Unlike a bare `except:`, this does not swallow
    # KeyboardInterrupt / SystemExit.
    write_summary = False

from model import *
from utils_modified import count_parameters, q
from config import *

# Put the project root on the module search path; the `helpers` import that follows
# is presumably resolved from there.
scriptpath = "/home/jouven/youtube_projects/"
helpers_root = os.path.abspath(scriptpath)
sys.path.append(helpers_root)
from helpers.helpers_channels_more_300 import *

### Steps done before the training phase

In [2]:
# Prefer the GPU when CUDA is usable; fall back to the CPU otherwise.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('DEVICE: ', DEVICE)

DEVICE:  cpu


In [3]:
# Use only for test: delete the model directories left behind by a previous run
# so the test starts from a clean slate.
if DATA_SOURCE == 'test':
    for stale_dir in (MODEL_DLAB_DIR, MODEL_PERSO_DIR):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

In [4]:
# Set up the experiment directories: first the one in the DLAB folder, then the one
# in the personal folder (check_directory is a project helper; presumably it creates
# the directory when it is missing).
for experiment_dir in (MODEL_DIR, MODEL_PERSO_DIR):
    check_directory(experiment_dir)

# SUMMARY_DIR is the directory where the tensorboard SummaryWriter files are written.
if write_summary:
    # Drop whatever a previous run left there so the logs start empty.
    if os.path.exists(SUMMARY_DIR):
        shutil.rmtree(SUMMARY_DIR)

    # SummaryWriter creates the directory at SUMMARY_DIR by itself.
    writer = SummaryWriter(SUMMARY_DIR)
    summary_counter = 0

In [5]:
# Retrieve the number of occurrences of each channel (the vocabulary).
# NOTE: pickle.load can execute arbitrary code while unpickling; only point this at
# files produced by this project's own preprocessing.
vocab_occ_path = ("/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_"
                  + THRESHOLD_NAME + "/vocab_occ.pkl")
with open(vocab_occ_path, 'rb') as f:
    vocab_occ = pickle.load(f)
# No explicit f.close() needed: leaving the `with` block already closed the file.

In [6]:
# Report how many entries the loaded vocabulary holds.
vocab_size = len(vocab_occ)
print('len(vocab): ', vocab_size)

len(vocab):  17716


In [7]:
# Build the noise distribution used to sample negative examples among the words
# (channels): unigram frequencies raised to the power 0.75, renormalised to sum to 1.
word_freqs = np.array(vocab_occ)
# Vectorised sum along the first axis (same semantics as the built-in sum over the
# array, without the Python-level loop over every element).
unigram_dist = word_freqs / word_freqs.sum(axis=0)
# Apply the 3/4 power once and reuse it for both the numerator and the normaliser.
smoothed_dist = unigram_dist ** 0.75
noise_dist = torch.from_numpy(smoothed_dist / np.sum(smoothed_dist))

### Training the word2vec model (negative sampling)

In [9]:

# Per-batch training loss, accumulated over all epochs (also stored in each checkpoint).
losses = []

model = Word2Vec_neg_sampling(EMBEDDING_DIM, len(vocab_occ), DEVICE, noise_dist, NEGATIVE_SAMPLES).to(DEVICE)
print('\nWe have {} Million trainable parameters here in the model'.format(count_parameters(model)))

optimizer = optim.Adam(model.parameters(), lr = LR)

# Progress file, overwritten every `progress_interval` batches with the elapsed time
# and the batch counter. It lives in the same channels_more_<THRESHOLD_NAME> directory
# the vocabulary is read from, rather than in a hard-coded channels_more_300 directory.
progress_path = ("/dlabdata1/youtube_large/jouven/word2vec_pytorch/channels_more_"
                 + THRESHOLD_NAME + "/batch.pkl")
progress_interval = 10000

begin = time.time()

for epoch in tqdm(range(NUM_EPOCHS)):
    print('\n===== EPOCH {}/{} ====='.format(epoch + 1, NUM_EPOCHS))

    # Nothing in this loop leaves training mode, so setting it once per epoch is enough.
    model.train()

    batch_idx = 0
    # Stream the gzip-compressed CSV in chunks of BATCH_SIZE rows; each chunk is one batch.
    for chunk in pd.read_csv(TRAINING_DATA_PATH, compression='gzip', chunksize = BATCH_SIZE):
        # Columns '0' and '1' are passed to the model as (x, y) -- presumably the
        # (input, context) channel indices; TODO confirm against the data preparation.
        # Tensors are built straight from the column arrays instead of Python lists.
        x_batch = torch.tensor(chunk['0'].values, dtype = torch.long).to(DEVICE)
        y_batch = torch.tensor(chunk['1'].values, dtype = torch.long).to(DEVICE)

        optimizer.zero_grad()
        # The model's forward pass returns the loss directly.
        loss = model(x_batch, y_batch)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())

        batch_idx += 1
        if batch_idx % progress_interval == 0:
            comp_time = time.time() - begin
            # The `with` block closes the file; no explicit close() is needed.
            with open(progress_path, 'wb') as f:
                pickle.dump([(comp_time, batch_idx)], f)

    # Write embeddings and a checkpoint every SAVE_EVERY_N_EPOCH epochs (epoch 0 included).
    if epoch % SAVE_EVERY_N_EPOCH == 0:
        if write_summary:
            writer.add_embedding(model.embeddings_input.weight.data, metadata=list(range(len(vocab_occ))), global_step=epoch)

        torch.save({'model_state_dict': model.state_dict(),
                    'losses': losses,
                    'embedding': model.embeddings_input.weight.data
                    },
                    '{}/model{}.pth'.format(MODEL_DIR, epoch))


# Final input-embedding matrix, consumed by the export cells that follow.
EMBEDDING = model.embeddings_input.weight.data



  0%|          | 0/1 [00:00<?, ?it/s]


We have 7.0864 Million trainable parameters here in the model

===== EPOCH 1/1 =====


100%|██████████| 1/1 [00:00<00:00,  2.16it/s]


### Convert the final embedding from torch to Pandas DataFrame and save it

In [10]:
# Bring the learned embedding back to host memory as a NumPy array, then wrap it in a
# Pandas DataFrame whose columns are labelled dr0, dr1, ...
graph_matrix = EMBEDDING.detach().cpu().numpy()
df = pd.DataFrame(graph_matrix).add_prefix('dr')

In [11]:
# Sanity check on the exported frame's dimensions (the recorded run shows (17716, 200)).
df.shape

(17716, 200)

In [12]:
# Write the embedding to EMBEDDING_PATH as a gzip-compressed CSV, leaving out the row index.
df.to_csv(EMBEDDING_PATH, index=False, compression='gzip')