## NOTEBOOK DESCRIPTION:

This notebook checks the behaviour of the embedding space: 
- It first checks manually the nearest neighbors of some selected channels
- It also computes the user jumper score, which is a first barrier in the evaluation of the embedding

In [1]:
import pickle
import os
import sys
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

# Make the project root importable so that the `helpers` package resolves.
# The path is appended only once (and only if missing), so re-running this
# cell does not keep growing `sys.path`.
scriptpath = "/home/jouven/youtube_projects"
if os.path.abspath(scriptpath) not in sys.path:
    sys.path.append(os.path.abspath(scriptpath))

# NOTE(review): wildcard imports hide where names such as
# `get_k_nearest_neighbors` come from; prefer explicit imports once the
# helper modules' public names are confirmed.
from helpers.helpers_channels_more_300 import *
from helpers.helpers_channel_embedding import *
from helpers.config_threshold_value import *

### Useful functions

In [2]:
# Selected channels and id-index mapping.
# As used further down in this notebook:
#   - dict_channel_ind is indexed by a channel id and yields its embedding index,
#   - dict_ind_channel is indexed by an embedding index and yields a channel id,
#   - channels_id is the collection of selected channel ids (used for membership tests).
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [3]:
# Location of the embedding file (gzip-compressed CSV) that every cell below evaluates.
# NOTE(review): absolute, machine-specific path — consider moving it to a config cell.
PATH = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/combination_then_channel_sampling/CONTEXT_True_100_SUBSAMPLING_False_0.0043_LR_0.005/models/embedding.csv.gz'

In [4]:
# Load the channel metadata table (tab-separated, gzip-compressed).
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/df_channels_en.tsv.gz", sep='\t')
# Keep only the rows whose channel id is one of the selected channels.
# `isin` performs a vectorised membership test instead of evaluating a Python
# `in` for every row, which is much cheaper when `channels_id` is a list.
channelcrawler = channelcrawler[channelcrawler['channel'].isin(channels_id)]

In [5]:
# Preview the first rows of the filtered channel metadata.
channelcrawler.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [6]:
# List the distinct channel categories; the output below also contains NaN,
# i.e. some channels have no category.
channelcrawler['category_cc'].unique()

array(['Gaming', 'Education', 'Entertainment', 'Howto & Style', 'Sports',
       'Music', 'Film and Animation', 'Comedy', 'Nonprofits & Activism',
       'People & Blogs', 'News & Politics', 'Science & Technology',
       'Pets & Animals', 'Autos & Vehicles', 'Travel & Events', nan],
      dtype=object)

### Manually check the nearest neighbors of some selected channels

First, to check how good the embedding space is, we choose a channel and its k closest channels in the embedding space. By looking at these channels on the YouTube website, we get a sense of how good/bad the embedding is.

In [7]:
# Nearest neighbors of `Shaaanxo`, a girl fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata to each neighbor id (inner join on the channel id).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,Shaaanxo,3215340,1422,3566.0,2.278
1,UCCvoAe__WFYMNAEN-C-CtYA,Howto & Style,2008-08-05,Wayne Goss,3670000,1357,2740.0,2.218
2,UCzTKskwIc_-a0cGvCXA848Q,Howto & Style,2008-06-23,NikkieTutorials,12269235,742,312.0,2.087
3,UC8v4vz_n2rys6Yxpj8LuOBA,Howto & Style,2013-01-26,KathleenLights,4140000,979,2308.0,2.168
4,UCKMugoa0uHpjUuq14yOpagw,Howto & Style,2009-06-09,Laura Lee,4400000,638,2083.0,2.12
5,UCxj0QizmFhx7kVKSArHBCTA,Howto & Style,2011-03-09,Casey Holmes,1770000,730,8208.0,2.537
6,UCjHyFOH80C4x_2n_Korx4QQ,People & Blogs,2010-11-03,ThatsHeart,2200000,432,6234.0,2.468
7,UCCgDVqiPU10shxzmwkMwJ6A,Howto & Style,2009-09-22,PiinkSparkles,1170000,1557,13926.0,2.8105
8,UCc6W7efUSkd9YYoxOnctlFg,Entertainment,2009-06-08,Bethany Mota,10200000,483,545.0,2.087
9,UCzTtkLZFy6wqD4SsA9_fTtw,Howto & Style,2009-02-16,Krazyrayray,3570000,374,2954.0,2.222


In [8]:
# Nearest neighbors of `Alex Costa`, a men fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCZyCposXwcyopaACep44maQ'],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata to each neighbor id (inner join on the channel id).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCZyCposXwcyopaACep44maQ,Howto & Style,2011-09-29,Alex Costa,2120000,543,5370.0,2.417
1,UC1KbedtKa3d5dleFR6OjQMg,Howto & Style,2008-07-06,alpha m.,5360000,1116,1376.0,2.087
2,UChNN7VBxPTiNrqjUaQd9bxA,Howto & Style,2012-09-14,Teachingmensfashion,4210000,1160,2093.0,2.1225
3,UCqwUrj10mAEsqezcItqvwEw,Entertainment,2015-06-20,BB Ki Vines,14962577,148,205.0,2.087
4,UCmRfQHc3U4fV1-i8Ry1HmtA,Howto & Style,2010-12-06,Real Men Real Style,2400000,1315,4867.0,2.3575
5,UCbq8_4_mFAx_rzDF5VT7MJw,Howto & Style,2013-11-11,BluMaan,1530000,381,9009.0,2.552
6,UCFJZQtrh5Ksncayy2FaoNbQ,Education,2014-09-24,Vishuddha Das *Spiri...,576000,382,31243.0,3.1245
7,UCSXFaThRuBGbqOaZsTxZnTQ,People & Blogs,2015-07-10,Mensutra,1340000,123,11589.0,2.6665
8,UC7eHZXheF8nVOfwB2PEslMw,Comedy,2009-07-06,ashish chanchlani vi...,14200000,130,218.0,2.087
9,UCsTcErHg8oDvUnTzoqsYeNw,Science & Technology,2010-12-21,Unbox Therapy,15300000,1672,206.0,2.087


In [9]:
# Nearest neighbors of `Magpiepony`, a kids film channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata to each neighbor id (inner join on the channel id).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,Magpiepony,736000,304,21120.0,2.9225
1,UC1AEadAUKi6Zt-G3PatrU-Q,Film and Animation,2012-08-19,Letupita725HD★,2698245,2115,4054.0,2.335
2,UCPflSzRRyzK2hhbVxNAnrhw,Film and Animation,2012-03-31,Pinkie Pie,1229394,451,11840.0,2.6965
3,UCHKDtSVVEc686EW7jRSCpZA,Entertainment,2007-04-01,Scribbler Production...,327000,1767,54604.0,3.511
4,UC_5niPa-d35gg88HaS7RrIw,Entertainment,2012-02-11,Disney,3970000,791,2206.0,2.1435
5,UCelMeixAOTs2OQAAi9wU8-g,Entertainment,2013-11-03,CookieSwirlC,12200000,3182,327.0,2.087
6,UCaC1b7X2r8OqxlGhXdDwB_w,Film and Animation,2007-07-08,PhantomSavage,326000,1018,52984.0,3.543
7,UCD_VOth7RmckN6DbmFJa__A,Entertainment,2013-08-31,The Brony Notion,256132,158,70689.0,3.742
8,UChU79FLnClqlRSTSZe3EjNA,Film and Animation,2014-03-21,Flutter525,858000,1297,20180.0,2.8735
9,UCZBY6V8Lxmwu8gGRBOyO11w,Gaming,2014-06-16,Kubz Scouts,3160000,1440,3303.0,2.2885


In [10]:
# Nearest neighbors of `Beyond the trailer`, a film analysis channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCAXR2zenCwvRIyQd9ydtfaA'],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata to each neighbor id (inner join on the channel id).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCAXR2zenCwvRIyQd9ydtfaA,Entertainment,2008-11-03,Beyond The Trailer,853000,5064,19744.0,2.8635
1,UCs1APU4_lj3PgZz2B4HeadA,Entertainment,2008-01-10,Blind Wave,442000,4580,37540.0,3.2365
2,UCbu2SsF-Or3Rsn3NxqODImw,Gaming,2006-04-09,GameSpot,4090000,33422,2295.0,2.168
3,UCq0OueAsdxH6b8nyAspwViw,Entertainment,2006-06-19,Universal Pictures,2330000,830,4478.0,2.3605
4,UC8-Th83bH_thdKZDJCrn88g,Comedy,2006-01-08,The Tonight Show Sta...,22100000,5450,96.0,2.087
5,UClFSU9_bUb4Rc6OYfTt5SPw,News & Politics,2006-09-16,Philip DeFranco,6400000,2627,1170.0,2.087
6,UCgRQHK8Ttr1j9xCEpCAlgbQ,Entertainment,2006-03-04,Variety,448000,2961,30776.0,3.049
7,UCkkcUZJ-vrt3HomsacbTNLg,Entertainment,2010-02-20,ForneverWorld,533000,4441,33294.0,3.2015
8,UCxo8ooAqXiObjuaIy10ud0A,Science & Technology,2013-10-27,Beyond Science,2100000,469,6564.0,2.493
9,UCGIY_O-8vW4rfX98KlMkvRg,Gaming,2005-12-27,Nintendo,6050000,3791,1180.0,2.087


In [11]:
# Nearest neighbors of `CrossFit Invictus` (the query channel listed in row 0 of the output).
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCaCTtPkowVggMiUYhQmlcMw'],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata to each neighbor id (inner join on the channel id).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCaCTtPkowVggMiUYhQmlcMw,Sports,2015-12-30,CrossFit Invictus,27200,196,440262.0,7.116
1,UC6wB_e6YQncYgpv_QrMGHCQ,Sports,2012-06-26,Revive Stronger,21850,777,507999.0,7.1675
2,UCg4o4z4Xrk9SG4wX3Kpmydw,Sports,2011-04-21,Noah Ohlsen,50200,201,258307.0,6.4685
3,UCrnXgvpJ56xtqVvsKh8RYYw,People & Blogs,2009-03-07,BigJsExtremeFitness,101000,1174,166057.0,5.4045
4,UC_kTqkxIUpycqnCGTcbwxow,People & Blogs,2011-10-29,Kara Corey Fit Life,127000,1187,130233.0,4.092
5,UCy18P-QY4lIRAJpvZdsfMeQ,Sports,2014-10-16,Misfit Athletics,18402,404,608916.0,8.5115
6,UCcoMAyqHdAfLPD-1hAQ2r6A,People & Blogs,2012-10-09,Silent Mike,160000,715,99506.0,3.7985
7,UCLqH-U2TXzj1h7lyYQZLNQQ,Sports,2006-11-04,Greg Doucette,82300,458,178614.0,5.1825
8,UCTiLQnvtKMfle0lQaNL817g,Sports,2010-06-12,Ben Pollack,50000,199,279108.0,6.6775
9,UC4514FwdRy5gI6CdC9GPb0w,Sports,2011-03-05,Chris Bumstead,214000,135,66420.0,4.0085


### User jumper implementation

In [7]:
# We randomly choose 3 000 users and for each user we select a random pair of channels
# channels_tuple = [(channel_1, channel_2), (...)]
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "channels_tuple_user_walk.pkl"), 'rb') as f:
    channels_tuple = pickle.load(f)

In [8]:
# Evaluate the embedding stored at PATH on the sampled channel pairs.
# The helper takes a list of embedding files; the printed results below are
# one-element lists, presumably one entry per file — TODO confirm in helpers.
files = [PATH]
users_walk_tab, ranking_position_tab = get_user_walk_and_position_ratio(files, channels_tuple)

file  /dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/combination_then_channel_sampling/CONTEXT_True_100_SUBSAMPLING_False_0.0043_LR_0.005/models/embedding.csv.gz
n_comp  200


In [9]:
# Report the two metrics computed by get_user_walk_and_position_ratio.
print(f'User walk = {users_walk_tab}')
print(f'Ranking position ratio = {ranking_position_tab}')

User walk = [0.5441455219810676]
Ranking position ratio = [0.0847494043945192]


In [9]:
# Results corresponding to a low number of subscribers
# The file containing the channels is located at "/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_user_walk_low_subscribers.pkl"
# Only used for the report !
# print('User walk = ' + str(users_walk_tab))
# print('Ranking position ratio = ' + str(ranking_position_tab))

User walk = [0.17131994879327683]
User walk (new way) = [8115.975221785797]
Ranking position ratio = [0.2754236930967362]


### Plot the embedding

A good way to understand the general behavior of the embedding is to project the model in 2D.

In [None]:
# Build the lookup tables used for plotting: ix_to_word is indexed by an
# embedding index and gives the label drawn on the plot; word_to_ix is indexed
# by that label (presumably the inverse mapping — verify in the helpers module).
ix_to_word, word_to_ix = channel_to_name(channelcrawler, dict_channel_ind)

In [None]:
# Project the embedding in 2D with t-SNE and cache the result on disk.
# The projection is expensive, so it is only recomputed when the cached file
# is missing; this keeps "Restart & Run All" cheap while still letting the
# notebook run on a machine where the cache has not been produced yet.
TSNE_PATH = '/home/jouven/youtube_projects/word2vec_pytorch/embedding_space/run_channels_more_300/combination_then_channel_sampling/CONTEXT_True_100_SUBSAMPLING_False_LR_0.005//tsne.npy'
if not os.path.exists(TSNE_PATH):
    EMBEDDING = get_dataframe_in_embedding_space(PATH)

    print("\n", "running TSNE...")
    tsne = TSNE(n_components=2).fit_transform(EMBEDDING)
    print("tsne.shape: ", tsne.shape)

    # Save the projected embedding in the 2D plane
    os.makedirs(os.path.dirname(TSNE_PATH), exist_ok=True)
    np.save(TSNE_PATH, tsne)

In [None]:
# Load the cached 2D projection of the embedding from disk.
tsne = np.load('/home/jouven/youtube_projects/word2vec_pytorch/embedding_space/run_channels_more_300/combination_then_channel_sampling/CONTEXT_True_100_SUBSAMPLING_False_LR_0.005//tsne.npy')

Plot to visualize the embedding in 2D.
The channels appearing in the plot are the channels having more than 4000k subscribers. In addition, we have 20 colored channels representing the selected pairs in the "axis projection process". 

In [None]:
# Sanity check of the index -> label mapping on a known channel id
# (the one used above for `Beyond The Trailer`).
ix_to_word[dict_channel_ind['UCAXR2zenCwvRIyQd9ydtfaA']]

In [None]:
############ VISUALIZING ############
# Scatter a random subset of the channels listed in the pickle below on the
# 2D projection, plus a few hand-picked channels drawn with a larger label.
SAMPLES = 500

# Candidate channel ids for the plot.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_more_4000k_subs.pkl", 'rb') as f:
    candidate_channels = list(pickle.load(f))

# Hand-picked channels that are always plotted and highlighted with a bigger font.
extended = ['UCISF5OGuAtSLNF24TKTnXag', 'UCej8z9NGaA9Hdd8k5hueXKw']

# Never ask for more samples than there are candidates (random.sample would raise).
test_words = random.sample(candidate_channels, min(SAMPLES, len(candidate_channels)))
test_words += extended

fig, ax = plt.subplots(figsize=(50, 50))
for channel_id in test_words:
    # Row of `tsne` for this channel. The embedding index is used directly
    # rather than going index -> name -> index through `word_to_ix`, which
    # could select another channel's point if two channels share a name.
    vocab_idx = dict_channel_ind[channel_id]
    word = ix_to_word[vocab_idx]
    coord = tsne[vocab_idx]
    # One scatter call per point keeps the original per-point colour cycling.
    ax.scatter(coord[0], coord[1])
    ax.annotate(word, xy=(coord[0], coord[1]),
                ha='right', va='bottom', color='gray',
                fontsize=30 if channel_id in extended else 12)
#plt.savefig("/home/jouven/youtube_projects/w2v.png")
ax.set_title("Embedding visualization in 2D using channels having more than 4000k subsribers")
plt.show()