## NOTEBOOK DESCRIPTION:

This notebook checks the behaviour of the embedding space:
- It first manually inspects the nearest neighbors of some selected channels.
- It then computes the user jumper score, which serves as a first sanity check when evaluating the embedding.

In [3]:
import pickle
import os
import sys
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

# Make the project root importable so that the local `helpers` package resolves.
# The path is registered only once: os.path.abspath() normalises a trailing
# slash away, so appending it a second time (or on every re-run of this cell)
# would only add duplicate entries to sys.path.
scriptpath = "/home/jouven/youtube_projects"
project_root = os.path.abspath(scriptpath)
if project_root not in sys.path:
    sys.path.append(project_root)

# NOTE(review): the star imports hide where names such as
# `filtered_channels_index_id_mapping`, `get_k_nearest_neighbors` and
# `COMMON_PATH` come from; consider importing them explicitly.
from helpers.helpers_channels_more_300 import *
from helpers.helpers_channel_embedding import *
from helpers.config_threshold_value import *

### Useful functions

In [4]:
# Selected channels and the two lookup tables between channel ids and indices:
#   dict_channel_ind : looked up below with a channel id to obtain its index
#   dict_ind_channel : passed to the nearest-neighbor helper below
#   channels_id      : collection of selected channel ids, used below to filter the metadata table
# NOTE(review): the exact container types are defined by the helper — confirm there.
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [5]:
# Embedding file (gzip-compressed CSV) evaluated throughout this notebook.
# NOTE(review): the directory name appears to encode the training
# hyper-parameters (context, subsampling, learning rate, negatives) — confirm.
PATH = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/channel_sampling_then_permutation/CONTEXT_True_20_SUBSAMPLING_False_0.0043_LR_0.018_NEG_100/models/embedding0.csv.gz'

In [6]:
# Channel metadata table (tab-separated, gzip-compressed).
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/df_channels_en.tsv.gz", sep='\t')
# Keep only the rows whose channel id belongs to the selected channels.
# Series.isin performs one vectorised membership test instead of calling a
# Python lambda (and a potentially linear `in` lookup) for every row.
channelcrawler = channelcrawler[channelcrawler['channel'].isin(channels_id)]

In [7]:
# Preview the first rows of the filtered channel metadata.
channelcrawler.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [8]:
# Distinct channel categories present after filtering
# (the output below also contains NaN, i.e. some channels have no category).
channelcrawler['category_cc'].unique()

array(['Gaming', 'Education', 'Entertainment', 'Howto & Style', 'Sports',
       'Music', 'Film and Animation', 'Comedy', 'Nonprofits & Activism',
       'People & Blogs', 'News & Politics', 'Science & Technology',
       'Pets & Animals', 'Autos & Vehicles', 'Travel & Events', nan],
      dtype=object)

### Manually check the nearest neighbors of some selected channels

First, to check how good the embedding space is, we choose a channel and its k closest channels in the embedding space. By looking at these channels on the YouTube website, we get a sense of how good or bad the embedding is.

In [7]:
# Nearest neighbors of `Shaaanxo`, a girls' fashion channel
query_index = dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw']
nearest_neighbors_id = get_k_nearest_neighbors(PATH, query_index, dict_ind_channel, k=20)
# Attach the channel metadata to each neighbor id (inner join on the shared `channel` column).
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns=['channel']).merge(
    channelcrawler, on='channel', how='inner'
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,Shaaanxo,3215340,1422,3566.0,2.278
1,UCbO9bltbkYwa56nZFQx6XJg,Howto & Style,2014-07-17,Manny Mua,4800000,426,1910.0,2.087
2,UCc6W7efUSkd9YYoxOnctlFg,Entertainment,2009-06-08,Bethany Mota,10200000,483,545.0,2.087
3,UCKMugoa0uHpjUuq14yOpagw,Howto & Style,2009-06-09,Laura Lee,4400000,638,2083.0,2.12
4,UCLwUd5KtYONsRJ3UAOojZ0w,People & Blogs,2007-10-22,Freelee The BananaGi...,786000,947,20184.0,2.875
5,UCzTKskwIc_-a0cGvCXA848Q,Howto & Style,2008-06-23,NikkieTutorials,12269235,742,312.0,2.087
6,UCkvK_5omS-42Ovgah8KRKtg,Howto & Style,2006-02-14,jeffreestar,16000000,357,222.0,2.087
7,UC6jgzx2g3nlbaYkd8EMweKA,Howto & Style,2010-09-07,Jaclyn Hill,5890000,344,1348.0,2.087
8,UC9TreTE-iXwfwQl72DzDurA,Howto & Style,2009-01-04,Kandee Johnson,3900000,630,2276.0,2.166
9,UCzco9CewPf0F-SP1p6LhWrw,Howto & Style,2008-07-07,Gigi Gorgeous,2910000,513,4079.0,2.326


In [8]:
# Nearest neighbors of `Alex Costa`, a men's fashion channel
query_index = dict_channel_ind['UCZyCposXwcyopaACep44maQ']
nearest_neighbors_id = get_k_nearest_neighbors(PATH, query_index, dict_ind_channel, k=20)
# Attach the channel metadata to each neighbor id (inner join on the shared `channel` column).
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns=['channel']).merge(
    channelcrawler, on='channel', how='inner'
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCZyCposXwcyopaACep44maQ,Howto & Style,2011-09-29,Alex Costa,2120000,543,5370.0,2.417
1,UChNN7VBxPTiNrqjUaQd9bxA,Howto & Style,2012-09-14,Teachingmensfashion,4210000,1160,2093.0,2.1225
2,UCJ5v_MCY6GNUBTO8-D3XoAg,Sports,2007-05-11,WWE,48400000,43421,11.0,2.087
3,UC_hoQDD6zKcIqpIYLsFbBeA,People & Blogs,2013-09-22,Mo Vlogs,8280000,1500,693.0,2.087
4,UCBdw4dLCLLHmTgAOnW4V0hQ,Entertainment,2005-11-24,The Rock,4470000,193,1879.0,2.087
5,UCMiJRAwDNSNzuYeN2uWa0pA,Science & Technology,2011-04-20,Mrwhosetheboss,2740000,1215,3367.0,2.289
6,UCu4X846OSea5YU6S8fIpy1A,Comedy,2012-06-20,BigDawsTv,6340000,225,1020.0,2.087
7,UCqhnX4jA0A5paNd1v-zEysw,Sports,2009-03-11,GoPro,7810000,2096,762.0,2.087
8,UCvC4D8onUfXzvjTOM-dBfEA,Entertainment,2005-06-16,Marvel Entertainment...,13200000,6332,302.0,2.087
9,UCbq8_4_mFAx_rzDF5VT7MJw,Howto & Style,2013-11-11,BluMaan,1530000,381,9009.0,2.552


In [9]:
# Nearest neighbors of `Magpiepony`, a kids' film channel
query_index = dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag']
nearest_neighbors_id = get_k_nearest_neighbors(PATH, query_index, dict_ind_channel, k=20)
# Attach the channel metadata to each neighbor id (inner join on the shared `channel` column).
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns=['channel']).merge(
    channelcrawler, on='channel', how='inner'
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,Magpiepony,736000,304,21120.0,2.9225
1,UCzYfz8uibvnB7Yc1LjePi4g,Gaming,2012-08-13,Aphmau,4840000,2991,1807.0,2.087
2,UC1RMBtYn0Fw8Pf7KuvkrRNw,Gaming,2009-11-01,Vannamelon,919000,241,17845.0,2.8605
3,UCHKDtSVVEc686EW7jRSCpZA,Entertainment,2007-04-01,Scribbler Production...,327000,1767,54604.0,3.511
4,UCkQ3eqNNaFrBZ_WbnCIMN8w,Film and Animation,2011-10-19,shgurr,2060000,68,6256.0,2.4685
5,UC94Z4HZJkhPm94YPH1GE3bw,Entertainment,2011-12-13,EileMonty,504000,211,33178.0,3.202
6,UCqg2eLFNUu3QN3dttNeOWkw,Gaming,2010-08-20,iHasCupquake,6620000,4261,1109.0,2.087
7,UC-r13SLLdZtZNmuC2bMnlmw,Film and Animation,2013-09-27,Hasbro,3620000,8328,2857.0,2.228
8,UCZBY6V8Lxmwu8gGRBOyO11w,Gaming,2014-06-16,Kubz Scouts,3160000,1440,3303.0,2.2885
9,UC1EBJfK7ltjYUFyzysKxr1g,Gaming,2014-04-05,Yandere Dev,2590000,159,4657.0,2.389


In [10]:
# Nearest neighbors of `Beyond the trailer`, a film-analysis channel
query_index = dict_channel_ind['UCAXR2zenCwvRIyQd9ydtfaA']
nearest_neighbors_id = get_k_nearest_neighbors(PATH, query_index, dict_ind_channel, k=20)
# Attach the channel metadata to each neighbor id (inner join on the shared `channel` column).
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns=['channel']).merge(
    channelcrawler, on='channel', how='inner'
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCAXR2zenCwvRIyQd9ydtfaA,Entertainment,2008-11-03,Beyond The Trailer,853000,5064,19744.0,2.8635
1,UCCqEeDAUf4Mg0GgEN658tkA,Entertainment,2011-01-06,Chris Stuckmann,1580000,1400,8486.0,2.56
2,UCa6vGFO9ty8v5KZJXQxdhaw,Entertainment,2006-09-20,Jimmy Kimmel Live,15200000,4207,212.0,2.087
3,UCjmJDM5pRKbUlVIzDYYWb6g,Entertainment,2006-10-17,Warner Bros. Picture...,7660000,2397,779.0,2.087
4,UCi8e0iOVk1fEOogdfu4YgfA,Film and Animation,2011-04-01,Movieclips Trailers,13900000,8084,267.0,2.087
5,UCqFzWxSCi39LnW1JKFR3efg,Entertainment,2013-07-23,Saturday Night Live,8750000,6873,584.0,2.087
6,UC7v3-2K1N84V67IF-WTRG-Q,Entertainment,2007-03-07,Jeremy Jahns,1600000,1584,8351.0,2.5415
7,UCOpcACMWblDls9Z6GERVi1A,Film and Animation,2008-10-03,Screen Junkies,6660000,1014,1096.0,2.087
8,UCQMbqH7xJu5aTAPQ9y_U7WQ,Entertainment,2009-03-05,Fandom Entertainment...,1570000,10813,9597.0,2.5995
9,UCftwRNsjfRo08xYE31tkiyw,Science & Technology,2005-09-23,WIRED,5100000,2902,1354.0,2.087


### User jumper implementation

In [9]:
# Load the precomputed channel pairs used by the user jumper evaluation.
# Per the original note: 3 000 users were chosen at random and, for each user,
# a random pair of channels was selected, giving
# channels_tuple = [(channel_1, channel_2), (...)]
# NOTE(security): pickle.load can execute arbitrary code — only load this file
# from a trusted location.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "channels_tuple_user_walk.pkl"), 'rb') as f:
    channels_tuple = pickle.load(f)

In [10]:
# Evaluate the embedding file(s) on the sampled channel pairs.
# The helper receives a list of embedding paths; the results printed below are
# one-element lists, presumably one value per file — confirm in the helper.
files = [PATH]
users_walk_tab, ranking_position_tab = get_user_walk_and_position_ratio(files, channels_tuple)

file  /dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/channel_sampling_then_permutation/CONTEXT_True_20_SUBSAMPLING_False_0.0043_LR_0.018_NEG_100/models/embedding0.csv.gz
n_comp  200


In [11]:
# Report the two evaluation results returned for the embedding file(s).
print(f'User walk = {users_walk_tab!s}')
print(f'Ranking position ratio = {ranking_position_tab!s}')

User walk = [0.4400919277871753]
Ranking position ratio = [0.07628508028844176]
