## NOTEBOOK DESCRIPTION:

This notebook checks the behaviour of the embedding space: 
- It first manually checks the nearest neighbors of some selected channels
- It also computes the user jumper score, which is a first barrier in the evaluation of the embedding

In [1]:
import pickle
import os
import sys
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

scriptpath = "/home/jouven/youtube_projects/"
sys.path.append(os.path.abspath(scriptpath))

from helpers.helpers_channels_more_300 import *
from helpers.helpers_channel_embedding import *

scriptpath = "/home/jouven/youtube_projects"
sys.path.append(os.path.abspath(scriptpath))
from helpers.config_threshold_value import *

### Useful functions

In [2]:
# Load the selected channels together with the channel-id <-> index lookups
(
    dict_channel_ind,
    dict_ind_channel,
    channels_id,
) = filtered_channels_index_id_mapping()

In [3]:
# Embedding file inspected throughout this notebook.
# The literal is split per path segment group for readability; adjacent
# string literals are concatenated into the same single path.
PATH = (
    '/dlabdata1/youtube_large/jouven/word2vec_pytorch/'
    'run_channels_more_300/channel_sampling_then_permutation/'
    'CONTEXT_True_20_SUBSAMPLING_True_0.0043_LR_0.018/'
    'models/embedding0.csv.gz'
)

In [4]:
# Load the channel metadata table (tab-separated file).
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/df_channels_en.tsv.gz", sep='\t')
# Keep only the rows whose channel id is one of the selected channels.
# `isin` performs the membership test in a vectorized way instead of
# evaluating a Python-level `in` check once per row through `apply`.
channelcrawler = channelcrawler[channelcrawler['channel'].isin(channels_id)]

In [5]:
# Preview the first five rows of the filtered channel metadata
channelcrawler.head(n=5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [6]:
# Distinct category labels present among the selected channels
channelcrawler.loc[:, 'category_cc'].unique()

array(['Gaming', 'Education', 'Entertainment', 'Howto & Style', 'Sports',
       'Music', 'Film and Animation', 'Comedy', 'Nonprofits & Activism',
       'People & Blogs', 'News & Politics', 'Science & Technology',
       'Pets & Animals', 'Autos & Vehicles', 'Travel & Events', nan],
      dtype=object)

### Manually check the nearest neighbors of some selected channels

First, to check how good the embedding space is, we choose a channel and its k closest channels in the embedding space. By looking at these channels on the YouTube website, we get a sense of how good or bad the embedding is.

In [7]:
# Nearest neighbors of `Shaaanxo`, a girl fashion channel
query_channel_id = 'UCMpOz2KEfkSdd5JeIJh_fxw'
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind[query_channel_id],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata; the inner join keeps only the neighbors
# that have a row in `channelcrawler`.
neighbors_frame = pd.DataFrame(nearest_neighbors_id, columns=['channel'])
nearest_neighbors = neighbors_frame.merge(channelcrawler, on='channel', how='inner')
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,Shaaanxo,3215340,1422,3566.0,2.278
1,UC6jgzx2g3nlbaYkd8EMweKA,Howto & Style,2010-09-07,Jaclyn Hill,5890000,344,1348.0,2.087
2,UCbO9bltbkYwa56nZFQx6XJg,Howto & Style,2014-07-17,Manny Mua,4800000,426,1910.0,2.087
3,UCKMugoa0uHpjUuq14yOpagw,Howto & Style,2009-06-09,Laura Lee,4400000,638,2083.0,2.12
4,UCzTKskwIc_-a0cGvCXA848Q,Howto & Style,2008-06-23,NikkieTutorials,12269235,742,312.0,2.087
5,UCGwPbAQdGA3_88WBuGtg9tw,Howto & Style,2010-12-03,grav3yardgirl,8560000,1536,725.0,2.087
6,UC21yq4sq8uxTcfgIxxyE9VQ,Howto & Style,2011-06-27,Carli Bybel,6190000,546,1234.0,2.087
7,UC9TreTE-iXwfwQl72DzDurA,Howto & Style,2009-01-04,Kandee Johnson,3900000,630,2276.0,2.166
8,UCF2oW5-MO8dB6ul9WH9xi0A,People & Blogs,2007-01-04,blndsundoll4mj,4900000,2420,1808.0,2.087
9,UCc6W7efUSkd9YYoxOnctlFg,Entertainment,2009-06-08,Bethany Mota,10200000,483,545.0,2.087


In [8]:
# Nearest neighbors of `Alex Costa`, a men fashion channel
query_channel_id = 'UCZyCposXwcyopaACep44maQ'
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind[query_channel_id],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata; the inner join keeps only the neighbors
# that have a row in `channelcrawler`.
neighbors_frame = pd.DataFrame(nearest_neighbors_id, columns=['channel'])
nearest_neighbors = neighbors_frame.merge(channelcrawler, on='channel', how='inner')
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCZyCposXwcyopaACep44maQ,Howto & Style,2011-09-29,Alex Costa,2120000,543,5370.0,2.417
1,UChNN7VBxPTiNrqjUaQd9bxA,Howto & Style,2012-09-14,Teachingmensfashion,4210000,1160,2093.0,2.1225
2,UC_hoQDD6zKcIqpIYLsFbBeA,People & Blogs,2013-09-22,Mo Vlogs,8280000,1500,693.0,2.087
3,UCJ5v_MCY6GNUBTO8-D3XoAg,Sports,2007-05-11,WWE,48400000,43421,11.0,2.087
4,UC64guZp8DXzqrIQ5OPedviw,Entertainment,2014-05-24,HoomanTV,7880000,61,706.0,2.087
5,UCqhnX4jA0A5paNd1v-zEysw,Sports,2009-03-11,GoPro,7810000,2096,762.0,2.087
6,UCBJycsmduvYEL83R_U4JriQ,Science & Technology,2008-03-21,Marques Brownlee,9400000,1148,530.0,2.087
7,UCSDMmlBoH5wJrKm82DpqXDA,Gaming,2014-12-17,General Tony,2460000,2178,5095.0,2.3905
8,UCO2WJZKQoDW4Te6NHx4KfTg,Science & Technology,2011-01-06,Geekyranjit,2766278,2648,4144.0,2.336
9,UCTqMx8l2TtdZ7_1A40qrFiQ,Science & Technology,2013-06-18,XEETECHCARE,1300000,4011,10892.0,2.625


In [9]:
# Nearest neighbors of `Magpiepony`, a kids film channel
query_channel_id = 'UCISF5OGuAtSLNF24TKTnXag'
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind[query_channel_id],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata; the inner join keeps only the neighbors
# that have a row in `channelcrawler`.
neighbors_frame = pd.DataFrame(nearest_neighbors_id, columns=['channel'])
nearest_neighbors = neighbors_frame.merge(channelcrawler, on='channel', how='inner')
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,Magpiepony,736000,304,21120.0,2.9225
1,UCfR-4yZACIMRs3wNf0hel6g,Gaming,2013-06-18,Bijuu Mike,2120000,1808,5865.0,2.4595
2,UC_nEHeUEVNY5ZYLRWg8KoZQ,Film and Animation,2016-11-06,Wolfychu,2480000,101,4780.0,2.364
3,UCzfyYtgvkx5mLy8nlLlayYg,Film and Animation,2012-09-09,Vivziepop,1430000,225,10041.0,2.6435
4,UCqg2eLFNUu3QN3dttNeOWkw,Gaming,2010-08-20,iHasCupquake,6620000,4261,1109.0,2.087
5,UCfguohQ13Uez5UWqJbZA5Qg,Music,2012-09-09,VideoGameRapBattles,365258,93,46158.0,3.309
6,UCzYfz8uibvnB7Yc1LjePi4g,Gaming,2012-08-13,Aphmau,4840000,2991,1807.0,2.087
7,UCD_VOth7RmckN6DbmFJa__A,Entertainment,2013-08-31,The Brony Notion,256132,158,70689.0,3.742
8,UCK1HgDhsRulLj-pWi8XrqIA,Gaming,2012-07-23,Razzbowski,1160000,2101,13434.0,2.8135
9,UCt4qeayu0d0Yo5NHx_3xY9A,Gaming,2012-11-12,SirSkyward,185000,1920,97587.0,3.395


In [10]:
# Nearest neighbors of `Beyond The Trailer`, a channel for analysis of films
query_channel_id = 'UCAXR2zenCwvRIyQd9ydtfaA'
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind[query_channel_id],
    dict_ind_channel,
    k=20,
)
# Attach the channel metadata; the inner join keeps only the neighbors
# that have a row in `channelcrawler`.
neighbors_frame = pd.DataFrame(nearest_neighbors_id, columns=['channel'])
nearest_neighbors = neighbors_frame.merge(channelcrawler, on='channel', how='inner')
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCAXR2zenCwvRIyQd9ydtfaA,Entertainment,2008-11-03,Beyond The Trailer,853000,5064,19744.0,2.8635
1,UCCqEeDAUf4Mg0GgEN658tkA,Entertainment,2011-01-06,Chris Stuckmann,1580000,1400,8486.0,2.56
2,UCjmJDM5pRKbUlVIzDYYWb6g,Entertainment,2006-10-17,Warner Bros. Picture...,7660000,2397,779.0,2.087
3,UCDiFRMQWpcp8_KD4vwIVicw,Entertainment,2012-10-12,Emergency Awesome,3210000,3949,3268.0,2.275
4,UC8-Th83bH_thdKZDJCrn88g,Comedy,2006-01-08,The Tonight Show Sta...,22100000,5450,96.0,2.087
5,UCaWd5_7JhbQBe4dknZhsHJg,Entertainment,2007-01-25,WatchMojo.com,21284359,17081,114.0,2.087
6,UCay_OLhWtf9iklq8zg_or0g,People & Blogs,2013-04-09,As/Is,10600000,3932,484.0,2.087
7,UCOpcACMWblDls9Z6GERVi1A,Film and Animation,2008-10-03,Screen Junkies,6660000,1014,1096.0,2.087
8,UCp0hYYBW6IMayGgR-WeoCvQ,Entertainment,2006-11-21,TheEllenShow,34400000,10692,28.0,2.087
9,UCP1iRaFlS5EYjJBryFV9JPw,Entertainment,2015-11-25,Looper,4820000,2533,1758.0,2.087


### User jumper implementation

In [7]:
# Load the pre-computed channel pairs used by the user jumper evaluation.
# Per the original notes: 3 000 users were chosen at random and, for each
# user, a random pair of channels was selected, giving
# channels_tuple = [(channel_1, channel_2), (...)]
# NOTE(review): pickle.load can execute arbitrary code, so this file must
# come from a trusted location.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(os.path.join(COMMON_PATH, "channels_tuple_user_walk.pkl"), 'rb') as f:
    channels_tuple = pickle.load(f)

In [8]:
# The helper takes a list of embedding files; only one is evaluated here
files = [PATH]
(
    users_walk_tab,
    ranking_position_tab,
) = get_user_walk_and_position_ratio(files, channels_tuple)

file  /dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/channel_sampling_then_permutation/CONTEXT_True_20_SUBSAMPLING_True_0.0043_LR_0.018/models/embedding0.csv.gz
n_comp  200


In [9]:
# Report both evaluation results for the embedding file(s) evaluated above
print('User walk = {}'.format(users_walk_tab))
print('Ranking position ratio = {}'.format(ranking_position_tab))

User walk = [0.41959635266233225]
Ranking position ratio = [0.0871721767888738]
