## NOTEBOOK DESCRIPTION:

This notebook checks the behaviour of the embedding space:
- It first manually inspects the nearest neighbors of some selected channels.
- It also computes the user jumper score, which serves as a first barrier in the evaluation of the embedding.

In [1]:
import pickle
import os
import sys
import random

import numpy as np
import pandas as pd


# Make the project root importable so that the local `helpers` package can be found.
# The root is appended once (the original cell appended the same directory twice) and
# only when it is not already present, so re-running the cell does not grow sys.path.
scriptpath = "/home/jouven/youtube_projects"
if os.path.abspath(scriptpath) not in sys.path:
    sys.path.append(os.path.abspath(scriptpath))

# NOTE(review): names used later in the notebook but not defined in it (e.g. COMMON_PATH,
# get_k_nearest_neighbors) presumably come from these wildcard imports; consider
# importing them explicitly so their origin is visible.
from helpers.helpers_channels_more_10k import *
from helpers.helpers_channel_embedding import *
from helpers.config_threshold_value import *

### Useful functions

In [2]:
# Selected channels and id-index mapping.
# NOTE(review): judging by the names and by how they are used in later cells,
# `dict_channel_ind` maps a channel id to its index in the embedding, `dict_ind_channel`
# is the reverse mapping, and `channels_id` is the collection of selected channel ids —
# confirm against the helper's definition.
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [3]:
# Embedding file handed to the nearest-neighbour lookups and to the user-walk evaluation.
# NOTE(review): the file name suggests a 400-dimensional embedding stored as a NumPy
# array — confirm.
PATH = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_10k/channel_embedding_30M_dim_400.npy'

In [4]:
# Load the channel metadata table (tab-separated; the gzip compression is inferred by
# pandas from the file extension).
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/df_channels_en.tsv.gz", sep='\t')
# Keep only the rows whose channel id belongs to the selected channels.
# `isin` against a set is vectorised and replaces the per-row Python membership test,
# which was a linear scan per row whenever `channels_id` is a list.
channelcrawler = channelcrawler[channelcrawler['channel'].isin(set(channels_id))]

In [5]:
# Preview the first rows of the filtered channel metadata.
channelcrawler.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [6]:
# List the distinct values of the `category_cc` column in the filtered metadata.
channelcrawler['category_cc'].unique()

array(['Gaming', 'Education', 'Entertainment', 'Howto & Style', 'Sports',
       'Music', 'Film and Animation', 'Comedy', 'Nonprofits & Activism',
       'People & Blogs', 'News & Politics', 'Science & Technology',
       'Pets & Animals', 'Autos & Vehicles', 'Travel & Events', nan],
      dtype=object)

### Manually check the nearest neighbors of some selected channels

First, to check how good the embedding space is, we choose a channel and its k closest channels in the embedding space. By looking at these channels on the YouTube website, we get a sense of how good or bad the embedding is.

In [7]:
# Nearest neighbors of `Shaaanxo`, a girls' fashion channel, in the embedding space.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata; `channel` is the only column shared by both frames,
# so naming it explicitly gives the same (default, inner) join as before.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,Shaaanxo,3215340,1422,3566.0,2.278
1,UCXbQzhqSvgVZTUyi1T4AU3w,Howto & Style,2011-08-31,Lauren Curtis,3530000,379,3067.0,2.2335
2,UCPRlGA2w7C_DVw-1ynolJYw,People & Blogs,2011-07-19,Shaaanxo Vlogs,555112,742,29412.0,3.015
3,UCLFW3EKD2My9swWH4eTLaYw,Howto & Style,2012-03-06,Chloe Morello,2700000,518,4490.0,2.368
4,UC21yq4sq8uxTcfgIxxyE9VQ,Howto & Style,2011-06-27,Carli Bybel,6190000,546,1234.0,2.087
5,UCz0Qnv6KczUe3NH1wnpmqhA,Howto & Style,2006-11-27,Nicole Guerriero,2870000,664,4160.0,2.34
6,UCE9_EMdxq6C8giPVpAWkJFQ,Entertainment,2011-08-10,BRITTNEYLEESAUNDERS,1100000,584,15533.0,2.8215
7,UC0qI3HpiBua75glb4RV5mWA,Howto & Style,2013-02-21,Samantha Ravndahl,967843,322,17375.0,2.8485
8,UCG9nVSLp4nQlW79sY5ihRrg,Howto & Style,2013-08-17,Desi Perkins,3290000,359,3358.0,2.288
9,UC6jgzx2g3nlbaYkd8EMweKA,Howto & Style,2010-09-07,Jaclyn Hill,5890000,344,1348.0,2.087


In [8]:
# Nearest neighbors of `Alex Costa`, a men's fashion channel, in the embedding space.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCZyCposXwcyopaACep44maQ'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata; `channel` is the only column shared by both frames,
# so naming it explicitly gives the same (default, inner) join as before.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCZyCposXwcyopaACep44maQ,Howto & Style,2011-09-29,Alex Costa,2120000,543,5370.0,2.417
1,UCbq8_4_mFAx_rzDF5VT7MJw,Howto & Style,2013-11-11,BluMaan,1530000,381,9009.0,2.552
2,UC5fqfItld8kQufe3djD6wAw,Howto & Style,2011-10-10,Brett Maverick,636000,222,25157.0,2.956
3,UCZqsC13VeSnTIvdvTK-YEvA,Howto & Style,2012-12-01,Mayank Bhattacharya,231121,510,74028.0,3.7605
4,UC1PkRYud11ogYDqgdqd23Zw,Entertainment,2016-05-24,Based Zeus,1280000,203,11725.0,2.6665
5,UCjQG1tv6johDz6qEuaHk6QQ,People & Blogs,2014-05-18,Christian Nielsen,346125,122,53608.0,3.5505
6,UCC-ygwC3ZfSRhIcao8a2zyQ,Howto & Style,2009-01-27,Slikhaar TV - Mens h...,2030000,603,6676.0,2.4925
7,UCQFmSsxwJwOALJZp3GYf7Cg,Howto & Style,2014-06-05,Negeen Dargahi,166000,119,95742.0,3.701
8,UCQPmOWNza6PMesQaWWBEhJA,People & Blogs,2014-05-05,Anabolic Aliens,577000,723,29200.0,2.9985
9,UChNN7VBxPTiNrqjUaQd9bxA,Howto & Style,2012-09-14,Teachingmensfashion,4210000,1160,2093.0,2.1225


In [9]:
# Nearest neighbors of `Magpiepony`, a kids' film channel, in the embedding space.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata; `channel` is the only column shared by both frames,
# so naming it explicitly gives the same (default, inner) join as before.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,Magpiepony,736000,304,21120.0,2.9225
1,UC94Z4HZJkhPm94YPH1GE3bw,Entertainment,2011-12-13,EileMonty,504000,211,33178.0,3.202
2,UCHKDtSVVEc686EW7jRSCpZA,Entertainment,2007-04-01,Scribbler Production...,327000,1767,54604.0,3.511
3,UC8Wj98MR_oUHBpTjLsE3HuA,Film and Animation,2010-10-10,Pinkie Rose,247000,167,72414.0,3.805
4,UCg3bKR3Z0rCmUe81X06EcaA,Entertainment,2015-11-21,Blank Slate,121000,408,129990.0,4.0625
5,UCPflSzRRyzK2hhbVxNAnrhw,Film and Animation,2012-03-31,Pinkie Pie,1229394,451,11840.0,2.6965
6,UCcZjYWpM72s-1cinYfeSCag,People & Blogs,2010-08-05,DisneyFanatic2364,205000,1026,88028.0,4.3405
7,UCcGuE4cmWhB4XLjSKA8eRFA,Entertainment,2014-08-30,WatchPony,117000,341,144609.0,4.054
8,UCaRdKsgbUdMIp-9FwEsaokg,Film and Animation,2014-02-25,Ashley H,77200,500,196018.0,5.266
9,UChU79FLnClqlRSTSZe3EjNA,Film and Animation,2014-03-21,Flutter525,858000,1297,20180.0,2.8735


In [10]:
# Nearest neighbors of `Beyond the trailer`, a film-analysis channel, in the embedding space.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCAXR2zenCwvRIyQd9ydtfaA'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata; `channel` is the only column shared by both frames,
# so naming it explicitly gives the same (default, inner) join as before.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCAXR2zenCwvRIyQd9ydtfaA,Entertainment,2008-11-03,Beyond The Trailer,853000,5064,19744.0,2.8635
1,UCQMbqH7xJu5aTAPQ9y_U7WQ,Entertainment,2009-03-05,Fandom Entertainment...,1570000,10813,9597.0,2.5995
2,UCYyDbdaja1UDNdFSwUrYVGA,Entertainment,2006-02-18,John Campea,186000,3961,83695.0,4.296
3,UC20DNxT_UjT49mYOIocJAww,Entertainment,2012-06-20,ComicBookCast2,473000,7153,34911.0,3.223
4,UCCqEeDAUf4Mg0GgEN658tkA,Entertainment,2011-01-06,Chris Stuckmann,1580000,1400,8486.0,2.56
5,UC7v3-2K1N84V67IF-WTRG-Q,Entertainment,2007-03-07,Jeremy Jahns,1600000,1584,8351.0,2.5415
6,UCtoMyXF4VFY3cB8fUcn7N4A,Entertainment,2006-09-19,AMC Theatres,396000,9046,44969.0,3.34
7,UCMVCs1F_XGueuaD9AfgTWmg,Entertainment,2008-10-20,Movie Trivia Schmoed...,301000,3294,59054.0,3.5435
8,UCWvMmm_sSdgALpo1Ci4WvtQ,Entertainment,2007-02-04,Collider Videos,583000,9074,30710.0,3.0665
9,UCRX7UEyE8kp35mPrgC2sosA,Film and Animation,2011-06-27,JoBlo Movie Trailers...,2070000,5081,6408.0,2.4595


### User jumper implementation

In [4]:
# We randomly choose 3 000 users and for each user we select a random pair of channels
# channels_tuple = [(channel_1, channel_2), (...)]
# NOTE: pickle.load can execute arbitrary code while deserialising; only load this
# file from a trusted location.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open(os.path.join(COMMON_PATH, "channels_tuple_user_walk.pkl"), 'rb') as f:
    channels_tuple = pickle.load(f)

In [None]:
# Evaluate the single embedding file referenced by PATH on the sampled channel pairs.
# NOTE(review): the helper receives a list of files, so each returned value presumably
# holds one entry per file — confirm against its definition. `users_walk_tab_new` is
# not used by the cells shown in this notebook.
files = [PATH]
users_walk_tab, users_walk_tab_new, ranking_position_tab = get_user_walk_and_position_ratio(files, channels_tuple)

In [16]:
# Report both evaluation results, one per line, in a single print call.
print(
    'User walk = ' + str(users_walk_tab),
    'Ranking position ratio = ' + str(ranking_position_tab),
    sep='\n',
)

[0.4285789192126741]