## NOTEBOOK DESCRIPTION:

This notebook checks the behaviour of the embedding space: 
- It first checks manually the nearest neighbors of some selected channels
- It also computes the user jumper score, which is a first barrier in the evaluation of the embedding

In [22]:
import pickle
import os
import sys
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

# Make the project root importable so that the `helpers` package can be resolved.
# The original cell appended this same directory twice (once with and once without a
# trailing slash, which `os.path.abspath` normalises to the same entry); a single guarded
# append is enough and keeps `sys.path` clean when the cell is re-run.
# NOTE(review): absolute, user-specific path -- adjust when running on another machine.
scriptpath = "/home/jouven/youtube_projects"
if os.path.abspath(scriptpath) not in sys.path:
    sys.path.append(os.path.abspath(scriptpath))

# Star imports are kept as-is (and in the original order) because later cells rely on names
# they inject, e.g. `filtered_channels_index_id_mapping`, `get_k_nearest_neighbors`,
# `get_user_walk_and_position_ratio` and `COMMON_PATH`.
from helpers.helpers_channels_more_10k import *
from helpers.helpers_channel_embedding import *
from helpers.config_threshold_value import *

### Useful functions

In [2]:
# Selected channels and id-index mapping.
# `filtered_channels_index_id_mapping` is provided by the star-imported project helpers.
# Below, `dict_channel_ind` is indexed with channel id strings, `dict_ind_channel` is passed
# to `get_k_nearest_neighbors`, and `channels_id` is used for membership filtering.
# NOTE(review): presumably the two dicts are inverse mappings (id -> index, index -> id) -- confirm in helpers.
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [3]:
# Embedding file evaluated in this notebook: it is used both for the nearest-neighbour
# checks and for the user jumper score below.
# NOTE(review): the file name suggests a 200-component reduction (the helper logs `n_comp 200`) -- confirm.
PATH = '/dlabdata1/youtube_large/jouven/channel_embedding/channels_more_10k/proximity_graph/channel_by_channel_ln_30/reduced_fpca_200.npz'

In [4]:
# Load the channel metadata table (tab-separated, gzip-compressed).
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/df_channels_en.tsv.gz", sep='\t')
# Keep only the rows whose channel id belongs to the selected channels.
# `Series.isin` is vectorized, whereas the previous `apply(lambda row: row in channels_id)`
# ran one Python-level membership test per row.
channelcrawler = channelcrawler[channelcrawler['channel'].isin(channels_id)]

In [5]:
# Preview the first rows of the filtered channel metadata table.
channelcrawler.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [6]:
# List the distinct category labels; the recorded output contains `nan`,
# i.e. some of the selected channels have no category.
channelcrawler['category_cc'].unique()

array(['Gaming', 'Education', 'Entertainment', 'Howto & Style', 'Sports',
       'Music', 'Film and Animation', 'Comedy', 'Nonprofits & Activism',
       'People & Blogs', 'News & Politics', 'Science & Technology',
       'Pets & Animals', 'Autos & Vehicles', 'Travel & Events', nan],
      dtype=object)

### Manually check the nearest neighbors of some selected channels

First, to check how good the embedding space is, we choose a channel and its k closest channels in the embedding space. By looking at these channels on the YouTube website, we get a sense of how good or bad the embedding is.

In [7]:
# Nearest neighbors of `Shaaanxo`, a girl fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'],
    dict_ind_channel,
    k=20,
    embedding_type='proximity_graph',
)
# Attach the channel metadata. `channel` is the only shared column, so this is the same
# inner join as the default merge: neighbors absent from `channelcrawler` are dropped.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,Shaaanxo,3215340,1422,3566.0,2.278
1,UC21yq4sq8uxTcfgIxxyE9VQ,Howto & Style,2011-06-27,Carli Bybel,6190000,546,1234.0,2.087
2,UCz0Qnv6KczUe3NH1wnpmqhA,Howto & Style,2006-11-27,Nicole Guerriero,2870000,664,4160.0,2.34
3,UC9TreTE-iXwfwQl72DzDurA,Howto & Style,2009-01-04,Kandee Johnson,3900000,630,2276.0,2.166
4,UCK2ACorzpH-igxuHZ2ObCEA,People & Blogs,2010-03-10,itsJudysLife,1660000,2893,8765.0,2.583
5,UCtC8m_F7jX2fGOQ3_nomg4g,Howto & Style,2006-11-08,Emily Noel,1020000,1750,14802.0,2.7895
6,UCjEPwaev8_zrpafISAPhruA,Howto & Style,2007-06-07,Sam & Nic Chapman,2160000,922,6338.0,2.4625
7,UCLFW3EKD2My9swWH4eTLaYw,Howto & Style,2012-03-06,Chloe Morello,2700000,518,4490.0,2.368
8,UC0FK_3h0Ec2iBKJ_Ah2su7A,Howto & Style,2008-04-07,itsjudytime,1330000,643,11957.0,2.6995
9,UCCvoAe__WFYMNAEN-C-CtYA,Howto & Style,2008-08-05,Wayne Goss,3670000,1357,2740.0,2.218


In [8]:
# Nearest neighbors of `Alex Costa`, a men fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCZyCposXwcyopaACep44maQ'],
    dict_ind_channel,
    k=20,
    embedding_type='proximity_graph',
)
# Attach the channel metadata. `channel` is the only shared column, so this is the same
# inner join as the default merge: neighbors absent from `channelcrawler` are dropped.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCqjwF8rxRsotnojGl4gM0Zw,Sports,2013-02-05,OFFICIALTHENX,4280000,212,1893.0,2.087
1,UCC-ygwC3ZfSRhIcao8a2zyQ,Howto & Style,2009-01-27,Slikhaar TV - Mens h...,2030000,603,6676.0,2.4925
2,UCRAWGR5ySuIDNrotce8pI3w,Entertainment,2015-02-15,SinsTV,1210000,113,12269.0,2.7
3,UCs_6DXZROU29pLvgQdCx4Ww,Education,2014-07-26,Dan Lok,1900000,1363,6772.0,2.492
4,UC97k3hlbE-1rVN8y56zyEEA,Howto & Style,2009-02-02,Bodybuilding.com,4540000,2398,1793.0,2.087
5,UCDOZDfbVGOrywV4n0WeaRAA,Science & Technology,2011-10-13,Bernardo Almeida,389130,525,43443.0,3.3865
6,UCVVAnxQ2YMC_qlc7QfPA2YQ,Howto & Style,2015-05-20,AlmazanKitchen,2740000,371,3711.0,2.27
7,UCfYCRj25JJQ41JGPqiqXmJw,Travel & Events,2007-11-04,Sam Chui,1507443,270,7688.0,2.5365
8,UC0ahC64OhIAS11TJX9Ig86A,Sports,2012-08-04,STRENGTH WARS,1146994,135,14106.0,2.8115
9,UCWwgaK7x0_FR1goeSRazfsQ,Science & Technology,2006-03-11,Samsung,3880000,939,2303.0,2.1665


In [9]:
# Nearest neighbors of `Magpiepony`, a kids film channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'],
    dict_ind_channel,
    k=20,
    embedding_type='proximity_graph',
)
# Attach the channel metadata. `channel` is the only shared column, so this is the same
# inner join as the default merge: neighbors absent from `channelcrawler` are dropped.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,Magpiepony,736000,304,21120.0,2.9225
1,UC94Z4HZJkhPm94YPH1GE3bw,Entertainment,2011-12-13,EileMonty,504000,211,33178.0,3.202
2,UC8Wj98MR_oUHBpTjLsE3HuA,Film and Animation,2010-10-10,Pinkie Rose,247000,167,72414.0,3.805
3,UCPflSzRRyzK2hhbVxNAnrhw,Film and Animation,2012-03-31,Pinkie Pie,1229394,451,11840.0,2.6965
4,UCHKDtSVVEc686EW7jRSCpZA,Entertainment,2007-04-01,Scribbler Production...,327000,1767,54604.0,3.511
5,UCMzfiTDOiuax4C7H1Uu88mw,Gaming,2012-02-17,eric wartick,168734,103,102730.0,3.9895
6,UCsExrmgcYO0e--Q_hCdKfvg,Entertainment,2010-05-26,Nerd Caliber,368000,2145,41049.0,3.37
7,UCcZjYWpM72s-1cinYfeSCag,People & Blogs,2010-08-05,DisneyFanatic2364,205000,1026,88028.0,4.3405
8,UCVv1vQYLgJ6STMJkK2aWMyQ,Entertainment,2015-07-07,Tridashie,252744,48,67004.0,4.0465
9,UCn5wtpa-WcFPeAXZumbZlyw,Film and Animation,2011-07-20,EDplus777,687592,214,24914.0,2.964


In [10]:
# Nearest neighbors of `Beyond the trailer`, a channel for analysis of films.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCAXR2zenCwvRIyQd9ydtfaA'],
    dict_ind_channel,
    k=20,
    embedding_type='proximity_graph',
)
# Attach the channel metadata. `channel` is the only shared column, so this is the same
# inner join as the default merge: neighbors absent from `channelcrawler` are dropped.
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    on='channel',
    how='inner',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCAXR2zenCwvRIyQd9ydtfaA,Entertainment,2008-11-03,Beyond The Trailer,853000,5064,19744.0,2.8635
1,UC20DNxT_UjT49mYOIocJAww,Entertainment,2012-06-20,ComicBookCast2,473000,7153,34911.0,3.223
2,UCkDSAQ_5-yx5hmuvUcsJL7A,Entertainment,2013-08-16,Mr Sunday Movies,1090000,745,14457.0,2.8405
3,UCKxQmKgrkUv4S7P5w0pLayw,Entertainment,2013-07-09,Comics Explained,1750000,1886,8074.0,2.555
4,UCQMbqH7xJu5aTAPQ9y_U7WQ,Entertainment,2009-03-05,Fandom Entertainment...,1570000,10813,9597.0,2.5995
5,UCQzdMyuz0Lf4zo4uGcEujFw,Entertainment,2007-09-29,GameofThrones,4751781,822,1713.0,2.087
6,UCCqEeDAUf4Mg0GgEN658tkA,Entertainment,2011-01-06,Chris Stuckmann,1580000,1400,8486.0,2.56
7,UC64mtTJJgsU0Lty-7aCB-7w,Film and Animation,2011-06-25,Trevschan2,229000,3926,78996.0,3.678
8,UC7yRILFFJ2QZCykymr8LPwA,News & Politics,2011-10-08,New Rockstars,2340000,1045,5223.0,2.397
9,UCBS7ypf4ccm6e_bu35EiAAA,Film and Animation,2009-04-11,Everything Always,467000,420,31489.0,3.1295


### User jumper implementation

In [13]:
# Load the precomputed channel pairs used by the user jumper evaluation.
# Per the original notes: 3 000 users were randomly chosen and, for each user, a random
# pair of channels was selected, giving
# channels_tuple = [(channel_1, channel_2), (...)]
# NOTE: `pickle.load` can execute arbitrary code -- only load files from a trusted source.
with open(os.path.join(COMMON_PATH, "channels_tuple_user_walk.pkl"), 'rb') as f:
    # The `with` block closes the file on exit, so no explicit `f.close()` is needed.
    channels_tuple = pickle.load(f)

In [16]:
# Evaluate the single embedding file selected above; the helper takes a list of paths.
# NOTE(review): the printed results are one-element lists, presumably one entry per file -- confirm in helpers.
files = [PATH]
users_walk_tab, ranking_position_tab = get_user_walk_and_position_ratio(
    files,
    channels_tuple,
    embedding_type='proximity_graph',
)

file  /dlabdata1/youtube_large/jouven/channel_embedding/channels_more_10k/proximity_graph/channel_by_channel_ln_30/reduced_fpca_200.npz
n_comp  200


In [21]:
# Report the two metrics returned by `get_user_walk_and_position_ratio`.
print(f'User walk = {users_walk_tab!s}')
print(f'Ranking position ratio = {ranking_position_tab!s}')

User walk = [19.1791386908005]
Ranking position ratio = [0.6086923432273759]
