## NOTEBOOK DESCRIPTION:

This notebook checks the behaviour of the embedding space: 
- It first manually checks the nearest neighbors of some selected channels
- It also computes the user jumper score, which is a first barrier in the evaluation of the embedding

In [21]:
import pickle
import os
import sys
import random

import numpy as np
import pandas as pd


# Add the project directory to sys.path so that the `helpers` modules imported
# just below can be resolved (presumably this directory contains the `helpers`
# package — confirm).
# NOTE(review): this is an absolute, user-specific path; adjust it when running
# the notebook from another account or machine.
scriptpath = "/home/jouven/youtube_projects/"
sys.path.append(os.path.abspath(scriptpath))

from helpers.helpers_channels_more_300 import *
from helpers.helpers_channel_embedding import *

### Useful functions

In [22]:
# Selected channels and id-index mapping.
# NOTE(review): judging by how these names are used further down,
# `dict_channel_ind` is indexed by channel id to obtain the index passed to the
# nearest-neighbour helper, `dict_ind_channel` is presumably the inverse mapping,
# and `channels_id` is the collection of selected channel ids used to filter the
# metadata table — confirm against the helper's definition.
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [23]:
# Path of the channel embedding file (.npy) that is passed to the
# nearest-neighbour and user-walk helpers in the cells below.
# NOTE(review): the directory name appears to encode the training
# hyper-parameters (size, sub-sampling, negatives, threshold) — confirm.
PATH = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_20/channel_embedding.npy'

In [24]:
# Load the channel metadata table (tab-separated, compression inferred from the
# `.gz` extension).
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/df_channels_en.tsv.gz", sep='\t')
# Keep only the rows whose channel id belongs to the selected channels.
# `Series.isin` performs a vectorised, hash-based membership test, instead of
# calling a Python lambda once per row (which is a linear scan per row when
# `channels_id` is a list).
channelcrawler = channelcrawler[channelcrawler['channel'].isin(channels_id)]

In [25]:
# Preview the first rows of the filtered channel metadata.
channelcrawler.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [26]:
# List the distinct values of the `category_cc` column (missing categories show up as NaN).
channelcrawler['category_cc'].unique()

array(['Gaming', 'Education', 'Entertainment', 'Howto & Style', 'Sports',
       'Music', 'Film and Animation', 'Comedy', 'Nonprofits & Activism',
       'People & Blogs', 'News & Politics', 'Science & Technology',
       'Pets & Animals', 'Autos & Vehicles', 'Travel & Events', nan],
      dtype=object)

### Manually check the nearest neighbors of some selected channels

First, to check how good the embedding space is, we choose a channel and look at its k closest channels in the embedding space. By looking at these channels on the YouTube website, we get a sense of how good or bad the embedding is.

In [27]:
# Nearest neighbors of `Shaaanxo`, a girls' fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'],  # index looked up for the query channel id
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the returned channel ids; `channel` is the
# only column shared by both frames, so it is the join key.
nearest_neighbors = (
    pd.DataFrame(nearest_neighbors_id, columns=['channel'])
    .merge(channelcrawler, on='channel')
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,Shaaanxo,3215340,1422,3566.0,2.278
1,UC21yq4sq8uxTcfgIxxyE9VQ,Howto & Style,2011-06-27,Carli Bybel,6190000,546,1234.0,2.087
2,UCK2ACorzpH-igxuHZ2ObCEA,People & Blogs,2010-03-10,itsJudysLife,1660000,2893,8765.0,2.583
3,UCz0Qnv6KczUe3NH1wnpmqhA,Howto & Style,2006-11-27,Nicole Guerriero,2870000,664,4160.0,2.34
4,UCxj0QizmFhx7kVKSArHBCTA,Howto & Style,2011-03-09,Casey Holmes,1770000,730,8208.0,2.537
5,UCZ90tZPn-ue2BmRAcDD88tA,Howto & Style,2010-01-04,Amanda Steele,2700000,437,4606.0,2.387
6,UC9TreTE-iXwfwQl72DzDurA,Howto & Style,2009-01-04,Kandee Johnson,3900000,630,2276.0,2.166
7,UCTKDB3h5zMdf_2siDHsd_yw,Music,2008-09-26,Cimorelli,4980000,631,1593.0,2.087
8,UCJttQwY6KBhwov_2wKywtqA,Howto & Style,2010-06-27,NikkiPhillippi,1360000,959,11569.0,2.677
9,UC7u7O4kGomeLwRxh0lhkDaQ,Entertainment,2010-01-04,Lindsey Hughes,928000,705,18726.0,2.8985


In [28]:
# Nearest neighbors of `Alex Costa`, a men's fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCZyCposXwcyopaACep44maQ'],  # index looked up for the query channel id
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the returned channel ids; `channel` is the
# only column shared by both frames, so it is the join key.
nearest_neighbors = (
    pd.DataFrame(nearest_neighbors_id, columns=['channel'])
    .merge(channelcrawler, on='channel')
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCZyCposXwcyopaACep44maQ,Howto & Style,2011-09-29,Alex Costa,2120000,543,5370.0,2.417
1,UCbvIIQc5Jo9-jIXnkPe03oA,People & Blogs,2015-09-19,Matti Haapoja,749000,346,20693.0,2.9325
2,UCvcm_wQUH4jvM33JgwVJ5Ew,Autos & Vehicles,2014-09-02,Roads Untraveled,224000,408,79028.0,3.672
3,UCjo7oMbNu181jPBs3_lZIJA,Music,2014-06-13,Buffet Boys,680000,156,24769.0,2.9495
4,UCbjw-VrJCUb8PTGoiDI2UMw,Autos & Vehicles,2016-01-03,iamjake,153000,510,104404.0,4.3545
5,UCUpC8yomtIEfrTXLjk-wAmw,Autos & Vehicles,2006-08-07,DoctaM3,128000,1145,130835.0,4.231
6,UCCxawQ5mHZEPP9SvMFar1RA,Autos & Vehicles,2015-05-06,More Skids,146000,365,116776.0,4.5795
7,UCaxW6r282iWvzJmr3BwGc-A,Autos & Vehicles,2015-10-08,Auto Addiction,305592,331,53839.0,3.517
8,UCvhnYODy6WQ0mw_zi3V1h0g,Science & Technology,2011-10-11,JuanBagnell,94189,1139,161753.0,5.4665
9,UCQIyqMWCdx1GBvbw_Yi6lEA,Sports,2006-03-16,Aston Martin Red Bul...,547456,604,29225.0,2.9935


In [None]:
# Nearest neighbors of `Magpiepony`, a kids' film channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'],  # index looked up for the query channel id
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the returned channel ids; `channel` is the
# only column shared by both frames, so it is the join key.
nearest_neighbors = (
    pd.DataFrame(nearest_neighbors_id, columns=['channel'])
    .merge(channelcrawler, on='channel')
)
nearest_neighbors

In [None]:
# Nearest neighbors of `Beyond the trailer`, a film-analysis channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCAXR2zenCwvRIyQd9ydtfaA'],  # index looked up for the query channel id
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the returned channel ids; `channel` is the
# only column shared by both frames, so it is the join key.
nearest_neighbors = (
    pd.DataFrame(nearest_neighbors_id, columns=['channel'])
    .merge(channelcrawler, on='channel')
)
nearest_neighbors

### User jumper implementation

In [4]:
# Load the pre-sampled channel pairs used by the user-walk evaluation:
# 3 000 users were chosen at random and, for each user, a random pair of channels was selected.
# channels_tuple = [(channel_1, channel_2), (...)]
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted location.
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_user_walk.pkl", 'rb') as f:
    channels_tuple = pickle.load(f)
# No explicit f.close() is needed: the `with` block closes the file on exit.

In [None]:
# Embedding files to evaluate; only the single embedding at PATH is listed here.
files = [PATH]
# Run the user-walk evaluation on the listed embedding file(s) using the
# sampled channel pairs.
# NOTE(review): the three results are presumably lists with one entry per file
# (user-walk score, its "new way" variant, ranking-position ratio) — confirm
# against the helper's definition.
users_walk_tab, users_walk_tab_new, ranking_position_tab = get_user_walk_and_position_ratio(files, channels_tuple)

In [16]:
# Report the three results returned by the user-walk evaluation.
# `sep=''` keeps each label and its value joined exactly as the labels are written.
print('User walk = ', users_walk_tab, sep='')
print("User walk (new way) = ", users_walk_tab_new, sep='')
print('Ranking position ratio = ', ranking_position_tab, sep='')

[0.4285789192126741]