## NOTEBOOK DESCRIPTION:

This notebook checks the behaviour of the embedding space:
- It first manually inspects the nearest neighbors of some selected channels.
- It then computes the user jumper score, which serves as a first sanity check when evaluating the embedding.

In [1]:
import pickle
import os
import sys
import random

import numpy as np
import pandas as pd


# Make the project's `helpers` package importable.
# The directory is registered once (the original appended the same location twice, the two
# spellings differing only by a trailing slash that `os.path.abspath` strips), and the guard
# keeps `sys.path` free of duplicate entries when this cell is re-run.
scriptpath = "/home/jouven/youtube_projects"
if os.path.abspath(scriptpath) not in sys.path:
    sys.path.append(os.path.abspath(scriptpath))

# NOTE(review): these star imports presumably provide the helper functions and constants used
# below (e.g. `filtered_channels_index_id_mapping`, `get_k_nearest_neighbors`, `COMMON_PATH`);
# consider importing the needed names explicitly once their origin is confirmed.
from helpers.helpers_channels_more_10k import *
from helpers.helpers_channel_embedding import *
from helpers.config_threshold_value import *

### Useful functions

In [2]:
# Selected channels and id-index mapping.
# NOTE(review): judging by the names and by how they are used below, `dict_channel_ind` maps a
# channel id to its embedding index, `dict_ind_channel` is the reverse mapping, and `channels_id`
# is the collection of selected channel ids — confirm against the helper's definition.
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [3]:
# Location of the channel-embedding file (.npy) inspected in this notebook; it is passed to the
# nearest-neighbour lookups (with embedding_type 'word2vecf') and to the user-walk evaluation.
PATH = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_10k/channel_embedding_30M_diff_channels_7.npy'

In [4]:
# Load the channel metadata table (tab-separated, gzip-compressed).
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/df_channels_en.tsv.gz", sep='\t')
# Keep only the rows whose channel id is one of the selected channels.
# `Series.isin` performs the membership test in a vectorised way instead of evaluating a
# Python-level `in` check for every row.
channelcrawler = channelcrawler[channelcrawler['channel'].isin(channels_id)]

In [5]:
# Preview the first rows of the filtered channel metadata.
channelcrawler.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [6]:
# Distinct values of the `category_cc` column among the selected channels
# (the displayed result includes NaN, i.e. some channels have no category).
channelcrawler['category_cc'].unique()

array(['Gaming', 'Education', 'Entertainment', 'Howto & Style', 'Sports',
       'Music', 'Film and Animation', 'Comedy', 'Nonprofits & Activism',
       'People & Blogs', 'News & Politics', 'Science & Technology',
       'Pets & Animals', 'Autos & Vehicles', 'Travel & Events', nan],
      dtype=object)

### Manually check the nearest neighbors of some selected channels

First, to check how good the embedding space is, we choose a channel and look at its k closest channels in the embedding space. By looking at these channels on the YouTube website, we get a sense of how good or bad the embedding is.

In [7]:
# Nearest neighbors of `Shaaanxo`, a girls' fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the neighbor ids (inner join on the shared `channel` column,
# which is what the default `merge` resolves to here).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    how='inner',
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,Shaaanxo,3215340,1422,3566.0,2.278
1,UCUt0ZA6l_EidUnBFMR9BZig,Howto & Style,2009-11-28,SMLx0,1000000,645,16293.0,2.862
2,UCeZn7qRN3JKVcEYK_dDg7JQ,Howto & Style,2010-11-19,leighannsays,816000,756,19205.0,2.865
3,UCXbQzhqSvgVZTUyi1T4AU3w,Howto & Style,2011-08-31,Lauren Curtis,3530000,379,3067.0,2.2335
4,UCydYMBWbpGgt0RIfNVXGxCA,Howto & Style,2011-04-03,Andréa Matillano,304000,1074,58530.0,3.59
5,UCPRlGA2w7C_DVw-1ynolJYw,People & Blogs,2011-07-19,Shaaanxo Vlogs,555112,742,29412.0,3.015
6,UCvxXnjcMUY46qEJ6I-1dThw,Howto & Style,2008-12-31,ThatGirlShaeXo,397000,667,41405.0,3.3705
7,UC3JxYPRHnB5TvaCFuKfcdJQ,Howto & Style,2012-03-02,Chrisspy,1660000,258,8889.0,2.565
8,UC1BB4CsKykhht_25SQUNLjA,People & Blogs,2013-06-07,CaseyHolmesVlogs91,640000,183,27439.0,2.976
9,UC5CLbqqdnKUzYNyS_kMbnIg,Howto & Style,2012-08-17,Young Wild and Polis...,158000,630,110411.0,5.0615


In [8]:
# Nearest neighbors of `Alex Costa`, a men's fashion channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCZyCposXwcyopaACep44maQ'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the neighbor ids (inner join on the shared `channel` column,
# which is what the default `merge` resolves to here).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    how='inner',
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCZyCposXwcyopaACep44maQ,Howto & Style,2011-09-29,Alex Costa,2120000,543,5370.0,2.417
1,UCbq8_4_mFAx_rzDF5VT7MJw,Howto & Style,2013-11-11,BluMaan,1530000,381,9009.0,2.552
2,UCcGYJndqreqXMUZoOhn3E7A,Science & Technology,2012-04-10,JimsReviewRoom,829000,588,18808.0,2.9105
3,UCK-H1e0S8jg-8qoqQ5N8jvw,Science & Technology,2012-07-26,Explore Gadgets,549000,779,30007.0,3.044
4,UCk1SpWNzOs4MYmr0uICEntg,Science & Technology,2009-12-25,xdadevelopers,740000,1953,22813.0,2.961
5,UCdp6GUwjKscp5ST4M4WgIpw,Science & Technology,2011-09-11,TechWiser,323200,236,51388.0,3.4635
6,UCBKH2wfmHxht7YyA2dVaWew,Science & Technology,2011-11-09,Gadgets Portal,309006,708,58829.0,3.5825
7,UCzLaQ6eeTVuAltzTrN7fzyg,Science & Technology,2012-02-16,The YouTube Tech Guy...,314000,3205,56986.0,3.532
8,UCruQUP9qeou8KGv4G4HWCMQ,Science & Technology,2012-08-17,Intellect Digest,376843,2521,48535.0,3.406
9,UCsNGtpqGsyw0U6qEG-WHadA,Film and Animation,2010-06-06,DJI,859000,868,18358.0,2.9345


In [9]:
# Nearest neighbors of `Magpiepony`, a kids' film channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the neighbor ids (inner join on the shared `channel` column,
# which is what the default `merge` resolves to here).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    how='inner',
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,Magpiepony,736000,304,21120.0,2.9225
1,UCtoBoBNd-LfTGWICZdeodUA,People & Blogs,2015-05-23,FluttershyPlay's,80300,68,199547.0,5.338
2,UC7WDDV2_V5QLIk0su7hmmbg,Entertainment,2013-09-02,MusicGirl Pro,194000,204,84619.0,4.366
3,UCiDCjpVAi-TV0nGtjF_rx_Q,Entertainment,2015-02-25,Little Kelly & Frien...,183624,496,98458.0,3.6775
4,UCCFwisNDDTZLMk9Anv--8Gw,People & Blogs,2016-03-25,Juliya,170000,25,99830.0,3.731
5,UCoWsJwtoDzBfFI-egTXG6Dg,Entertainment,2014-11-15,Misty Brick Toys,295000,376,61289.0,3.7875
6,UCiM9mWNxTWpKY-V9Eb9TfkA,Film and Animation,2014-10-10,The Beginners Bible,160917,243,99787.0,3.717
7,UCMzfiTDOiuax4C7H1Uu88mw,Gaming,2012-02-17,eric wartick,168734,103,102730.0,3.9895
8,UCejH9OVMqJmzerevPFqPd2w,Entertainment,2016-12-27,Big WOU Channel,306000,179,51566.0,3.4685
9,UCVcQH8A634mauPrGbWs7QlQ,Education,2010-07-14,Jack Hartmann Kids M...,665000,878,20512.0,2.92


In [10]:
# Nearest neighbors of `Beyond the trailer`, a film-analysis channel.
nearest_neighbors_id = get_k_nearest_neighbors(
    PATH,
    dict_channel_ind['UCAXR2zenCwvRIyQd9ydtfaA'],
    dict_ind_channel,
    k=20,
    embedding_type='word2vecf',
)
# Attach the channel metadata to the neighbor ids (inner join on the shared `channel` column,
# which is what the default `merge` resolves to here).
nearest_neighbors = pd.merge(
    pd.DataFrame(nearest_neighbors_id, columns=['channel']),
    channelcrawler,
    how='inner',
    on='channel',
)
nearest_neighbors

Unnamed: 0,channel,category_cc,join_date,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,UCAXR2zenCwvRIyQd9ydtfaA,Entertainment,2008-11-03,Beyond The Trailer,853000,5064,19744.0,2.8635
1,UCtoMyXF4VFY3cB8fUcn7N4A,Entertainment,2006-09-19,AMC Theatres,396000,9046,44969.0,3.34
2,UCPOrN8u0yH-gIEb7SkMwkTw,Entertainment,2012-04-06,Comic Book Girl 19,525000,404,35166.0,3.2045
3,UCinjnmQEwCddOudyCC1v7qA,News & Politics,2006-06-13,KTLA 5,106000,4650,117556.0,4.595
4,UCkqg9F6VUHWDZB4rR--NM7w,Gaming,2006-04-10,DBZanto Z,695000,1852,25053.0,2.963
5,UC7sDT8jZ76VLV1u__krUutA,Film and Animation,2016-07-29,DUST,1290000,480,9119.0,2.5585
6,UC_Oa7Ph3v94om5OyxY1nPKg,Howto & Style,2009-06-03,Paul Davids,1319463,269,9978.0,2.6355
7,UCl_Ydej82Zq4NEZnwxeaUwg,Music,2006-04-28,Stompdown,189000,616,92383.0,3.914
8,UC1nw_szfrEsDWcwD32wHE_w,Entertainment,2006-03-05,q on cbc,164000,2596,96946.0,3.426
9,UCveZqqGewoyPiacooywP5Ig,Education,2012-05-08,Alt Shift X,1130000,113,13313.0,2.7875


### User jumper implementation

In [4]:
# Load the channel pairs used for the user-jumper evaluation: 3 000 users were chosen at random
# and, for each user, a random pair of channels was selected.
# channels_tuple = [(channel_1, channel_2), (...)]
# NOTE(review): `pickle.load` can execute arbitrary code; only load this file from a trusted location.
with open(os.path.join(COMMON_PATH, "channels_tuple_user_walk.pkl"), 'rb') as f:
    # The `with` block closes the file on exit, so no explicit `f.close()` is needed afterwards.
    channels_tuple = pickle.load(f)

In [None]:
# Evaluate the embedding stored at PATH on the sampled channel pairs.
# The helper is given a list of embedding files, so the single path is wrapped in a list.
files = [PATH]
# NOTE(review): the three results are presumably per-file lists holding the user-walk score, an
# alternative ("new") variant of it, and the ranking-position ratio — confirm against the helper.
users_walk_tab, users_walk_tab_new, ranking_position_tab = get_user_walk_and_position_ratio(files, channels_tuple)

In [16]:
# Report the evaluation results computed in the previous cell.
print(f'User walk = {users_walk_tab!s}')
print(f'Ranking position ratio = {ranking_position_tab!s}')

[0.4285789192126741]