This notebook checks the behaviour of the embedding space

In [1]:
import pickle
import random
import glob
import os
import sys
import scipy.sparse
import re
#import torch

import numpy as np
import pandas as pd

from annoy import AnnoyIndex
from scipy.spatial import distance

scriptpath = "../../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

In [2]:
def create_dataframe_in_embedding_space(model_path):
    '''
    Load a model checkpoint and return its embedding matrix as a DataFrame.
    graph_matrix: expected SHAPE: (channels, n_comp)

    PARAMETER:
        - model_path: path of a checkpoint written with torch.save; it must
                      hold a dict whose 'embedding' entry is a tensor

    RETURN:
        - df: DataFrame representing the graph in the embedding space, one row
              per embedding row and one column per dimension
              (columns named 'dr0', 'dr1', ...)
    '''
    # Imported locally because the notebook-level `import torch` is commented out.
    import torch

    # map_location='cpu' lets a checkpoint saved on a GPU be read on a CPU-only machine.
    checkpoint = torch.load(model_path, map_location='cpu')
    graph_matrix = checkpoint['embedding'].cpu().detach().numpy()
    df = pd.DataFrame(graph_matrix)
    df = df.rename(lambda x: 'dr'+str(x), axis='columns')
    return df

def get_dataframe_in_embedding_space(model_path):
    '''
    Read an embedding table that was exported as a gzip-compressed CSV.

    PARAMETER:
        - model_path: path of the gzip-compressed CSV file

    RETURN:
        - the DataFrame parsed from that file
    '''
    embedding_table = pd.read_csv(model_path, compression='gzip')
    return embedding_table

In [3]:
# Checkpoint whose 'embedding' entry is converted to a DataFrame.
model_path = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/model1.pth'
# One row per embedding row, columns named dr0, dr1, ...
df = create_dataframe_in_embedding_space(model_path)

In [4]:
# Sanity check: (number of rows, number of embedding dimensions).
df.shape

(129616, 200)

In [5]:
# Export the embedding as a gzip-compressed CSV (no index column), the format
# that get_dataframe_in_embedding_space reads back.
df.to_csv('/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/embedding1.csv.gz', compression='gzip', index = False)


In [3]:
# Selected channels and id-index mapping, as used in the cells below:
#   - dict_channel_ind: looked up with a channel id to get the index passed to the Annoy queries
#   - dict_ind_channel: looked up with such an index to get the channel id back
#   - channels_id: collection of the selected channel ids (used for membership tests and len())
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

### Find k closest channel using annoy library

To check how good the embedding space is, we first choose a channel and retrieve its k closest channels in the embedding space. If the embedding space is good, looking these channels up on the YouTube website should show channels similar to the reference one.

In [19]:
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")
# The channel id is the last path component of the channel URL.
channelcrawler['channel_id'] = channelcrawler['link'].str.split('/').str[-1]
# Keep only the selected channels. A vectorised isin() against a set replaces
# the per-row Python `in` test, which is a linear scan per row when
# channels_id is a list (assumes channels_id is an iterable of channel ids).
channelcrawler = channelcrawler[channelcrawler['channel_id'].isin(set(channels_id))]

In [20]:
# Distinct category labels among the selected channels.
channelcrawler['category'].unique()

array(['Film and Animation', 'Music', 'Comedy', 'Gaming',
       'Science & Technology', 'Sports', 'Entertainment', 'Education',
       'Nonprofits & Activism', 'People & Blogs', 'Howto & Style',
       'News & Politics', 'Travel & Events', 'Autos & Vehicles',
       'Pets & Animals', nan], dtype=object)

In [4]:
def get_annoy_index(df, metric="euclidean", n_trees=100):
    '''
    Build an Annoy index over the rows of an embedding DataFrame.
    Each row is added under its DataFrame index label, so with the default
    RangeIndex the Annoy item id equals the row position.

    PARAMETERS:
        - df: DataFrame representing the graph in the embedding space
              (one row per item, one column per dimension)
        - metric: distance metric name passed to AnnoyIndex
        - n_trees: number of trees passed to AnnoyIndex.build

    RETURN: The annoy index
    '''
    index = AnnoyIndex(df.shape[1], metric)  # Length of item vector that will be indexed
    # Plain loop instead of df.apply: add_item is called only for its side
    # effect, so there is no result worth collecting.
    for item_id, vector in zip(df.index, df.to_numpy()):
        index.add_item(item_id, vector)
    index.build(n_trees)
    return index

In [5]:
def get_k_nearest_neighbors(path, ref_index_channel, k = 20, index = None):
    '''
    Return the channel ids of the k nearest neighbors of a reference channel.

    PARAMETERS:
        - path: path of the gzip-compressed CSV embedding; only read when no
                prebuilt index is given
        - ref_index_channel: index of the reference channel in the embedding
        - k: number of neighbors requested from the annoy index
        - index: optional annoy index already built for this embedding.
                 Building one is expensive, so pass it when querying the same
                 embedding several times; when None it is built from `path`

    RETURN: list of channel ids, in the order returned by the annoy index
            (mapped through the notebook-level dict_ind_channel)
    '''
    if index is None:
        df = get_dataframe_in_embedding_space(path)
        index = get_annoy_index(df)
    nearest_neighbors_index = index.get_nns_by_item(ref_index_channel, k)
    nearest_neighbors_id = [dict_ind_channel[val] for val in nearest_neighbors_index]
    return nearest_neighbors_id

In [18]:
# Look up a channel's metadata (including its channel_id) by display name.
channelcrawler[channelcrawler['name'] == 'TEDx Talks']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
12422,Nonprofits & Activism,2009-06-23,http://www.youtube.com/channel/UCsT0YIqwnpJCM-...,TEDx Talks,20700000,141627,UCsT0YIqwnpJCM-mx7-gSA4Q


In [21]:
# Embedding to inspect, and the neighbours of the reference channel Shaaanxo (Howto & Style).
path = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/embedding1.csv.gz'
nearest_neighbors_id = get_k_nearest_neighbors(
    path,
    dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'],
    k=40,
)


In [22]:
# Attach the crawled metadata to the neighbour ids (inner join on channel_id,
# so ids without a channelcrawler row are dropped).
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,http://www.youtube.com/channel/UCMpOz2KEfkSdd5...,Shaaanxo,3215340,1422
1,UCbO9bltbkYwa56nZFQx6XJg,Howto & Style,2014-07-17,http://www.youtube.com/channel/UCbO9bltbkYwa56...,Manny Mua,4800000,426
2,UCzTKskwIc_-a0cGvCXA848Q,Howto & Style,2008-06-23,http://www.youtube.com/channel/UCzTKskwIc_-a0c...,NikkieTutorials,12269235,742
3,UCkvK_5omS-42Ovgah8KRKtg,Howto & Style,2006-02-14,http://www.youtube.com/channel/UCkvK_5omS-42Ov...,jeffreestar,16000000,357
4,UC21yq4sq8uxTcfgIxxyE9VQ,Howto & Style,2011-06-27,http://www.youtube.com/channel/UC21yq4sq8uxTcf...,Carli Bybel,6190000,546
5,UCGwPbAQdGA3_88WBuGtg9tw,Howto & Style,2010-12-03,http://www.youtube.com/channel/UCGwPbAQdGA3_88...,grav3yardgirl,8560000,1536
6,UCF2oW5-MO8dB6ul9WH9xi0A,People & Blogs,2007-01-04,http://www.youtube.com/channel/UCF2oW5-MO8dB6u...,blndsundoll4mj,4900000,2420
7,UCKMugoa0uHpjUuq14yOpagw,Howto & Style,2009-06-09,http://www.youtube.com/channel/UCKMugoa0uHpjUu...,Laura Lee,4400000,638
8,UCc6W7efUSkd9YYoxOnctlFg,Entertainment,2009-06-08,http://www.youtube.com/channel/UCc6W7efUSkd9YY...,Bethany Mota,10200000,483
9,UC8v4vz_n2rys6Yxpj8LuOBA,Howto & Style,2013-01-26,http://www.youtube.com/channel/UC8v4vz_n2rys6Y...,KathleenLights,4140000,979


In [23]:
# Reference channel: Alex Costa (Howto & Style).
nearest_neighbors_id = get_k_nearest_neighbors(
    path, dict_channel_ind['UCZyCposXwcyopaACep44maQ'], k=40
)
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCZyCposXwcyopaACep44maQ,Howto & Style,2011-09-29,http://www.youtube.com/channel/UCZyCposXwcyopa...,Alex Costa,2120000,543
1,UChNN7VBxPTiNrqjUaQd9bxA,Howto & Style,2012-09-14,http://www.youtube.com/channel/UChNN7VBxPTiNrq...,Teachingmensfashion,4210000,1160
2,UC1KbedtKa3d5dleFR6OjQMg,Howto & Style,2008-07-06,http://www.youtube.com/channel/UC1KbedtKa3d5dl...,alpha m.,5360000,1116
3,UCBdw4dLCLLHmTgAOnW4V0hQ,Entertainment,2005-11-24,http://www.youtube.com/channel/UCBdw4dLCLLHmTg...,The Rock,4470000,193
4,UCsTcErHg8oDvUnTzoqsYeNw,Science & Technology,2010-12-21,http://www.youtube.com/channel/UCsTcErHg8oDvUn...,Unbox Therapy,15300000,1672
5,UCj34AOIMl_k1fF7hcBkD_dw,Science & Technology,2009-01-27,http://www.youtube.com/channel/UCj34AOIMl_k1fF...,EverythingApplePro,7630000,1639
6,UC_hoQDD6zKcIqpIYLsFbBeA,People & Blogs,2013-09-22,http://www.youtube.com/channel/UC_hoQDD6zKcIqp...,Mo Vlogs,8280000,1500
7,UC7eHZXheF8nVOfwB2PEslMw,Comedy,2009-07-06,http://www.youtube.com/channel/UC7eHZXheF8nVOf...,ashish chanchlani vi...,14200000,130
8,UCyEd6QBSgat5kkC6svyjudA,Travel & Events,2009-02-02,http://www.youtube.com/channel/UCyEd6QBSgat5kk...,Mark Wiens,4900000,993
9,UCmlsu3V3SzIm2Jmo0S0qiMg,Entertainment,2009-09-21,http://www.youtube.com/channel/UCmlsu3V3SzIm2J...,TechRax,6650000,501


In [24]:
# Reference channel: Donald J Trump (News & Politics).
nearest_neighbors_id = get_k_nearest_neighbors(
    path, dict_channel_ind['UCAql2DyGU2un1Ei2nMYsqOA'], k=20
)
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCAql2DyGU2un1Ei2nMYsqOA,News & Politics,2015-03-17,http://www.youtube.com/channel/UCAql2DyGU2un1E...,Donald J Trump,189000,222
1,UCupvZG-5ko_eiXAupbDfxWw,News & Politics,2005-10-02,http://www.youtube.com/channel/UCupvZG-5ko_eiX...,CNN,7500000,146299
2,UC8p1vwvWtl6T73JiExfWs1g,News & Politics,2006-09-27,http://www.youtube.com/channel/UC8p1vwvWtl6T73...,CBS News,1870000,71889
3,UCKZGcrxRAhdUi58Mdr565mw,News & Politics,2012-07-27,http://www.youtube.com/channel/UCKZGcrxRAhdUi5...,African Diaspora New...,1100000,3576
4,UCXIJgqnII2ZOINSWNOGFThA,News & Politics,2006-09-19,http://www.youtube.com/channel/UCXIJgqnII2ZOIN...,Fox News,3520000,60195
5,UC1yBKRuGpC1tSM73A0ZjYjQ,News & Politics,2005-12-21,http://www.youtube.com/channel/UC1yBKRuGpC1tSM...,The Young Turks,4470000,37847
6,UCeY0bbntWzzVIaj2z3QigXg,News & Politics,2006-07-19,http://www.youtube.com/channel/UCeY0bbntWzzVIa...,NBC News,1470000,19165
7,UCuFFtHWoLl5fauMMD5Ww2jA,News & Politics,2014-07-25,http://www.youtube.com/channel/UCuFFtHWoLl5fau...,CBC News,1460000,14228
8,UCP6HGa63sBC7-KHtkme-p-g,News & Politics,2006-03-20,http://www.youtube.com/channel/UCP6HGa63sBC7-K...,USA TODAY,1100000,24033
9,UCBi2mrWuNuyYy4gbM6fU18Q,News & Politics,2006-08-07,http://www.youtube.com/channel/UCBi2mrWuNuyYy4...,ABC News,6640000,48869


In [25]:
# Reference channel: A Blue Dot In Texas (News & Politics).
nearest_neighbors_id = get_k_nearest_neighbors(
    path, dict_channel_ind['UC9pXxdNqCc2zjgRXSoowNNg'], k=20
)
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UC9pXxdNqCc2zjgRXSoowNNg,News & Politics,2014-07-09,http://www.youtube.com/channel/UC9pXxdNqCc2zjg...,A Blue Dot In Texas,10600,646
1,UCYY-xsfMg0TKz-gwQOMhcWA,News & Politics,2014-04-15,http://www.youtube.com/channel/UCYY-xsfMg0TKz-...,Jericho Green,158000,446
2,UCOBDYfMV8ckXEUA5F_kixNQ,People & Blogs,2016-07-18,http://www.youtube.com/channel/UCOBDYfMV8ckXEU...,Jon Snow - The Viral...,174000,603
3,UCKw8kdkYfmuNSVehGoDw8Mg,News & Politics,2009-07-21,http://www.youtube.com/channel/UCKw8kdkYfmuNSV...,TYT's The Conversati...,125000,1433
4,UCIAXaU6LIcZyoHYWqVLM4wg,News & Politics,2008-12-23,http://www.youtube.com/channel/UCIAXaU6LIcZyoH...,TYT Nation,125000,6826
5,UCwJB9Z1L0aPIumFCGBTO_yg,News & Politics,2015-06-28,http://www.youtube.com/channel/UCwJB9Z1L0aPIum...,fakeengineer,30500,96
6,UCkr9qZ5tj9dGd7JvE3a8X4Q,News & Politics,2007-05-03,http://www.youtube.com/channel/UCkr9qZ5tj9dGd7...,WLWT,74900,41027
7,UCpimAnNpH_JQxDa0YMTwdnw,Entertainment,2015-11-16,http://www.youtube.com/channel/UCpimAnNpH_JQxD...,HighImpactVlogs,191000,664
8,UCVSSpcmZD2PwPBqb8yKQKBA,Sports,2008-11-06,http://www.youtube.com/channel/UCVSSpcmZD2PwPB...,NBA on ESPN,530000,3642
9,UCbjBOso0vpWgDht9dPIVwhQ,News & Politics,2006-09-26,http://www.youtube.com/channel/UCbjBOso0vpWgDh...,Thom Hartmann Progra...,178000,11791


In [26]:
# Reference channel: TED (People & Blogs).
nearest_neighbors_id = get_k_nearest_neighbors(
    path, dict_channel_ind['UCAuUUnT6oDeKwE6v1NGQxug'], k=20
)
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCAuUUnT6oDeKwE6v1NGQxug,People & Blogs,2006-12-06,http://www.youtube.com/channel/UCAuUUnT6oDeKwE...,TED,14800000,3105
1,UCLXo7UDZvByw2ixzpQCufnA,News & Politics,2014-03-04,http://www.youtube.com/channel/UCLXo7UDZvByw2i...,Vox,6570000,1048
2,UC1yBKRuGpC1tSM73A0ZjYjQ,News & Politics,2005-12-21,http://www.youtube.com/channel/UC1yBKRuGpC1tSM...,The Young Turks,4470000,37847
3,UCsT0YIqwnpJCM-mx7-gSA4Q,Nonprofits & Activism,2009-06-23,http://www.youtube.com/channel/UCsT0YIqwnpJCM-...,TEDx Talks,20700000,141627
4,UCn8zNIfYAQNdrFRrr8oibKw,Entertainment,2005-12-16,http://www.youtube.com/channel/UCn8zNIfYAQNdrF...,VICE,11700000,2755
5,UCUsN5ZwHx2kILm84-jPDeXw,Comedy,2006-06-14,http://www.youtube.com/channel/UCUsN5ZwHx2kILm...,Comedy Central,8540000,4818
6,UC3XTzVzaHQEd30rQbuvCtTQ,Entertainment,2014-03-18,http://www.youtube.com/channel/UC3XTzVzaHQEd30...,LastWeekTonight,7240000,286
7,UCZaT_X_mc0BI-djXOlfhqWQ,News & Politics,2013-11-20,http://www.youtube.com/channel/UCZaT_X_mc0BI-d...,VICE News,4630000,4219
8,UCpVm7bg6pXKo1Pr6k5kxG9A,Entertainment,2006-05-07,http://www.youtube.com/channel/UCpVm7bg6pXKo1P...,National Geographic,12300000,9260
9,UCuFFtHWoLl5fauMMD5Ww2jA,News & Politics,2014-07-25,http://www.youtube.com/channel/UCuFFtHWoLl5fau...,CBC News,1460000,14228


In [27]:
# Reference channel: James Bond 007 (Film and Animation).
nearest_neighbors_id = get_k_nearest_neighbors(
    path, dict_channel_ind['UCwTkM6CvIsYFaFiMKIKCqHw'], k=20
)
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCwTkM6CvIsYFaFiMKIKCqHw,Film and Animation,2005-11-27,http://www.youtube.com/channel/UCwTkM6CvIsYFaF...,James Bond 007,115000,407
1,UC7ezYtIOQSq7_Pk7d5OVJig,Gaming,2009-11-26,http://www.youtube.com/channel/UC7ezYtIOQSq7_P...,DefendTheHouse,540000,538
2,UCDAxJFE1o6uD0YIQXt1jQ6g,Film and Animation,2012-08-13,http://www.youtube.com/channel/UCDAxJFE1o6uD0Y...,Stream Movie Trailer...,253000,626
3,UCIMzhx509wEXMuGkTK-kD9Q,Autos & Vehicles,2006-03-11,http://www.youtube.com/channel/UCIMzhx509wEXMu...,Autocar,738000,1235
4,UCzOEJREOXANNXcFf2OP8hZg,Comedy,2006-10-25,http://www.youtube.com/channel/UCzOEJREOXANNXc...,Kassem G,2447833,482
5,UCNL1ZadSjHpjm4q9j2sVtOA,People & Blogs,2008-05-15,http://www.youtube.com/channel/UCNL1ZadSjHpjm4...,Lady Gaga,14495273,163
6,UCbse--Bp-S6jcfho3pmhzww,Comedy,2006-01-08,http://www.youtube.com/channel/UCbse--Bp-S6jcf...,Scott Gairdner,72868,48
7,UCWTD3EHR2SWV4D8WplDWFVg,Entertainment,2012-09-16,http://www.youtube.com/channel/UCWTD3EHR2SWV4D...,LindseyTime,498000,221
8,UCyoSWGYKkusssZWzRrsX4RA,Autos & Vehicles,2008-10-31,http://www.youtube.com/channel/UCyoSWGYKkusssZ...,SOL - Supercars of L...,790000,1753
9,UCXfCy8pz2hKnGt8_IJphd1Q,Music,2005-11-18,http://www.youtube.com/channel/UCXfCy8pz2hKnGt...,PianoKeyz,462000,170


In [28]:
# Reference channel: Magpiepony (Film and Animation).
nearest_neighbors_id = get_k_nearest_neighbors(
    path, dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'], k=20
)
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,http://www.youtube.com/channel/UCISF5OGuAtSLNF...,Magpiepony,736000,304
1,UC1AEadAUKi6Zt-G3PatrU-Q,Film and Animation,2012-08-19,http://www.youtube.com/channel/UC1AEadAUKi6Zt-...,Letupita725HD★,2698245,2115
2,UCfR-4yZACIMRs3wNf0hel6g,Gaming,2013-06-18,http://www.youtube.com/channel/UCfR-4yZACIMRs3...,Bijuu Mike,2120000,1808
3,UCeBnbqt4VRhotq2TQjkIi2A,Entertainment,2007-02-26,http://www.youtube.com/channel/UCeBnbqt4VRhotq...,LaurenzSide,3410000,1397
4,UCvD0DBYf-gY2dWarc4NEL9Q,Film and Animation,2013-08-27,http://www.youtube.com/channel/UCvD0DBYf-gY2dW...,MC Songs by MC Jams,1710000,328
5,UCK1HgDhsRulLj-pWi8XrqIA,Gaming,2012-07-23,http://www.youtube.com/channel/UCK1HgDhsRulLj-...,Razzbowski,1160000,2101
6,UC_nEHeUEVNY5ZYLRWg8KoZQ,Film and Animation,2016-11-06,http://www.youtube.com/channel/UC_nEHeUEVNY5ZY...,Wolfychu,2480000,101
7,UCzYfz8uibvnB7Yc1LjePi4g,Gaming,2012-08-13,http://www.youtube.com/channel/UCzYfz8uibvnB7Y...,Aphmau,4840000,2991
8,UCZBY6V8Lxmwu8gGRBOyO11w,Gaming,2014-06-16,http://www.youtube.com/channel/UCZBY6V8Lxmwu8g...,Kubz Scouts,3160000,1440
9,UC1EBJfK7ltjYUFyzysKxr1g,Gaming,2014-04-05,http://www.youtube.com/channel/UC1EBJfK7ltjYUF...,Yandere Dev,2590000,159


In [29]:
# Reference channel: NASA Video (Science & Technology).
nearest_neighbors_id = get_k_nearest_neighbors(
    path, dict_channel_ind['UC_aP7p621ATY_yAa8jMqUVA'], k=20
)
nearest_neighbors = pd.DataFrame({'channel_id': nearest_neighbors_id}).merge(
    channelcrawler, on='channel_id'
)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UC_aP7p621ATY_yAa8jMqUVA,Science & Technology,2013-04-05,http://www.youtube.com/channel/UC_aP7p621ATY_y...,NASA Video,391000,8232
1,UC686IJyF40geirhYyJEMfIQ,Autos & Vehicles,2012-01-18,http://www.youtube.com/channel/UC686IJyF40geir...,SmurfinWRX,351000,1027
2,UCx3s3t5kpD4VMfJjDi5keXw,Science & Technology,2011-12-25,http://www.youtube.com/channel/UCx3s3t5kpD4VMf...,CoolHardLogic,106000,51
3,UCIMzhx509wEXMuGkTK-kD9Q,Autos & Vehicles,2006-03-11,http://www.youtube.com/channel/UCIMzhx509wEXMu...,Autocar,738000,1235
4,UCuneWYkXOatJ4JH5QBXccBQ,Gaming,2009-11-27,http://www.youtube.com/channel/UCuneWYkXOatJ4J...,MotoGamesTV,203628,7077
5,UC86SBFIAgnYL3ll2ZDgmsuA,Autos & Vehicles,2012-04-18,http://www.youtube.com/channel/UC86SBFIAgnYL3l...,TheStraightPipes,654192,314
6,UCVS6ejD9NLZvjsvhcbiDzjw,Science & Technology,2015-01-15,http://www.youtube.com/channel/UCVS6ejD9NLZvjs...,Crosstalk Solutions,127000,483
7,UCtwmze3hCYoI87uuYkcPFPA,Gaming,2012-11-26,http://www.youtube.com/channel/UCtwmze3hCYoI87...,WeaselZone,387000,2718
8,UCwTkM6CvIsYFaFiMKIKCqHw,Film and Animation,2005-11-27,http://www.youtube.com/channel/UCwTkM6CvIsYFaF...,James Bond 007,115000,407
9,UCvj5S3f10rO6CoibatfRGzg,Music,2008-09-17,http://www.youtube.com/channel/UCvj5S3f10rO6Co...,Jonathan Mann,40000,3753


### Channels selected over the whole comments dataset
We randomly choose 10 000 users from the dataset.
For each user, we then pick two channels at random from the set of channels this user commented on.

In [6]:
# Pairs of channel indices, iterated below as (ref_channel, second_channel).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block already closes the file, so no explicit f.close() is needed.
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_user_walk.pkl",'rb') as f:
    channels_tuple = pickle.load(f)

#### Random walk

In [7]:
def _sum_pair_distances(df_embedding, pairs_path):
    '''Sum the Euclidean distances between the channel pairs pickled at pairs_path.'''
    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(pairs_path, 'rb') as f:
        channel_pairs = pickle.load(f)
    # Each pair holds two row positions of df_embedding.
    first = np.asarray([pair[0] for pair in channel_pairs], dtype=np.intp)
    second = np.asarray([pair[1] for pair in channel_pairs], dtype=np.intp)
    embedding = df_embedding.to_numpy()
    # Row-wise norm of the differences: one vectorised pass instead of one
    # scipy call (and two .iloc lookups) per pair.
    return np.linalg.norm(embedding[first] - embedding[second], axis=1).sum()


def get_random_walk(df_embedding):
    '''
    Total Euclidean distance over the stored random-walk channel pairs.

    PARAMETER:
        - df_embedding: DataFrame representing the embedding space

    RETURN: sum of the distances between the two channels of every pair in
            channels_tuple_random_walk.pkl
    '''
    return _sum_pair_distances(
        df_embedding,
        "/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_random_walk.pkl")


def get_random_walk_new(df_embedding):
    '''
    Same as get_random_walk, computed on the pairs stored in
    channels_tuple_random_walk_modified.pkl.

    PARAMETER:
        - df_embedding: DataFrame representing the embedding space

    RETURN: sum of the distances between the two channels of every pair
    '''
    return _sum_pair_distances(
        df_embedding,
        "/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_random_walk_modified.pkl")

#### Compute metrics: users walk distance and relative nearest neighbor ranking 

In this section we measure the Euclidean distance of a user walk compared to that of a random walk.

user walk: Euclidean distance in the embedding space between two randomly chosen channels of the same user.
random walk: Euclidean distance in the embedding space between two randomly chosen channels.
position: Rank of one channel of user u in the nearest-neighbor ranking of another channel of the same user.


In [8]:
def get_ranking_position_between_channels(ref_channel, second_channel, index, df_embedding, search_k = 100000000):
    '''
    Get the position of second_channel in the nearest neighbors ranking of ref_channel.

    PARAMETER:
        - ref_channel: The reference channel whose nearest neighbors are computed
        - second_channel: The channel whose rank relative to ref_channel is wanted
        - index: annoy index
        - df_embedding: DataFrame representing the embedding space; its length
                        is the number of neighbors requested from the index
        - search_k: search_k value forwarded to index.get_nns_by_item

    RETURN: The 0-based position of second_channel in the neighbor list of
            ref_channel, or None when second_channel is not in that list
    '''
    nearest_neighbors_index = list(
        index.get_nns_by_item(ref_channel, len(df_embedding), search_k = search_k))
    try:
        return nearest_neighbors_index.index(second_channel)
    except ValueError:
        # Same outcome as falling off the end of the original search loop.
        return None
    

In [9]:
# One entry per evaluated embedding file, in the order of `files`.
users_walk_tab = []        # users walk distance / random walk distance
users_walk_tab_new = []    # users walk distance / "modified" random walk distance
ranking_position_tab = []  # summed ranking positions, normalised (see end of loop)

len_random_set = len(channels_tuple)  # number of user channel pairs
len_embedding = len(channels_id)      # number of selected channels

#path = '/dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/'
#files = glob.glob(path + '*.npz') 
model_1 = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/embedding1.csv.gz'

files = [model_1]
for file in files: 
    print('file ', file)
    df_embedding = get_dataframe_in_embedding_space(file)
    n_comp = df_embedding.shape[1]
    print('n_comp ', n_comp)
    # Baselines: total distance over the two stored sets of random channel pairs.
    random_walk_distance = get_random_walk(df_embedding)
    random_walk_distance_new = get_random_walk_new(df_embedding)
    index = get_annoy_index(df_embedding)
    users_walk = 0
    ranking_position = 0

    # Accumulate, over every user pair, the distance between the two channels
    # and the rank of the second channel among the neighbors of the first.
    for ref_channel, second_channel in channels_tuple:
        users_walk += distance.euclidean(df_embedding.iloc[ref_channel], df_embedding.iloc[second_channel])
        ranking_position += get_ranking_position_between_channels(ref_channel, second_channel, index, df_embedding)
    
    users_walk_tab.append(users_walk/random_walk_distance)
    users_walk_tab_new.append(users_walk/random_walk_distance_new)
    # Dividing by (number of pairs * number of channels) gives the mean rank
    # as a fraction of the number of selected channels.
    ranking_position_tab.append(ranking_position/(len_random_set*len_embedding))


file  /dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/embedding1.csv.gz
n_comp  200


In [10]:
# Users walk distance divided by the random walk distance, one value per embedding file.
users_walk_tab

[0.426133025248665]

In [11]:
# Summed ranking positions divided by (number of pairs * number of channels), one value per embedding file.
ranking_position_tab

[0.08308940511459491]

In [12]:
# Users walk distance divided by the "modified" random walk distance, one value per embedding file.
users_walk_tab_new

[1.7966429514655458]

In [None]:
# Plot the results obtained when computing the metrics: one marker per
# evaluated embedding, ranking position (x) against users walk distance (y).
import plotly.graph_objects as go

# Labels for the four-embedding experiment, in the same order as the evaluated files.
components_name = ['Features dim 50', 'Features dim 100', 'Features dim 200', 'Features dim 500']
# Plotly expects CSS colour names (or hex/rgb strings), not matplotlib's
# one-letter codes; the original list also contained the typo 'g,'.
colors = ['blue', 'green', 'red', 'cyan']

n_results = len(ranking_position_tab)
if len(components_name) != n_results:
    # The preset labels would be attached to the wrong embeddings; fall back
    # to neutral labels, one per computed result.
    components_name = ['Embedding ' + str(i + 1) for i in range(n_results)]

fig = go.Figure()

# One trace per embedding, each holding that embedding's single (x, y) point.
# Previously every trace plotted the whole result lists under a different label.
for i in range(n_results):
    fig.add_trace(go.Scatter(
        x=[ranking_position_tab[i]],
        y=[users_walk_tab[i]],
        marker=dict(color=colors[i % len(colors)], size=12),
        mode="markers",
        name=components_name[i],
    ))

fig.update_layout(title="Users walk distance vs. ranking position",
                  xaxis_title="Ranking position",
                  yaxis_title="Users walk distance")

fig.show()