This notebook checks the behaviour of the embedding space

In [1]:
import re
import os
import sys
import pickle
import random
import glob
import scipy.sparse

import numpy as np
import pandas as pd

from annoy import AnnoyIndex
from scipy.spatial import distance

scriptpath = "../../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

In [12]:
haha = pd.DataFrame([(1, 2), (4, 5), (6, 7), (8, 9)])

In [21]:
sel = set([1, 2, 4])

In [28]:
(haha[0].isin(sel)) & (haha[1].isin(sel))

0     True
1    False
2    False
3    False
dtype: bool

In [29]:
haha.loc[(haha[0].isin(sel)) & (haha[1].isin(sel))]

Unnamed: 0,0,1
0,1,2


In [2]:
'''
Retrieve the array obtained by apllying the dimentionality reduction algorithm
graph_matrix: SHAPE: (channels, n_comp)

PARAMETER:
    - file_path: the path where the embedding graph is stored

RETURN: 
    - df: DataFrame representing the graph in the embedding space
'''
def get_dataframe_in_embedding_space(file_path):
    graph_matrix = np.load(file_path)
    df = pd.DataFrame(graph_matrix)
    df = df.rename(lambda x: 'dr'+str(x), axis='columns')
    df['index'] = np.load('/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_10/channel_embeddingindex_ordering.npy').astype(int)
    df = df.sort_values(by=['index'])
    return df.set_index('index')


In [3]:
# Selected channels and id-index mapping
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

### Find k closest channels using annoy library

First to check how good is the embedding space, we are going to choose a channel and it's k closest channels in the embedding space. By looking at these channels in the YouTube website, we should get similar channels if the embedding space is good.

In [4]:
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")
channelcrawler['channel_id'] = channelcrawler['link'].str.split('/').str[-1]
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [5]:
channelcrawler = channelcrawler[channelcrawler['channel_id'].apply(lambda row: row in channels_id)]

In [6]:
channelcrawler['category'].unique()

array(['Film and Animation', 'Music', 'Comedy', 'Gaming',
       'Science & Technology', 'Sports', 'Entertainment', 'Education',
       'Nonprofits & Activism', 'People & Blogs', 'Howto & Style',
       'News & Politics', 'Travel & Events', 'Autos & Vehicles',
       'Pets & Animals', nan], dtype=object)

In [7]:
channelcrawler[channelcrawler['category'] == 'Film and Animation'][50:100]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
1562,Film and Animation,2015-11-24,http://www.youtube.com/channel/UC1cDYEgIGtuHdG...,White Shadow,37000,124,UC1cDYEgIGtuHdGi4SXWi1kA
1619,Film and Animation,2016-11-05,http://www.youtube.com/channel/UC2Yt60q-3i1til...,UnraveledPictures TM...,12700,59,UC2Yt60q-3i1til4CWpEbdWg
1680,Film and Animation,2009-10-16,http://www.youtube.com/channel/UCxP5OANXyoTlm0...,ElectricDragon505,99700,1017,UCxP5OANXyoTlm0B2W0lvZaQ
1774,Film and Animation,2016-05-22,http://www.youtube.com/channel/UCfQkASkyBNh2qs...,KaRon Smith,38400,1321,UCfQkASkyBNh2qs_vGpfQAPg
1782,Film and Animation,2014-11-08,http://www.youtube.com/channel/UCS85PXHRkizrbG...,Karsten Runquist,211000,117,UCS85PXHRkizrbGHBCe4tV3g
1797,Film and Animation,2016-09-08,http://www.youtube.com/channel/UCe5TCe6L-uYM10...,Pro GK Academy,55300,94,UCe5TCe6L-uYM106WzSIKuYQ
1845,Film and Animation,2011-08-23,http://www.youtube.com/channel/UCbmykBGr4OQTTr...,"MAV Films, LLC",12100,22,UCbmykBGr4OQTTr0V6c_bjOQ
1893,Film and Animation,2012-03-04,http://www.youtube.com/channel/UCTFdqzW7yNZytw...,dannphan29,16932,334,UCTFdqzW7yNZytwQdymlcrrA
1957,Film and Animation,2009-06-17,http://www.youtube.com/channel/UCUTIunWnyDoyXY...,Foxtrod,21300,147,UCUTIunWnyDoyXYJE7UFosZg
2057,Film and Animation,2015-12-26,http://www.youtube.com/channel/UCHADgMhNFbQIa1...,Marcia,26700,37,UCHADgMhNFbQIa1muBHKA4dg


In [8]:
'''
    Retrieve the array obtained by apllying the dimentinality reductin algorithm
    graph_matrix: SHAPE: (channels, n_comp)
    
    PARAMETERS:
        - df_embedding: DataFrame representing the graph in the embedding space
        
    RETURN: The annoy index
    '''
def get_annoy_index(df):
    index = AnnoyIndex(df.shape[1], "euclidean")  # Length of item vector that will be indexed
    df.apply(lambda row: index.add_item(row.name, np.array(row)), axis = 1)
    index.build(100) # 100 trees
    return index

In [9]:
def get_k_nearest_neighbors(path, ref_index_channel, k = 20):
    df = get_dataframe_in_embedding_space(path)
    index = get_annoy_index(df)
    nearest_neighbors_index = index.get_nns_by_item(ref_index_channel, k, search_k = 100000000)
    nearest_neighbors_id = [dict_ind_channel[val] for val in nearest_neighbors_index]
    return nearest_neighbors_id

In [10]:
dict_channel_ind['UCLFW3EKD2My9swWH4eTLaYw']

45013

In [11]:
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_10/channel_embedding.npy'
nearest_neighbors_id = get_k_nearest_neighbors(path, dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'], k= 40)
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)
nearest_neighbors

FileNotFoundError: [Errno 2] No such file or directory: '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_10/channel_embedding.npy'

In [38]:
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channel_embedding.npy'
nearest_neighbors_id = get_k_nearest_neighbors(path, dict_channel_ind['UCZyCposXwcyopaACep44maQ'], k= len(channels_id)-1)
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)

Chloe Morello  52419
Lauren Curtis  52969
Shanxo  55894


In [27]:
nearest_neighbors[55894:55894+50]

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
55894,UCMpOz2KEfkSdd5JeIJh_fxw,Howto & Style,2009-01-18,http://www.youtube.com/channel/UCMpOz2KEfkSdd5...,Shaaanxo,3215340,1422
55895,UCf8w5m0YsRa8MHQ5bwSGmbw,News & Politics,2008-09-25,http://www.youtube.com/channel/UCf8w5m0YsRa8MH...,asianetnews,3270000,102074
55896,UCv3rFzn-GHGtqzXiaq3sWNg,News & Politics,2012-06-01,http://www.youtube.com/channel/UCv3rFzn-GHGtqz...,ABP ANANDA,3030000,65624
55897,UCX9hgl0cGni5PLcNKzs0TSg,People & Blogs,2015-12-09,http://www.youtube.com/channel/UCX9hgl0cGni5PL...,SKYES FAMILY,948000,205
55898,UC-x0MrgC8evQZ5CKC4RjGKg,Entertainment,2017-08-04,http://www.youtube.com/channel/UC-x0MrgC8evQZ5...,TRP Of Indian Serial...,44900,2235
55899,UCSbUX_gKMur5FPcTbH2L5mA,Entertainment,2010-09-23,http://www.youtube.com/channel/UCSbUX_gKMur5FP...,Galatta Tamil | க�...,2155567,17161
55900,UCKZGcrxRAhdUi58Mdr565mw,News & Politics,2012-07-27,http://www.youtube.com/channel/UCKZGcrxRAhdUi5...,African Diaspora New...,1100000,3576
55901,UC1rsLSbsz0Y2blrWDZ_-8Qw,Gaming,2012-02-06,http://www.youtube.com/channel/UC1rsLSbsz0Y2bl...,SpyCakes,1303752,1186
55902,UC5u2g4wA-PHxOzEvPfHKfSg,Entertainment,2016-12-22,http://www.youtube.com/channel/UC5u2g4wA-PHxOz...,IntelPlayz,234000,1158
55903,UCv7PqO2lW3eK16sQngYPwsA,Gaming,2015-09-24,http://www.youtube.com/channel/UCv7PqO2lW3eK16...,BunnyFuFuu,1620000,974


In [41]:
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channel_embedding.npy'
nearest_neighbors_id = get_k_nearest_neighbors(path, dict_channel_ind['UCAql2DyGU2un1Ei2nMYsqOA'], k= 300)
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)

In [45]:
nearest_neighbors[:50]

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCAql2DyGU2un1Ei2nMYsqOA,News & Politics,2015-03-17,http://www.youtube.com/channel/UCAql2DyGU2un1E...,Donald J Trump,189000,222
1,UCqC9wT9TI5Va0yWG77-q3IA,News & Politics,2007-06-18,http://www.youtube.com/channel/UCqC9wT9TI5Va0y...,seattletimesdotcom,29100,3588
2,UCa2gbBXGeGh-Pdtugdg_qcQ,News & Politics,2012-01-26,http://www.youtube.com/channel/UCa2gbBXGeGh-Pd...,The Western Journal,82400,1154
3,UCjaeHZ3XKFe5U2pr3V7G2xQ,Entertainment,2007-01-20,http://www.youtube.com/channel/UCjaeHZ3XKFe5U2...,MAGAtruthChannel,43300,39
4,UCvQ2kejN6UkrEVhE5oiYwBw,News & Politics,2011-09-13,http://www.youtube.com/channel/UCvQ2kejN6UkrEV...,GOP War Room,42500,6496
5,UC8eE25pvmhz-nAMZATep-7g,People & Blogs,2016-05-18,http://www.youtube.com/channel/UC8eE25pvmhz-nA...,Conservative Tribune...,31100,1707
6,UCi4fcBVyo4CAnmdgXeO-NvA,News & Politics,2014-02-27,http://www.youtube.com/channel/UCi4fcBVyo4CAnm...,CBS Boston,63500,31634
7,UCP4BdwPKyYnfRLjSNfGAPRA,News & Politics,2009-05-16,http://www.youtube.com/channel/UCP4BdwPKyYnfRL...,Daily Caller,135000,1050
8,UCFjfrFJ7pYqC_t1gGfq3nng,News & Politics,2016-03-28,http://www.youtube.com/channel/UCFjfrFJ7pYqC_t...,POTUS Trump MAGA,27000,1705
9,UCTZzJjoML2sFpq4T7CpOz8Q,News & Politics,2016-10-17,http://www.youtube.com/channel/UCTZzJjoML2sFpq...,LIVE ON-AIR NEWS,78100,354


In [76]:
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channel_embedding.npy'
nearest_neighbors_id = get_k_nearest_neighbors(path, dict_channel_ind['UC9pXxdNqCc2zjgRXSoowNNg'], k= 20)
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)
nearest_neighbors

57004


Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UC9pXxdNqCc2zjgRXSoowNNg,News & Politics,2014-07-09,http://www.youtube.com/channel/UC9pXxdNqCc2zjg...,A Blue Dot In Texas,10600,646
1,UCqSR91bTcQyrSZpUdGpJy1w,News & Politics,2008-12-31,http://www.youtube.com/channel/UCqSR91bTcQyrSZ...,Julia Blog,10300,376
2,UCRPdcY4_Al8TrRtgkMzkrQQ,Entertainment,2014-03-21,http://www.youtube.com/channel/UCRPdcY4_Al8TrR...,Tony Dortie - 24/7 E...,98700,333
3,UCXEjC-1iUa1WhwrBp0eBBuw,Entertainment,2016-10-13,http://www.youtube.com/channel/UCXEjC-1iUa1Whw...,Be Less Stupid,52100,309
4,UCuDv5p8E-evaRSh542hDV5g,News & Politics,2015-06-16,http://www.youtube.com/channel/UCuDv5p8E-evaRS...,Robert Reich,149000,221
5,UCWCtoOXSwTcbVjOiK8iFGUg,People & Blogs,2015-06-10,http://www.youtube.com/channel/UCWCtoOXSwTcbVj...,Washington Watch,25000,164
6,UCwlHzGKZA-e2GeLMjIE51uw,News & Politics,2017-08-16,http://www.youtube.com/channel/UCwlHzGKZA-e2Ge...,Parody Project,50400,99
7,UCx_SjDi4CS5ALkWCS9ffldQ,News & Politics,2012-01-24,http://www.youtube.com/channel/UCx_SjDi4CS5ALk...,WGBH News,17700,5938
8,UCcRDioQprYoceT_3NhrTIAQ,Science & Technology,2013-11-11,http://www.youtube.com/channel/UCcRDioQprYoceT...,Brad Reed,13600,935
9,UC1Z1i5ClfrJhyu9z1hlmDkQ,News & Politics,2015-05-14,http://www.youtube.com/channel/UC1Z1i5ClfrJhyu...,Daily Kos,10800,878


In [77]:
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channel_embedding.npy'
nearest_neighbors_id = get_k_nearest_neighbors(path, dict_channel_ind['UCAuUUnT6oDeKwE6v1NGQxug'], k= 20)
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)
nearest_neighbors

57004


Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCAuUUnT6oDeKwE6v1NGQxug,People & Blogs,2006-12-06,http://www.youtube.com/channel/UCAuUUnT6oDeKwE...,TED,14800000,3105
1,UCsT0YIqwnpJCM-mx7-gSA4Q,Nonprofits & Activism,2009-06-23,http://www.youtube.com/channel/UCsT0YIqwnpJCM-...,TEDx Talks,20700000,141627
2,UCvQECJukTDE2i6aCoMnS-Vg,Education,2006-10-01,http://www.youtube.com/channel/UCvQECJukTDE2i6...,Big Think,2650000,11135
3,UCHnyfMqiRRG1u-2MsSQLbXA,Education,2010-07-21,http://www.youtube.com/channel/UCHnyfMqiRRG1u-...,Veritasium,6370000,280
4,UCqnbDFdCpuN8CMEg0VuEBqA,News & Politics,2006-10-13,http://www.youtube.com/channel/UCqnbDFdCpuN8CM...,The New York Times,2150000,9619
5,UCpVm7bg6pXKo1Pr6k5kxG9A,Entertainment,2006-05-07,http://www.youtube.com/channel/UCpVm7bg6pXKo1P...,National Geographic,12300000,9260
6,UC7IcJI8PUf5Z3zKxnZvTBog,Education,2010-05-18,http://www.youtube.com/channel/UC7IcJI8PUf5Z3z...,The School of Life,4870000,724
7,UCLXo7UDZvByw2ixzpQCufnA,News & Politics,2014-03-04,http://www.youtube.com/channel/UCLXo7UDZvByw2i...,Vox,6570000,1048
8,UC3XTzVzaHQEd30rQbuvCtTQ,Entertainment,2014-03-18,http://www.youtube.com/channel/UC3XTzVzaHQEd30...,LastWeekTonight,7240000,286
9,UCX6b17PVsYBQ0ip5gyeme-Q,Education,2006-05-20,http://www.youtube.com/channel/UCX6b17PVsYBQ0i...,CrashCourse,9800000,1160


In [84]:
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channel_embedding.npy'
nearest_neighbors_id = get_k_nearest_neighbors(path, dict_channel_ind['UCwTkM6CvIsYFaFiMKIKCqHw'], k= 20)
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)
nearest_neighbors

57004


Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCwTkM6CvIsYFaFiMKIKCqHw,Film and Animation,2005-11-27,http://www.youtube.com/channel/UCwTkM6CvIsYFaF...,James Bond 007,115000,407
1,UCwQJJVNbn_Tqze89xf83nhw,Entertainment,2007-09-12,http://www.youtube.com/channel/UCwQJJVNbn_Tqze...,The Asylum - Officia...,11200,210
2,UCFSILgKCKo35QYGz8Kob51g,Film and Animation,2008-01-03,http://www.youtube.com/channel/UCFSILgKCKo35QY...,StudiocanalUK,97200,1844
3,UCwSIJCMWZC5GDM59wj7pMsg,Entertainment,2006-05-21,http://www.youtube.com/channel/UCwSIJCMWZC5GDM...,Prime Video UK,53200,996
4,UCX8yUJVv_GCpBNw7yp1Y_4Q,Comedy,2006-10-23,http://www.youtube.com/channel/UCX8yUJVv_GCpBN...,Anthony Ingruber,22100,77
5,UCSi5779nMiFTbLm3prHaa1g,Film and Animation,2008-01-15,http://www.youtube.com/channel/UCSi5779nMiFTbL...,Roadshow Films,85400,2138
6,UC1AJxP0vC_yqPUtr2MQttRQ,People & Blogs,2008-07-27,http://www.youtube.com/channel/UC1AJxP0vC_yqPU...,20thCenturyFoxFilm,44900,477
7,UCi_iW2lqB3ADsJOobe_8yeg,Film and Animation,2011-09-14,http://www.youtube.com/channel/UCi_iW2lqB3ADsJ...,Paramount Pictures U...,468000,976
8,UC_yl-sQID1NrBwAcS1LRHmg,Comedy,2009-11-16,http://www.youtube.com/channel/UC_yl-sQID1NrBw...,Mandatory,39100,1195
9,UCruD_lL-5fmllpKMSL-yCyQ,Entertainment,2006-06-05,http://www.youtube.com/channel/UCruD_lL-5fmllp...,Sony Pictures Classi...,49800,412


In [80]:
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channel_embedding.npy'
nearest_neighbors_id = get_k_nearest_neighbors(path, dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'], k= 20)
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)
nearest_neighbors

57004


Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCISF5OGuAtSLNF24TKTnXag,Film and Animation,2013-12-17,http://www.youtube.com/channel/UCISF5OGuAtSLNF...,Magpiepony,736000,304
1,UC8Wj98MR_oUHBpTjLsE3HuA,Film and Animation,2010-10-10,http://www.youtube.com/channel/UC8Wj98MR_oUHBp...,Pinkie Rose,247000,167
2,UCHKDtSVVEc686EW7jRSCpZA,Entertainment,2007-04-01,http://www.youtube.com/channel/UCHKDtSVVEc686E...,Scribbler Production...,327000,1767
3,UCcZjYWpM72s-1cinYfeSCag,People & Blogs,2010-08-05,http://www.youtube.com/channel/UCcZjYWpM72s-1c...,DisneyFanatic2364,205000,1026
4,UCg3bKR3Z0rCmUe81X06EcaA,Entertainment,2015-11-21,http://www.youtube.com/channel/UCg3bKR3Z0rCmUe...,Blank Slate,121000,408
5,UC94Z4HZJkhPm94YPH1GE3bw,Entertainment,2011-12-13,http://www.youtube.com/channel/UC94Z4HZJkhPm94...,EileMonty,504000,211
6,UCPflSzRRyzK2hhbVxNAnrhw,Film and Animation,2012-03-31,http://www.youtube.com/channel/UCPflSzRRyzK2hh...,Pinkie Pie,1229394,451
7,UChU79FLnClqlRSTSZe3EjNA,Film and Animation,2014-03-21,http://www.youtube.com/channel/UChU79FLnClqlRS...,Flutter525,858000,1297
8,UCcGuE4cmWhB4XLjSKA8eRFA,Entertainment,2014-08-30,http://www.youtube.com/channel/UCcGuE4cmWhB4XL...,WatchPony,117000,341
9,UCaRdKsgbUdMIp-9FwEsaokg,Film and Animation,2014-02-25,http://www.youtube.com/channel/UCaRdKsgbUdMIp-...,Ashley H,77200,500


### Channels selected over the whole comments dataset
We randomly choose 10 000 users over the dataset.
For each user, we then pick two channels at random in the set of channels this user commented in.

In [14]:
selected_users_rw = np.random.randint(227495283-1, size=3000)

In [5]:
for v1, v2 in selected_channels_rw:
    if v1 == v2:
        print('haha')

In [15]:
selected_channels_rw = np.random.randint(129616-1, size=(3000, 2))

In [12]:
1 % 2

1

In [16]:
# Load the channel tuple sparse matrix
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_for_word2vec.npz')
NB_SAMPLE = 3000

with open("/dlabdata1/youtube_large/jouven/channels_more_300/selected_users_word2vecf.pkl",'rb') as f:
     selected_users = pickle.load(f)
f.close()
selected_users = np.random.choice(selected_users, NB_SAMPLE, replace=False)

S = S[:, selected_users]
# Create and store channels tuples
channels_tuple = []
for i in range(S.shape[1]):
    idx = S[:, i].nonzero()
    idx = idx[0]
    
    selected_channels = np.random.choice(idx, 2, replace=False)
    channels_tuple.append((selected_channels[0], selected_channels[1]))
    
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_user_walk_known_users.pkl",'wb') as f:
     pickle.dump(channels_tuple, f)
f.close()

S = scipy.sparse.load_npz('/dlabdata1/youtube_large/jouven/final_sparse_matrix/channels_more_300/sparse_matrix_word2vec_users_commented_geq_2_channels.npz')

S2 = S[:, selected_users_rw]

# Create and store channels tuples
channels_tuple = []
last = 0
for i in range(S2.shape[1]):
    idx = S2[:, i].nonzero()
    idx = idx[0]
    
    if i % 2 == 1:
        selected_channel = np.random.choice(idx, 1)
        channels_tuple.append((last, selected_channel))
    else:
        last = np.random.choice(idx, 1)
    
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_random_walk_modified.pkl",'wb') as f:
     pickle.dump(channels_tuple, f)
f.close()

channels_tuple = []
for val in selected_channels_rw:
    channels_tuple.append((val[0], val[1]))
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_random_walk.pkl",'wb') as f:
     pickle.dump(channels_tuple, f)
f.close()


# Sample NB_SAMPLE from the set of users
selected_users = np.random.randint(S.shape[1] - 1, size=NB_SAMPLE)
S = S[:, selected_users]

# Create and store channels tuples
channels_tuple = []
for i in range(S.shape[1]):
    idx = S[:, i].nonzero()
    idx = idx[0]
    
    selected_channels = np.random.choice(idx, 2, replace=False)
    channels_tuple.append((selected_channels[0], selected_channels[1]))
    
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_user_walk.pkl",'wb') as f:
     pickle.dump(channels_tuple, f)
f.close()



In [6]:
with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_user_walk.pkl",'rb') as f:
     channels_tuple = pickle.load(f)
f.close()

#### Random walk

In [7]:
def get_random_walk(df_embedding):
    with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_random_walk.pkl",'rb') as f:
         random_walk_channels = pickle.load(f)
    f.close()
    random_walk_distance = 0
    for val in random_walk_channels:
        random_walk_distance += distance.euclidean(df_embedding.iloc[val[0]], df_embedding.iloc[val[1]])
    return random_walk_distance

In [14]:
def get_random_walk_modified(df_embedding):
    with open("/dlabdata1/youtube_large/jouven/channels_more_300/channels_tuple_random_walk_modified.pkl",'rb') as f:
         random_walk_channels = pickle.load(f)
    f.close()
    random_walk_distance = 0
    for val in random_walk_channels:
        random_walk_distance += distance.euclidean(df_embedding.iloc[val[0]], df_embedding.iloc[val[1]])
    return random_walk_distance

#### Compute metrics: users walk distance and relative nearest neighbor ranking 

In this section we want to measure the euclidian distance of a user walk compared to a random walk.

user walk: Euclidean distance in the embedding space between two randomly channels of a user.
random walk: Euclidean distance in the embedding space between two randomly channels.
position: Position of a channel taken from user u relatively of another channel taken from the same user in terms of its nearest neighbor ranking.


In [9]:
'''
Get the position of ref_channel relative to second_channel in terms of its nearest neighbors ranking.
PARAMETER:
    - ref_channel: The reference channel on which wwe compute it's k nearest neighbor
    - second_channel: The channel where we compute it's ranking relatively to ref_channel
    - dist: Euclidean distance between ref_channel and second_channel
    - index: annoy index
    - df_embedding: DataFrame representing the embedding space

RETURN: The position of second_channel relatively to ref_channel in terms of it's ranking

'''
def get_ranking_position_between_channels(ref_channel, second_channel, dist, index, df_embedding):
    
    nearest_neighbors_index = index.get_nns_by_item(ref_channel, len(df_embedding), search_k = 100000000)
    dist_k_th_nearest = distance.euclidean(df_embedding.iloc[ref_channel], 
                                           df_embedding.iloc[nearest_neighbors_index[len(nearest_neighbors_index)-1]])
    for i in range(0, len(nearest_neighbors_index)):
        if nearest_neighbors_index[i] == second_channel:
            return i
    

In [15]:
users_walk_tab = []
ranking_position_tab = []

len_random_set = len(channels_tuple)
len_embedding = len(channels_id)
 
files = ['/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_10/channel_embedding.npy']
for file in files: 
    print('file ', file)
    df_embedding = get_dataframe_in_embedding_space(file)
    n_comp = df_embedding.shape[1]
    print('n_comp ', n_comp)
    random_walk_distance = get_random_walk_modified(df_embedding)
        
    index = get_annoy_index(df_embedding)

    users_walk = 0
    ranking_position = 0

    for ref_channel, second_channel in channels_tuple:
        dist = distance.euclidean(df_embedding.iloc[ref_channel], df_embedding.iloc[second_channel])
        users_walk += dist
        ranking_position += get_ranking_position_between_channels(ref_channel, second_channel, dist, index, df_embedding)
    
    users_walk_tab.append(users_walk/random_walk_distance)
    ranking_position_tab.append(ranking_position/(len_random_set*len_embedding))


file  /dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_10/channel_embedding.npy
n_comp  200


In [16]:
users_walk_tab

[1.9730998994803428]

In [17]:
ranking_position_tab

[0.555226085771304]

In [11]:
users_walk_tab

[2.2101317966239815]

In [12]:
ranking_position_tab

[0.555226085771304]

In [None]:
'''
Plot the results obtained when computing the metrics
'''
import plotly.graph_objects as go

components_name = ['Features dim 50', 'Features dim 100', 'Features dim 200', 'Features dim 500']
colors = ['b', 'g,', 'r', 'c']

fig = go.Figure()

for i in range(len(components_name)):
    fig.add_trace(go.Scatter(
        x=ranking_position_tab,
        y=users_walk_tab,
        marker=dict(color=colors[i], size=12),
        mode="markers",
        name=components_name[i],
    ))

fig.update_layout(title="Gender Earnings Disparity",
                  xaxis_title="Ranking position",
                  yaxis_title="Users walk distance")

fig.show()