This notebook checks the behaviour of the embedding space

In [7]:
import numpy as np
import pandas as pd
import zstandard as zstd

import pickle
import random

from annoy import AnnoyIndex
from scipy.spatial import distance

In [8]:
'''
Retrieve the array obtained by apllying the dimentinality reductin algorithm
graph_matrix: SHAPE: (channels, n_comp)
'''
n_comp = 500
graph_matrix = np.load('../../../dlabdata1/youtube_large/jouven/graph_matrix_reduced_'+str(n_comp)+'.npz')
graph_matrix = graph_matrix['arr_0']
df = pd.DataFrame(graph_matrix)
df = df.rename(lambda x: 'dr'+str(x), axis='columns')
df.head()

Unnamed: 0,dr0,dr1,dr2,dr3,dr4,dr5,dr6,dr7,dr8,dr9,...,dr490,dr491,dr492,dr493,dr494,dr495,dr496,dr497,dr498,dr499
0,3.018157,2.509277,-1.370723,-1.205678,-2.077085,-0.748989,0.437611,2.537465,-1.731717,-0.114103,...,0.055535,0.139898,-0.440606,0.131203,0.164528,0.020788,-0.678321,-0.291248,-0.392943,0.481696
1,8.037677,4.613102,1.003205,0.260758,2.15802,-4.919136,1.621984,0.779602,2.23735,2.801618,...,-0.542192,-0.185954,0.173667,0.040427,0.47272,0.092969,-0.323724,-0.429245,-0.025467,-0.864688
2,21.643678,4.48647,17.017157,4.962999,5.291304,-7.874998,-0.042691,5.202206,5.295214,3.251302,...,2.204272,-0.736775,1.406568,-0.932924,1.431729,-0.790614,-1.471813,-0.479949,1.046505,0.862118
3,367.470984,-22.194127,-71.681107,-28.139755,-121.998349,-263.544879,40.496879,-23.376365,14.612321,31.48856,...,-5.095282,-12.182553,2.291161,5.818858,-0.325965,-1.839918,1.649609,0.640047,-5.05965,-6.026595
4,27.062345,3.521666,8.440126,2.053278,10.615795,-12.821902,-3.522644,-7.847089,-14.845779,20.498252,...,2.095842,-2.15751,-0.829279,-0.495257,0.104933,-1.049125,0.119824,1.17928,0.452845,-2.336437


In [9]:
'''
Now we have to map the index values into the channels_id thanks to the mapping dictionarray already created
'''
# Channels that are in set_crawler dataset and also in which the language is in english
with open('../../../dlabdata1/youtube_large/olam/channels_id.pickle', 'rb') as f:
    channels_id = pickle.load(f)
f.close()

# Dictionnary to map an integer corresponding to the column/row of the sparse matrix to the channel id.
channel_index_dict = {}
for ind, channel_id in enumerate(channels_id):
    channel_index_dict[ind] = channel_id
    
# Dictionnary to map the channel id to an integer corresponding to the column/row of the sparse matrix.
channel_id_dict = {}
for ind, channel_id in enumerate(channels_id):
    channel_id_dict[channel_id] = ind

### Find k closest channel using annoy library

First to check how good is the embedding space, we are going to choose a channel and it's k closest channels in the embedding space. By looking at these channels in the YouTube website, we should get similar channels if the embedding space is good.

In [61]:
np.array(df.iloc[0])

array([ 3.01815744e+00,  2.50927734e+00, -1.37072257e+00, -1.20567830e+00,
       -2.07708523e+00, -7.48989291e-01,  4.37611135e-01,  2.53746474e+00,
       -1.73171676e+00, -1.14103475e-01,  1.25726826e+00,  6.30444621e-01,
        6.11343915e-01,  7.83699124e-01, -4.19090279e-01,  4.66672389e-01,
       -3.89315289e-01, -1.06828125e+00,  1.43515491e+00,  4.38751007e-01,
        2.27017606e+00,  2.55525518e-01, -1.35139048e-01, -8.00284152e-01,
        3.15844197e-01,  1.49824716e-01,  1.85265243e+00,  1.06826949e+00,
       -5.54717077e-01, -4.24087640e-01,  6.90991313e-01, -2.07789112e-02,
        1.48008816e+00, -2.16634476e+00, -2.87603960e-01,  9.58513827e-02,
        2.57788137e-01,  2.29807060e-01,  9.82293950e-01,  3.46817879e-01,
       -1.05128779e+00, -2.53955763e-01, -1.37272143e+00, -1.21046569e-01,
       -5.11473669e-01,  2.28005661e-01, -1.83236064e-01, -8.55537883e-01,
       -4.85062577e-01, -6.51462422e-01, -1.96393892e-02, -4.05489259e-01,
       -5.29935650e-01,  

In [62]:
index = AnnoyIndex(n_comp, "euclidean")  # Length of item vector that will be indexed
df.apply(lambda row: index.add_item(row.name, np.array(row)), axis = 1)
index.build(100) # 10 trees
#index.save('../../../dlabdata1/youtube_large/jouven/annoy_index_10_trees.ann')

True

In [75]:
#random.seed(1)
item = random.randint(0, len(df))
print('item: ', item)
nearest_neighbors_index = index.get_nns_by_item(item, 20, search_k = len(df))
print(nearest_neighbors_index) # will find the 1000 nearest neighbors

item:  16543
[16543, 115042, 77163, 78164, 47750, 33212, 62731, 147383, 89464, 53533, 24819, 43031, 5458, 49837, 58287, 5848, 143352, 98347, 64534, 37307]


In [77]:
nearest_neighbors_id = [channel_index_dict[val] for val in nearest_neighbors_index]
nearest_neighbors_id

['UCPuuuhmMW7jh6roOrIV9yRw',
 'UCjpgDLE5LGOpzX2igCKAdSQ',
 'UC4NALVCmcmL5ntpV0thoH6w',
 'UCs9DW1cr_cf4Mpi6QFjam_w',
 'UCvbQvWnQWEVNciTqgQPc7Hg',
 'UCuM2xSE9ix0ncMxjx4Ggolw',
 'UC2kz8okargLDXEH5RZMIRvw',
 'UCs8EL6BMdxYx9K-wO7T2mIg',
 'UCl69ToPHGlOHUCegE2HJM1g',
 'UCjAij4NWCod80d70enm3tRw',
 'UCvTueU0tFCj3Ym_aPi6UmUw',
 'UCO5ujNeWRIwP4DbCZqZWcLw',
 'UCSQ4EcsdtS3YxG2oHLF6heg',
 'UCN_ebY0N506fwZIwud36mUg',
 'UCzOIlU2dIXOTiTlFDuA3PJg',
 'UC2q1qbHBqQJl-TL15LPT0dw',
 'UCN9xkhyjdN8QN5d1m2FIU8A',
 'UCK5X3f0fxO4YnVKVZP8p6hg',
 'UC34n5OYv8Z7PRP-mtkxkPwQ',
 'UCvnCXuh_zhm75EJ89qJ95Kw']

In [78]:
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")
channelcrawler['channel_id'] = channelcrawler['link'].str.split('/').str[-1]
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [79]:
channelcrawler['category'].unique()

array(['Film and Animation', 'Entertainment', 'Music', 'Comedy', 'Gaming',
       'Science & Technology', 'Sports', 'Education', 'People & Blogs',
       'Nonprofits & Activism', 'Howto & Style', 'News & Politics',
       'Travel & Events', 'Autos & Vehicles', 'Pets & Animals', nan],
      dtype=object)

In [80]:
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCPuuuhmMW7jh6roOrIV9yRw,Music,2006-03-01,http://www.youtube.com/channel/UCPuuuhmMW7jh6r...,Grateful Dead,112434,157
1,UCjpgDLE5LGOpzX2igCKAdSQ,People & Blogs,2013-03-17,http://www.youtube.com/channel/UCjpgDLE5LGOpzX...,Bethany G,161000,740
2,UC4NALVCmcmL5ntpV0thoH6w,Music,2014-08-05,http://www.youtube.com/channel/UC4NALVCmcmL5nt...,LooLoo Kids - Nurser...,18500000,415
3,UCs9DW1cr_cf4Mpi6QFjam_w,Music,2013-02-19,http://www.youtube.com/channel/UCs9DW1cr_cf4Mp...,Yo Trane,61700,24
4,UCvbQvWnQWEVNciTqgQPc7Hg,Music,2013-03-23,http://www.youtube.com/channel/UCvbQvWnQWEVNci...,DjPurpleI5h,30700,1502
5,UCuM2xSE9ix0ncMxjx4Ggolw,Science & Technology,2011-06-22,http://www.youtube.com/channel/UCuM2xSE9ix0ncM...,Kevin Riazi,14700,63
6,UC2kz8okargLDXEH5RZMIRvw,Entertainment,2015-02-08,http://www.youtube.com/channel/UC2kz8okargLDXE...,Creative Newborns,11100,833
7,UCs8EL6BMdxYx9K-wO7T2mIg,Music,2015-05-04,http://www.youtube.com/channel/UCs8EL6BMdxYx9K...,Careless Whispers,14500,48
8,UCl69ToPHGlOHUCegE2HJM1g,Music,2010-03-20,http://www.youtube.com/channel/UCl69ToPHGlOHUC...,CHHEWANG,348000,48
9,UCjAij4NWCod80d70enm3tRw,Gaming,2017-03-21,http://www.youtube.com/channel/UCjAij4NWCod80d...,Atomic Heart,30600,15


### Comparative study between users walk and random walk

In this section we want to measure the euclidian distance of a user walk compared to a random walk.
user walk: All the channels visited by a user taken at random. We then compute the euclidian distance in the embedding space between the first and the last channel.
random walk: Channels taken at random and we measures their distances.

The result we are looking for is that the user walk should be smaller than the random walk.

In [74]:
len_random_set = 1000

#### Users walk

In [13]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [10]:
class Zreader:

    def __init__(self, file, chunk_size=16384):
        '''Init method'''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''

    def readlines(self):
        '''Generator method that creates an iterator for each line of JSON'''
        while True:
            chunk = self.reader.read(self.chunk_size).decode("utf-8", errors="ignore")
            if not chunk:
                break
            lines = (self.buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]


In [78]:
'''
Function to add new edge
    PARAMETERS:
        - graph_dict: dictionnary mapping the edge (tuple of channel indices) with the weight of that edge
        - user_edge_channel_id: new edge to be added in graph_dict
'''
def add_user(channels, first_channel, last_channel, idx):
    channels.append((channel_id_dict[first_channel], channel_id_dict[last_channel]))
    #print('user at index ' + str(idx) + ' has been added')
        
# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# parameters
channels = []
random_values = set(random.sample(range(3, 100000), k = len_random_set))
#print('index of users taken at random: ', random_values)
idx = 1
nb_elem_added = 0
user = 'author_id'
first_channel = ''
last_channel = ''
average_distance = 0


# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if len(line_split) == 9:
        if idx == 1:
            print(line_split)
            idx += 1

        else:
            if vid_to_channels.get(line_split[2]) in channels_id:
                corr_channel = vid_to_channels[line_split[2]]
                if line_split[0] == user:
                    last_channel = corr_channel
                    user_last = line_split[0]
                else:
                    if (idx in random_values) and (last_channel != ''):
                        add_user(channels, first_channel, last_channel, idx)
                        nb_elem_added += 1
                    elif idx in random_values:
                        idx -= 1
                    user_first = line_split[0]
                    first_channel = corr_channel
                    last_channel = ''
                    user = line_split[0]
                    idx += 1
                
    if nb_elem_added == len_random_set:
        for val in channels:
            average_distance += distance.euclidean(df.iloc[val[0]], df.iloc[1])
        average_distance /= len_random_set
        print('Average distance of the users walk is ', average_distance)
        break


['author_id', 'id', 'video_id', 'parent_id', 'crawled_at', 'likes', 'replies', 'author', 'content']
Average distance of the users walk is  578.2853395127695


#### Random walk

In [79]:
len_random_values = len(df)
random_values_1 = random.sample(range(len_random_values), k = len_random_set)
random_values_2 = random.sample(range(len_random_values), k = len_random_set)
random_walk_channels = list(zip(random_values_1, random_values_2))
#print('random channels chosen: ', list(random_values))

In [80]:
average_random_walk_distance = 0
for val in random_walk_channels:
    average_random_walk_distance += distance.euclidean(df.iloc[val[0]], df.iloc[1])
average_random_walk_distance /= len_random_set
print('Average distance of the random walk is ', average_random_walk_distance)

Average distance of the random walk is  498.32158976881726
