This notebook checks the behaviour of the embedding space

In [1]:
import numpy as np
import pandas as pd
import zstandard as zstd
import re

import pickle
import random
import glob

from annoy import AnnoyIndex
from scipy.spatial import distance

In [2]:
'''
Retrieve the array obtained by apllying the dimentionality reduction algorithm
graph_matrix: SHAPE: (channels, n_comp)

PARAMETER:
    - file_path: the path where the embedding graph is stored

RETURN: 
    - df: DataFrame representing the graph in the embedding space
'''
def get_dataframe_in_embedding_space(file_path):
    graph_matrix = np.load(file_path)
    graph_matrix = graph_matrix['arr_0']
    df = pd.DataFrame(graph_matrix)
    df = df.rename(lambda x: 'dr'+str(x), axis='columns')
    return df

In [3]:
'''
Now we have to map the index values into the channels_id thanks to the mapping dictionarray already created
'''
# Channels that are in set_crawler dataset and also in which the language is in english
with open('/dlabdata1/youtube_large/olam/channels_id_filtered.pickle', 'rb') as f:
    channels_id = pickle.load(f)
f.close()
channels_id = sorted(channels_id)
# Dictionnary to map an integer corresponding to the column/row of the sparse matrix to the channel id.
channel_index_dict = {}
for ind, channel_id in enumerate(channels_id):
    channel_index_dict[ind] = channel_id
    
# Dictionnary to map the channel id to an integer corresponding to the column/row of the sparse matrix.
channel_id_dict = {}
for ind, channel_id in enumerate(channels_id):
    channel_id_dict[channel_id] = ind
channels_id = set(channels_id)

### Find k closest channel using annoy library

First to check how good is the embedding space, we are going to choose a channel and it's k closest channels in the embedding space. By looking at these channels in the YouTube website, we should get similar channels if the embedding space is good.

In [292]:
np.array(df.iloc[0])

array([3.01815743, 2.50921109])

In [293]:
index = AnnoyIndex(n_comp, "euclidean")  # Length of item vector that will be indexed
df.apply(lambda row: index.add_item(row.name, np.array(row)), axis = 1)
index.build(100) # 10 trees
#index.save('../../../dlabdata1/youtube_large/jouven/annoy_index_10_trees.ann')

True

In [294]:
#random.seed(1)
item = random.randint(0, len(df))
print('item: ', item)
nearest_neighbors_index = index.get_nns_by_item(item, 20, search_k = len(df))
print(nearest_neighbors_index) # will find the 1000 nearest neighbors

item:  99347
[99347, 126103, 156642, 116668, 4481, 149946, 107057, 81023, 126224, 55610, 20235, 46046, 114177, 52922, 90852, 21997, 96426, 55627, 120318, 3226]


In [295]:
nearest_neighbors_id = [channel_index_dict[val] for val in nearest_neighbors_index]
nearest_neighbors_id

['UCMXvZ5ki-b4X_wbHwsj7PZw',
 'UCQu9eR8S7dAm1fdXBCOrJ6A',
 'UCfoxf7UeCpFpBAguS3JwbPA',
 'UCzTTM7g6KJ1lFF9wuJCdvTg',
 'UCdYVjiqHCjpi2BR0PbTjmEg',
 'UChI8VwnZuH6vcu1j8ld1Czw',
 'UCMCiK6bN3PfdKvNnYJf_h2A',
 'UCy3lCSLz5_LldHqNSeO7uMQ',
 'UCR9ehrySGCcxFNg8gCA2rsQ',
 'UCtI9t9l037t9KanXNtBl6mA',
 'UCx3s3t5kpD4VMfJjDi5keXw',
 'UCUgxI74QWFYEwNgDCIYFl3Q',
 'UCMdHGrfjwVU3_Gq6Olk3sdA',
 'UCAiomfxZAbfkyRSDs9gsiXg',
 'UC6ePwIAVzpvlY_9x8mb1Z9A',
 'UCaUajKAl3cpGQ6KARpnz_3w',
 'UCuDcRrzB9aQde-6o8xEkJ7Q',
 'UCGGTXs9gvw2ARbhOr5UE1sA',
 'UCdtlhXq8cPkJ1evFkj9q5Qg',
 'UCz7eke4JGlbtc11_6mmA7ew']

In [296]:
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")
channelcrawler['channel_id'] = channelcrawler['link'].str.split('/').str[-1]
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [297]:
channelcrawler['category'].unique()

array(['Film and Animation', 'Entertainment', 'Music', 'Comedy', 'Gaming',
       'Science & Technology', 'Sports', 'Education', 'People & Blogs',
       'Nonprofits & Activism', 'Howto & Style', 'News & Politics',
       'Travel & Events', 'Autos & Vehicles', 'Pets & Animals', nan],
      dtype=object)

In [298]:
nearest_neighbors = pd.DataFrame(nearest_neighbors_id, columns = ['channel_id']).merge(channelcrawler)
nearest_neighbors

Unnamed: 0,channel_id,category,join_date,link,name,subscribers,videos
0,UCMXvZ5ki-b4X_wbHwsj7PZw,Film and Animation,2011-11-11,http://www.youtube.com/channel/UCMXvZ5ki-b4X_w...,Panoots,166263,28
1,UCQu9eR8S7dAm1fdXBCOrJ6A,Gaming,2018-02-01,http://www.youtube.com/channel/UCQu9eR8S7dAm1f...,FIFA DAP,13600,75
2,UCfoxf7UeCpFpBAguS3JwbPA,Entertainment,2014-12-16,http://www.youtube.com/channel/UCfoxf7UeCpFpBA...,play with me,69287,19
3,UCzTTM7g6KJ1lFF9wuJCdvTg,Entertainment,2014-08-18,http://www.youtube.com/channel/UCzTTM7g6KJ1lFF...,Sensei Aishitemasu,40600,788
4,UCdYVjiqHCjpi2BR0PbTjmEg,Comedy,2006-03-07,http://www.youtube.com/channel/UCdYVjiqHCjpi2B...,MattG124,469000,608
5,UChI8VwnZuH6vcu1j8ld1Czw,Music,2009-05-12,http://www.youtube.com/channel/UChI8VwnZuH6vcu...,BelanovaVEVO,261721,35
6,UCMCiK6bN3PfdKvNnYJf_h2A,Comedy,2007-04-10,http://www.youtube.com/channel/UCMCiK6bN3PfdKv...,Se Joe,35900,770
7,UCy3lCSLz5_LldHqNSeO7uMQ,People & Blogs,2011-07-22,http://www.youtube.com/channel/UCy3lCSLz5_LldH...,brandon begin,47000,71
8,UCR9ehrySGCcxFNg8gCA2rsQ,Entertainment,2013-10-02,http://www.youtube.com/channel/UCR9ehrySGCcxFN...,Hermetic Kitten ASMR...,300000,530
9,UCtI9t9l037t9KanXNtBl6mA,Film and Animation,2011-09-19,http://www.youtube.com/channel/UCtI9t9l037t9Ka...,allspark2013,17000,195


### Channels selected over the whole comments dataset
We randomly choose 10 000 users over the dataset.
For each user, we then pick two channels at random in the set of channels this user commented in.

In [None]:
# Dictionnary mapping the video_id to the channel_id
vid_to_channels = pd.read_pickle("/dlabdata1/youtube_large/id_to_channel_mapping.pickle")

In [None]:
class Zreader:

    def __init__(self, file, chunk_size=16384):
        '''Init method'''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''

    def readlines(self):
        '''Generator method that creates an iterator for each line of JSON'''
        while True:
            chunk = self.reader.read(self.chunk_size).decode("utf-8", errors="ignore")
            if not chunk:
                break
            lines = (self.buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]


In [2]:
def add_channels(user_c):
    return random.sample(user_c, 2)

# Adjust chunk_size as necessary -- defaults to 16,384 if not specific
reader = Zreader("/dlabdata1/youtube_large/youtube_comments.ndjson.zst", chunk_size=16384)

# parameters
len_random_set = 10001
nb_users = 534744094
nb = 1
user_channels = []
channels_tuple = []
threshold = (len_random_set*1.1) / nb_users
not_completed = False

user = ''

# Read each line from the reader
for line in reader.readlines():
    line_split = line.replace('"', '').split(',')
    if user != line_split[0]:
        if not_completed:
            if len(user_channels) >= 2:
                channels_tuple.append(tuple(add_channels(user_channels)))
                nb += 1
                not_completed = False
            user_channels = []
        else:
            user_channels = []
            if random.random() <= threshold:
                if len(line_split) == 9:
                    if vid_to_channels.get(line_split[2]) in channels_id:
                        corr_channel = vid_to_channels[line_split[2]]
                        user_channels.append(channel_id_dict[corr_channel])
                not_completed = True
        
    else:
        if len(line_split) == 9:
            if vid_to_channels.get(line_split[2]) in channels_id:
                corr_channel = vid_to_channels[line_split[2]]
                if channel_id_dict[corr_channel] not in set(user_channels):
                    user_channels.append(channel_id_dict[corr_channel])
    user = line_split[0]


    if nb == len_random_set:
        break

NameError: name 'Zreader' is not defined

In [379]:
with open("/dlabdata1/youtube_large/jouven/randomly_selected_channels.pkl",'wb') as f:
     pickle.dump(channels_tuple, f)
f.close()

#### Random walk

In [4]:
def get_random_walk(df_embedding):
    random_values_1 = random.sample(range(len_embedding), k = len_random_set)
    random_values_2 = random.sample(range(len_embedding), k = len_random_set)
    random_walk_channels = list(zip(random_values_1, random_values_2))

    random_walk_distance = 0
    for val in random_walk_channels:
        random_walk_distance += distance.euclidean(df_embedding.iloc[val[0]], df_embedding.iloc[val[1]])
    return random_walk_distance

#### Compute metrics: users walk distance and relative nearest neighbor ranking 

In this section we want to measure the euclidian distance of a user walk compared to a random walk.

user walk: Euclidean distance in the embedding space between two randomly channels of a user.
random walk: Euclidean distance in the embedding space between two randomly channels.
position: Position of a channel taken from user u relatively of another channel taken from the same user in terms of its nearest neighbor ranking.


In [22]:
# Take only 10000 elements from the 100000 elements channels tuple
#channels_tuple2 = random.sample(channels_tuple, 20000)

In [27]:
#channels_tuple = random.sample(channels_tuple3, 10000)

In [29]:
# Open the list of channels tuple in order to compute the user walk and the position
#with open("/dlabdata1/youtube_large/jouven/randomly_selected_channels_filtered.pkl", 'wb') as f:
#    pickle.dump(channels_tuple, f)
#f.close()

In [24]:
# Open the list of channels tuple in order to compute the user walk and the position
with open("/dlabdata1/youtube_large/jouven/randomly_selected_channels_filtered.pkl", 'rb') as f:
    channels_tuple = pickle.load(f)
f.close()

In [25]:
channels_tuple = random.sample(channels_tuple, 5000)

In [7]:
'''
Get the position of ref_channel relative to second_channel in terms of its nearest neighbors ranking.
PARAMETER:
    - ref_channel: The reference channel on which wwe compute it's k nearest neighbor
    - second_channel: The channel where we compute it's ranking relatively to ref_channel
    - dist: Euclidean distance between ref_channel and second_channel
    - index: annoy index
    - df_embedding: DataFrame representing the embedding space

RETURN: The position of second_channel relatively to ref_channel in terms of it's ranking

'''
def get_ranking_position_between_channels(ref_channel, second_channel, dist, index, df_embedding):
    #print('ref ', ref_channel)
    #print('second ', second_channel)
    #print('dist ', dist)
    
    # Number of nearest neighbor we are looking for
    
    
    # length of last iteration
    len_last_it = 0
    
    nearest_neighbors_index = index.get_nns_by_item(ref_channel, len(df_embedding), search_k = 100000000)
    dist_k_th_nearest = distance.euclidean(df_embedding.iloc[ref_channel], 
                                           df_embedding.iloc[nearest_neighbors_index[len(nearest_neighbors_index)-1]])
    for i in range(0, len(nearest_neighbors_index)):
        if nearest_neighbors_index[i] == second_channel:
            return i
    

In [8]:
'''
    Retrieve the array obtained by apllying the dimentinality reductin algorithm
    graph_matrix: SHAPE: (channels, n_comp)
    
    PARAMETERS:
        - df_embedding: DataFrame representing the graph in the embedding space
        - n_comp: number of components to use after the dimentionalit reduction
        
    RETURN: The annoy index
    '''

def get_annoy_index(df_embedding, n_comp):
    
    index = AnnoyIndex(n_comp, "euclidean")  # Length of item vector that will be indexed
    df_embedding.apply(lambda row: index.add_item(row.name, np.array(row)), axis = 1)
    index.build(100) # 100 trees
    return index

In [26]:
users_walk_tab = []
ranking_position_tab = []

len_random_set = 5000
len_embedding = len(channels_id)

path = '/dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/'
files = glob.glob(path + '*.npz')   
for file in files: 
    print('file ', file)
    search_n_comp = file.split("/")
    print(search_n_comp[6])
    n_comp = int(re.findall('[0-9]+', search_n_comp[6])[0])
    print('n_comp ', n_comp)
    df_embedding = get_dataframe_in_embedding_space(file)
    random_walk_distance = get_random_walk(df_embedding)
        
    index = get_annoy_index(df_embedding, n_comp)

    users_walk = 0
    ranking_position = 0

    for ref_channel, second_channel in channels_tuple:
        dist = distance.euclidean(df_embedding.iloc[ref_channel], df_embedding.iloc[second_channel])
        users_walk += dist
        ranking_position += get_ranking_position_between_channels(ref_channel, second_channel, dist, index, df_embedding)
    
    users_walk_tab.append(users_walk/random_walk_distance)
    ranking_position_tab.append(ranking_position/(len(channels_tuple)*len(channels_id)))


file  /dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/reduced_pca_100.npz
reduced_pca_100.npz
n_comp  100
file  /dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/reduced_pca_50.npz
reduced_pca_50.npz
n_comp  50
file  /dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/reduced_pca_200.npz
reduced_pca_200.npz
n_comp  200


In [20]:
df_embedding.shape[0]

155930

In [27]:
users_walk_tab

[0.884401161727789, 0.9240368724033946, 0.937289456175224]

In [28]:
ranking_position_tab

[0.39859200025652536, 0.39952344128775735, 0.39865621368562815]

In [None]:
'''
Plot the results obtained when computing the metrics
'''
import plotly.graph_objects as go

components_name = ['Features dim 50', 'Features dim 100', 'Features dim 200', 'Features dim 500']
colors = ['b', 'g,', 'r', 'c']

fig = go.Figure()

for i in range(len(components_name)):
    fig.add_trace(go.Scatter(
        x=ranking_position_tab,
        y=users_walk_tab,
        marker=dict(color=colors[i], size=12),
        mode="markers",
        name=components_name[i],
    ))

fig.update_layout(title="Gender Earnings Disparity",
                  xaxis_title="Ranking position",
                  yaxis_title="Users walk distance")

fig.show()