### Statistics on the number of comments per channel

In [2]:
import time
import pickle
import operator
import os
import sys

import zstandard as zstd
import pandas as pd
import numpy as np

from annoy import AnnoyIndex
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

scriptpath = "../../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers import *

### Finding cultural axis in our channel embedding

In [7]:
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [8]:
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")
channelcrawler['channel_id'] = channelcrawler['link'].str.split('/').str[-1]
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [9]:
'''
Retrieve the array obtained by apllying the dimentionality reduction algorithm
graph_matrix: SHAPE: (channels, n_comp)

PARAMETER:
    - file_path: the path where the embedding graph is stored

RETURN: 
    - df: DataFrame representing the graph in the embedding space
'''
def get_dataframe_in_embedding_space(file_path):
    graph_matrix = np.load(file_path)
    graph_matrix = graph_matrix['arr_0']
    df = pd.DataFrame(graph_matrix)
    df = df.rename(lambda x: 'dr'+str(x), axis='columns')
    return df

In [10]:
'''
    Retrieve the array obtained by apllying the dimentinality reductin algorithm
    graph_matrix: SHAPE: (channels, n_comp)
    
    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - n_comp: number of components to use after the dimentionalit reduction
        
    RETURN: The annoy index
    '''

def get_annoy_index(df_embedding):
    
    index = AnnoyIndex(df_embedding.shape[1], "euclidean")  # Length of item vector that will be indexed
    df_embedding.apply(lambda row: index.add_item(row.name, np.array(row)), axis = 1)
    index.build(100) # 100 trees
    return index

In [11]:
def get_annoy_index_from_array(array, other_point):
    index = AnnoyIndex(len(array[0]), "euclidean")  # Length of item vector that will be indexed
    for ind, val in enumerate(array):
        index.add_item(ind, np.array(val))
    index.add_item(len(array), np.array(other_point))
    index.build(100) # 100 trees
    return index

In [12]:
'''
For the given ref_channel, compute it's k neirest neighbor and create pairs of channels between the found channel and ref_channel
PARAMETERS:
    - channels_pairs: table representing the pair of channel already computed
    - ref_channel: the channel on which we compute the neirest neighbor search
    - index: the annoy index to do the k nearest neighbor search
    - k: the number of neighbors 

'''
def create_pairs(channels_pairs, ref_channel, index, k):
    nearest_neighbors = index.get_nns_by_item(ref_channel, k)
    for neighbor_channel in nearest_neighbors:
        channels_pairs.append((ref_channel, neighbor_channel))

In [13]:
'''
Generate the set of all pairs of channels with their k neirest neighbors
PARAMETERS:
    - df_embedding: DataFrame representing the channel embedding
    - k: the parameter of the nearest neighbor search
    - n_comp: the number of components after applying the dimensionality reduction
RETURN:
    - list of channels tuple 
'''
def channels_with_neighbors_pairs(df_embedding, k, n_comp):
    channels_pairs = []
    index = get_annoy_index(df_embedding)
    
    for channel in range(len(df_embedding)):
        create_pairs(channels_pairs, channel, index, k)
    return channels_pairs

In [14]:
'''
Creates the axis vector representing the desired cultural concept which is based on the seed pair.
PARAMETERS:
    - path: the path where the reducted matrix is saved
    - k: the number of neirest neighbor
    - seed: the seed pair representing the base of the axis
    - nb_selected_pairs: number of selected pairs to create the axis
RETURN:
    - All the channels pairs ranked by the cosine similarity metric (from higher to lower)
'''

def compute_axis_vector_based_on_seed(path, k, seed, nb_selected_pairs):
    
    # DataFrame representing the embedding
    df_embedding = get_dataframe_in_embedding_space(path)
    n_comp = df_embedding.shape[1]

    channels_pairs = channels_with_neighbors_pairs(df_embedding, k, n_comp)

    vector_diff_channels_pairs = np.array([np.array(df_embedding.iloc[first_vector]) - np.array(df_embedding.iloc[second_vector]) for first_vector, second_vector in channels_pairs])
    vector_diff_seed = (np.array(df_embedding.iloc[seed[0]]) - np.array(df_embedding.iloc[seed[1]]))
    
    # compute cosine similarity score
    similarity_ranked = cosine_similarity(vector_diff_channels_pairs, vector_diff_seed.reshape(1, -1))
    dict_channel_similarity = {}
    for ind in range(len(channels_pairs)):
        dict_channel_similarity[channels_pairs[ind]] = similarity_ranked[ind]
    sorted_similarity_score = sorted(dict_channel_similarity.keys(), key=dict_channel_similarity.get, reverse = True)
    
    return cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs)
    
    #index_diff = get_annoy_index_from_array(vector_diff_channels_pairs, vector_diff_seed)
    #nearest_neighbors = index_diff.get_nns_by_item(len(vector_diff_channels_pairs), nb_selected_pairs-1)
    # Print the corresponding channels id
    #for neighbor in nearest_neighbors:
    #    print(channelcrawler[channelcrawler['channel_id'].apply(lambda channel_id: channel_id in [dict_ind_channel[channels_pairs[neighbor][0]], dict_ind_channel[channels_pairs[neighbor][1]]])])
   
    
    
    

In [15]:
'''
The nb_selected_pairs-1 pairs are selected based on the cosine similarity score to end up with nb_pairs_selected 
pairs to create the axis (with the original seed pair).
To create the axis, the vector difference of all nb_pairs_selected are averaged together to obtain a single vector 
for the axis that robustly represents the desired cultural concept.
PARAMETERS:
    - df_embedding: DataFrame representing the channel embedding
    - sorted_similarity_score: list of channel pairs ordered by their cosine similarity score
    - vector_diff_seed: vector difference between the seed pair
    - nb_selected_pairs: number of selected pairs to create the axis
RETURN:
    - Vector for the axis that represents the desired cultural concept
'''

def cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs):
    cultural_concept_vectors = []
    
    cultural_concept_vectors.append(vector_diff_seed)
    
    selected_channels_pairs = np.array(sorted_similarity_score[:nb_selected_pairs-1])
    for channel_pair in selected_channels_pairs:
        print(dict_ind_channel[channel_pair[0]], dict_ind_channel[channel_pair[1]])
    # Print the corresponding channels id
    for pair in selected_channels_pairs:
        print(channelcrawler[channelcrawler['channel_id'].apply(lambda channel_id: channel_id in [dict_ind_channel[pair[0]], dict_ind_channel[pair[1]]])])
   
    for channel_pair in selected_channels_pairs:
        cultural_concept_vectors.append(np.array(df_embedding.iloc[channel_pair[0]]) - np.array(df_embedding.iloc[channel_pair[1]]))
    cultural_concept_vectors = np.array(cultural_concept_vectors)
    return cultural_concept_vectors.mean(axis = 0)

In [13]:
path = '/dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/reduced_pca_50.npz'
df_embedding = get_dataframe_in_embedding_space(path)
df_embedding.head()

Unnamed: 0,dr0,dr1,dr2,dr3,dr4,dr5,dr6,dr7,dr8,dr9,...,dr40,dr41,dr42,dr43,dr44,dr45,dr46,dr47,dr48,dr49
0,0.000248,-2.2e-05,-6.9e-05,-0.000161,5.3e-05,0.000173,5.1e-05,0.000171,1.6e-05,-2.2e-05,...,-6.5e-05,7.8e-05,-1.4e-05,-3.4e-05,4.4e-05,7e-06,4e-05,4.5e-05,-3.2e-05,-4.6e-05
1,0.000228,-1e-05,-8.7e-05,-0.000173,8.8e-05,0.000168,4.6e-05,0.000182,-2e-06,-3.3e-05,...,-8.1e-05,7.5e-05,-3e-05,-4.3e-05,2.4e-05,1.1e-05,3.4e-05,3.2e-05,-7.3e-05,-7.1e-05
2,0.00012,0.000152,-4.8e-05,-0.000176,-0.000109,0.000123,-0.000127,3e-06,1.9e-05,5.3e-05,...,-4.5e-05,0.000283,-3.6e-05,0.000192,2.2e-05,9e-06,0.000131,0.000259,2e-06,3.9e-05
3,0.00024,-7e-06,-6.1e-05,-0.000158,5.1e-05,0.000166,4.1e-05,0.000156,1.2e-05,-1.8e-05,...,-8.5e-05,8.5e-05,-1.1e-05,-1e-05,3.9e-05,4e-06,4.3e-05,5.3e-05,-4.6e-05,-4.1e-05
4,0.000227,-8e-06,-5e-05,-0.000182,3.3e-05,0.000144,-2e-06,0.000122,-8.4e-05,-4.8e-05,...,-7.7e-05,0.000107,-1.3e-05,-8e-06,2.9e-05,1.6e-05,7.3e-05,8.4e-05,-6.1e-05,-6.3e-05


#### Partisans axis: Democrats vs republican

In [16]:
#channelcrawler[channelcrawler['name'] == 'The Democrats']
# global movement of one united voice to stand against Trump and his administration.
channelcrawler[channelcrawler['name'] == 'Liberty Hangout']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
125768,Entertainment,2015-03-06,http://www.youtube.com/channel/UCQMb7c66tJ7Si8...,Liberty Hangout,313000,235,UCQMb7c66tJ7Si8IrWHOgAPg


In [18]:
channelcrawler[channelcrawler['name'] == 'Justice Democrats']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id


In [23]:
k = 10
path = '/dlabdata1/youtube_large/jouven/channel_embedding/channels_by_channels_normalized_comments_more_10k/reduced_fpca_200.npz'
partisan_seed = (dict_channel_ind['UC9pXxdNqCc2zjgRXSoowNNg'], dict_channel_ind['UCAql2DyGU2un1Ei2nMYsqOA'])
nb_selected_pairs = 10

partisan_axis = compute_axis_vector_based_on_seed(path, k, partisan_seed, nb_selected_pairs)

UCCWWa1QhqNh8BHapTj3cBWQ UCB63zbGTv8JgJGdYCfS4yRg
UC7dkMoe_i-xfdkf0uMIMsWA UC98Zwfvjq12M1oi99Yqd78w
UC62KZJ1mShIQ-14rjzVYt9A UC8fKk4GVLHTR13VkrMQH1Tw
UCDWmp9u3xBGwFR1iEIeEyOg UCAgBUlIpzlJaE0693J3s97w
UC2vX57S7VeTU7wkhwONr9rw UC4h08ypfrtFGdgu-OBwAUDQ
UCAyrKoW31y5UcsRjh2ItvxQ UCAHCehFYe02Ihviho8D_ZcQ
UCDHrwVzgl-vZ14wWnN1LVjQ UCBqKcE4Q6XbqKVB5guxHL4g
UC2Lqmch-uOFvwsvFzyYtYnA UCAyrKoW31y5UcsRjh2ItvxQ
UCFjOi1ZpZVErr8EYxg8t1dQ UCB1o7_gbFp2PLsamWxFenBg
               category   join_date  \
120314  News & Politics  2018-11-01   
153946   People & Blogs  2016-12-18   

                                                     link              name  \
120314  http://www.youtube.com/channel/UCCWWa1QhqNh8BH...    Chandler Crump   
153946  http://www.youtube.com/channel/UCB63zbGTv8JgJG...  KirksNewsNetwork   

        subscribers  videos                channel_id  
120314        33800      83  UCCWWa1QhqNh8BHapTj3cBWQ  
153946        12200     617  UCB63zbGTv8JgJGdYCfS4yRg  
              category   j

#### Gender axis: men vs women

In [29]:
# Here you will find everything about: Men's Fashion, Hairstyle, Fitness & Health, Personal Vlogs and Tech.
channelcrawler[channelcrawler['name'] == 'Alex Costa']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
70695,Howto & Style,2011-09-29,http://www.youtube.com/channel/UCZyCposXwcyopa...,Alex Costa,2120000,543,UCZyCposXwcyopaACep44maQ


In [30]:
# vlog channel about make up, hair transformation, rings selections, ...
channelcrawler[channelcrawler['name'] == 'Shaaanxo']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
38863,Howto & Style,2009-01-18,http://www.youtube.com/channel/UCMpOz2KEfkSdd5...,Shaaanxo,3215340,1422,UCMpOz2KEfkSdd5JeIJh_fxw


In [76]:
k = 30
path = '/dlabdata1/youtube_large/jouven/channel_embedding/channels_by_channels_normalized_comments_more_10k/reduced_fpca_200.npz'
seed = (dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'], dict_channel_ind['UCZyCposXwcyopaACep44maQ'])
nb_selected_pairs = 10

gender_axis = compute_axis_vector_based_on_seed(path, k, gender_seed, nb_selected_pairs)

UCMpOz2KEfkSdd5JeIJh_fxw UCPRlGA2w7C_DVw-1ynolJYw
UCMpOz2KEfkSdd5JeIJh_fxw UCW6MKQtz-vrqpC7D7qJ3Baw
UCMpOz2KEfkSdd5JeIJh_fxw UCM3P_G21gOSVdepXrEFojIg
UCMpOz2KEfkSdd5JeIJh_fxw UCPG6A5tNaPfv2SRNW2beq5Q
UCMpOz2KEfkSdd5JeIJh_fxw UCVKFs0cesQUEGtn8gQmhbHw
UCMpOz2KEfkSdd5JeIJh_fxw UCLhLyKIZYf4lCcilMujzXAA
UCMpOz2KEfkSdd5JeIJh_fxw UCLF42C7y73FKA8ye_5Nn-Kw
UCMpOz2KEfkSdd5JeIJh_fxw UCQWy33JxT07WWPdGBIuToPw
UCMpOz2KEfkSdd5JeIJh_fxw UCJyfryZQsoN_ttLFe6vYKmw
             category   join_date  \
38863   Howto & Style  2009-01-18   
38864  People & Blogs  2011-07-19   

                                                    link            name  \
38863  http://www.youtube.com/channel/UCMpOz2KEfkSdd5...        Shaaanxo   
38864  http://www.youtube.com/channel/UCPRlGA2w7C_DVw...  Shaaanxo Vlogs   

       subscribers  videos                channel_id  
38863      3215340    1422  UCMpOz2KEfkSdd5JeIJh_fxw  
38864       555112     742  UCPRlGA2w7C_DVw-1ynolJYw  
            category   join_date  \
38863  H

#### Age axis: kids vs adult

In [102]:
# Kids Learning Tube educates kids through music and animation in a fun and unique approach to learning.
channelcrawler[channelcrawler['name'] == 'thebrainscoop']
#channelcrawler[channelcrawler['name'] == 'TED-Ed']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
141525,Education,2012-12-17,http://www.youtube.com/channel/UCkyfHZ6bY2Tjqb...,thebrainscoop,513000,215,UCkyfHZ6bY2TjqbJhiH8Y2QQ


In [94]:
# Channel for talks
channelcrawler[channelcrawler['name'] == 'TED']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
1100,People & Blogs,2006-12-06,http://www.youtube.com/channel/UCAuUUnT6oDeKwE...,TED,14800000,3105,UCAuUUnT6oDeKwE6v1NGQxug


In [99]:
k = 10
age_seed = (dict_channel_ind['UCsooa4yRKGN_zEE8iknghZA'], dict_channel_ind['UCAuUUnT6oDeKwE6v1NGQxug'])
nb_selected_pairs = 10

age_axis = compute_axis_vector_based_on_seed(path, k, age_seed, nb_selected_pairs)

KeyError: 'UCsooa4yRKGN_zEE8iknghZA'