### Statistics on the number of comments per channel

In [1]:
import time
import pickle
import operator

import zstandard as zstd
import pandas as pd
import numpy as np

from helpers import *
from annoy import AnnoyIndex
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'helpers'

In [2]:
# Load the pickled `channels_idx_to_nb_comments` object from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("/dlabdata1/youtube_large/jouven/channels_idx_to_nb_comments.pkl",'rb') as f:
    channels_idx_to_nb_comments = pickle.load(f)
# No explicit f.close() is needed: the `with` block closes the file on exit.

In [3]:
# Replace the mapping by a list of (key, value) pairs ordered by ascending value
# (presumably channel index -> number of comments, per the variable name).
# Caution: the name is rebound from a mapping to a list, so this cell cannot be re-run as is.
channels_idx_to_nb_comments = sorted(
    channels_idx_to_nb_comments.items(),
    key=lambda idx_and_count: idx_and_count[1],
)

In [4]:
# Total of the second element of the first 50000 entries of the sorted list,
# i.e. the 50000 entries with the smallest values since the sort is ascending.
comments = sum(channels_idx_to_nb_comments[i][1] for i in range(50000))

In [17]:
# Display the total accumulated over the first 50000 entries of the sorted list.
comments

83797609

### Finding cultural axes in our channel embedding

In [5]:
# `channels` is not defined in this notebook; presumably it comes from the star import
# of `helpers` — TODO confirm.
# NOTE(review): the names suggest a channel-id -> index mapping, the reverse
# index -> channel-id mapping and the collection of channel ids; verify against `helpers`.
dict_channel_ind, dict_ind_channel, channels_id = channels()

In [6]:
# Load the channelcrawler table and derive `channel_id` as the last '/'-separated
# segment of each channel link.
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv").assign(
    channel_id=lambda frame: frame['link'].str.split('/').str[-1]
)
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [7]:
def get_dataframe_in_embedding_space(file_path):
    '''
    Retrieve the array obtained by applying the dimensionality reduction algorithm
    and wrap it in a DataFrame.
    graph_matrix: SHAPE: (channels, n_comp)

    PARAMETER:
        - file_path: the path of the .npz archive where the embedding graph is stored;
                     the array is read from the archive key 'arr_0'

    RETURN:
        - df: DataFrame representing the graph in the embedding space, with columns
              named 'dr0', 'dr1', ... (one per component)
    '''
    # np.load on an .npz archive returns an NpzFile that keeps the underlying file
    # open; the context manager closes it deterministically once the array is read
    # instead of relying on garbage collection.
    with np.load(file_path) as archive:
        graph_matrix = archive['arr_0']

    df = pd.DataFrame(graph_matrix)
    return df.rename(columns=lambda component: 'dr' + str(component))

In [8]:
def get_annoy_index(df_embedding, n_comp):
    '''
    Build an Annoy index (euclidean metric) over the rows of the channel embedding.
    Each row is inserted under its DataFrame index label.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - n_comp: number of components to use after the dimensionality reduction,
                  i.e. the length of the item vectors that will be indexed

    RETURN: The annoy index, built with 100 trees
    '''
    annoy_index = AnnoyIndex(n_comp, "euclidean")

    # One item per row; the row label is used as the item identifier.
    for row_label, row_values in df_embedding.iterrows():
        annoy_index.add_item(row_label, np.array(row_values))

    annoy_index.build(100)  # 100 trees
    return annoy_index

In [9]:
def create_pairs(channels_pairs, ref_channel, index, k):
    '''
    For the given ref_channel, compute its k nearest neighbors and append one
    (ref_channel, neighbor) pair per neighbor to channels_pairs.

    The nearest neighbor search normally returns ref_channel itself as its own
    closest item. Such a (ref_channel, ref_channel) pair is skipped: its difference
    vector is the zero vector, which carries no direction to compare with a seed.

    PARAMETERS:
        - channels_pairs: list of the pairs of channels already computed (modified in place)
        - ref_channel: the channel on which we compute the nearest neighbor search
        - index: the annoy index to do the k nearest neighbor search
        - k: the number of neighbors requested from the index

    RETURN: None (the pairs are appended to channels_pairs)
    '''
    nearest_neighbors = index.get_nns_by_item(ref_channel, k)
    channels_pairs.extend(
        (ref_channel, neighbor_channel)
        for neighbor_channel in nearest_neighbors
        if neighbor_channel != ref_channel
    )

In [10]:
def channels_with_neighbors_pairs(df_embedding, k, n_comp):
    '''
    Generate all the pairs made of a channel and each of its k nearest neighbors.
    Channels are visited by position, from 0 to len(df_embedding) - 1.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - k: the parameter of the nearest neighbor search
        - n_comp: the number of components after applying the dimensionality reduction
    RETURN:
        - list of channel tuples
    '''
    annoy_index = get_annoy_index(df_embedding, n_comp)
    collected_pairs = []
    nb_channels = len(df_embedding)

    for ref_channel in range(nb_channels):
        # create_pairs appends to collected_pairs in place
        create_pairs(collected_pairs, ref_channel, annoy_index, k)

    return collected_pairs

In [17]:
def compute_axis_vector_based_on_seed(path, k, seed, nb_selected_pairs):
    '''
    Creates the axis vector representing the desired cultural concept which is based on the seed pair.

    Every (channel, neighbor) pair is scored by the cosine similarity between its
    difference vector and the difference vector of the seed pair. The pairs ranked
    from higher to lower similarity are then handed to cultural_concept_vector.
    The seed pair itself is left out of the ranking so that it is not averaged twice.

    PARAMETERS:
        - path: the path where the reduced matrix is saved
        - k: the number of nearest neighbors
        - seed: the seed pair (positions of two channels in the embedding) representing the base of the axis
        - nb_selected_pairs: number of selected pairs to create the axis
    RETURN:
        - The value returned by cultural_concept_vector, i.e. the vector of the axis
    '''
    # DataFrame representing the embedding
    df_embedding = get_dataframe_in_embedding_space(path)
    n_comp = df_embedding.shape[1]
    # Plain array view of the embedding: positional indexing, same rows as .iloc
    embedding = np.array(df_embedding)

    channels_pairs = channels_with_neighbors_pairs(df_embedding, k, n_comp)
    vector_diff_seed = embedding[seed[0]] - embedding[seed[1]]

    if len(channels_pairs) == 0:
        # Nothing to rank: the axis is built from the seed pair alone
        sorted_similarity_score = []
    else:
        # Vectorized difference of all pairs at once instead of one .iloc lookup per pair
        pair_positions = np.asarray(channels_pairs, dtype=int).reshape(-1, 2)
        vector_diff_channels_pairs = embedding[pair_positions[:, 0]] - embedding[pair_positions[:, 1]]

        # compute cosine similarity score (one column, one row per pair)
        similarity = cosine_similarity(vector_diff_channels_pairs, vector_diff_seed.reshape(1, -1))
        scores = np.asarray(similarity).ravel().tolist()
        dict_channel_similarity = dict(zip(channels_pairs, scores))

        seed_pair = (seed[0], seed[1])
        sorted_similarity_score = sorted(
            (pair for pair in dict_channel_similarity if tuple(pair) != seed_pair),
            key=dict_channel_similarity.get,
            reverse=True,
        )

    return cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs)
    
    

In [12]:
def cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs):
    '''
    The nb_selected_pairs-1 first pairs of sorted_similarity_score are selected to end up
    with nb_selected_pairs pairs to create the axis (with the original seed pair).
    To create the axis, the vector differences of all the selected pairs are averaged together
    to obtain a single vector for the axis that robustly represents the desired cultural concept.

    As a side effect, the channelcrawler rows of the two channels of every selected pair
    are printed (this reads the notebook-level `channelcrawler` and `dict_ind_channel`).

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - sorted_similarity_score: list of channel pairs ordered by their cosine similarity score
        - vector_diff_seed: vector difference between the seed pair
        - nb_selected_pairs: number of selected pairs to create the axis; values below 1
                             select no additional pair, so the seed difference is returned
    RETURN:
        - Vector for the axis that represents the desired cultural concept
    '''
    # Clamp at 0: a negative slice bound would select "all pairs but the last ones"
    # instead of no pair when nb_selected_pairs is 0.
    nb_additional_pairs = max(nb_selected_pairs - 1, 0)
    selected_channels_pairs = np.array(
        sorted_similarity_score[:nb_additional_pairs], dtype=int
    ).reshape(-1, 2)

    # Print the corresponding channels id
    for first_channel, second_channel in selected_channels_pairs:
        pair_channel_ids = [dict_ind_channel[int(first_channel)], dict_ind_channel[int(second_channel)]]
        print(channelcrawler[channelcrawler['channel_id'].isin(pair_channel_ids)])

    # Difference vectors of the selected pairs, computed in one vectorized step
    embedding = np.array(df_embedding)
    pairs_diff = embedding[selected_channels_pairs[:, 0]] - embedding[selected_channels_pairs[:, 1]]

    # The seed difference comes first, followed by one row per selected pair
    cultural_concept_vectors = np.vstack([np.asarray(vector_diff_seed).reshape(1, -1), pairs_diff])
    return cultural_concept_vectors.mean(axis = 0)

In [13]:
# Load the reduced channel embedding once for inspection; `path` is reused by the axis cells below.
# The preview below shows columns dr0 ... dr49, i.e. 50 components.
path = '/dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/reduced_pca_50.npz'
df_embedding = get_dataframe_in_embedding_space(path)
df_embedding.head()

Unnamed: 0,dr0,dr1,dr2,dr3,dr4,dr5,dr6,dr7,dr8,dr9,...,dr40,dr41,dr42,dr43,dr44,dr45,dr46,dr47,dr48,dr49
0,0.000248,-2.2e-05,-6.9e-05,-0.000161,5.3e-05,0.000173,5.1e-05,0.000171,1.6e-05,-2.2e-05,...,-6.5e-05,7.8e-05,-1.4e-05,-3.4e-05,4.4e-05,7e-06,4e-05,4.5e-05,-3.2e-05,-4.6e-05
1,0.000228,-1e-05,-8.7e-05,-0.000173,8.8e-05,0.000168,4.6e-05,0.000182,-2e-06,-3.3e-05,...,-8.1e-05,7.5e-05,-3e-05,-4.3e-05,2.4e-05,1.1e-05,3.4e-05,3.2e-05,-7.3e-05,-7.1e-05
2,0.00012,0.000152,-4.8e-05,-0.000176,-0.000109,0.000123,-0.000127,3e-06,1.9e-05,5.3e-05,...,-4.5e-05,0.000283,-3.6e-05,0.000192,2.2e-05,9e-06,0.000131,0.000259,2e-06,3.9e-05
3,0.00024,-7e-06,-6.1e-05,-0.000158,5.1e-05,0.000166,4.1e-05,0.000156,1.2e-05,-1.8e-05,...,-8.5e-05,8.5e-05,-1.1e-05,-1e-05,3.9e-05,4e-06,4.3e-05,5.3e-05,-4.6e-05,-4.1e-05
4,0.000227,-8e-06,-5e-05,-0.000182,3.3e-05,0.000144,-2e-06,0.000122,-8.4e-05,-4.8e-05,...,-7.7e-05,0.000107,-1.3e-05,-8e-06,2.9e-05,1.6e-05,7.3e-05,8.4e-05,-6.1e-05,-6.3e-05


#### Partisan axis: Democrats vs. Republicans

In [4]:
# Rows of the channelcrawler table whose name is exactly 'The Democrats'.
channelcrawler.loc[channelcrawler['name'] == 'The Democrats']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
5362,News & Politics,2006-07-27,http://www.youtube.com/channel/UClkO4MArT2WKWj...,The Democrats,12600,510,UClkO4MArT2WKWj32YDD_-Ew


In [5]:
# Rows of the channelcrawler table whose name is exactly 'Donald J Trump'.
channelcrawler.loc[channelcrawler['name'] == 'Donald J Trump']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
151623,News & Politics,2015-03-17,http://www.youtube.com/channel/UCAql2DyGU2un1E...,Donald J Trump,189000,222,UCAql2DyGU2un1Ei2nMYsqOA


In [77]:
# Index of 'The Democrats' channel in the embedding.
# `channel_dict` is never defined in this notebook (NameError on a fresh kernel);
# `dict_channel_ind` is the channel-id lookup the seed cells use for this same id
# — TODO confirm both mappings are the same.
dict_channel_ind['UClkO4MArT2WKWj32YDD_-Ew']

120967

In [78]:
# Index of the 'Donald J Trump' channel in the embedding.
# `channel_dict` is never defined in this notebook (NameError on a fresh kernel);
# `dict_channel_ind` is the channel-id lookup the seed cells use for this same id
# — TODO confirm both mappings are the same.
dict_channel_ind['UCAql2DyGU2un1Ei2nMYsqOA']

28823

In [100]:
# Metadata of the channel stored at index 0 of dict_ind_channel.
channelcrawler.loc[channelcrawler['channel_id'] == dict_ind_channel[0]]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
16427,Travel & Events,2015-11-20,http://www.youtube.com/channel/UC--24Q3_ZQeFmg...,Winded Voyage Sailin...,17308,192,UC--24Q3_ZQeFmgJE-Um5QZQ


In [101]:
# Metadata of the channel stored at index 21121 of dict_ind_channel.
channelcrawler.loc[channelcrawler['channel_id'] == dict_ind_channel[21121]]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
104724,People & Blogs,2016-09-13,http://www.youtube.com/channel/UC7gFnfwpcsAuro...,Stephi's Vlog,14238,184,UC7gFnfwpcsAuroRwPt2rADQ


In [102]:
# Metadata of the channel stored at index 4529 of dict_ind_channel.
channelcrawler.loc[channelcrawler['channel_id'] == dict_ind_channel[4529]]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
153384,Howto & Style,2014-01-27,http://www.youtube.com/channel/UC0r9ECg_FPUM0Y...,First Lite Hunting A...,16500,154,UC0r9ECg_FPUM0Yj-TQgQX0Q


In [103]:
# Metadata of the channel stored at index 37339 of dict_ind_channel.
channelcrawler.loc[channelcrawler['channel_id'] == dict_ind_channel[37339]]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
126863,Education,2008-12-10,http://www.youtube.com/channel/UCEMS7GT52N5Kgx...,Maryland School of S...,31100,40,UCEMS7GT52N5KgxHdPUVv-Ag


In [128]:
# Partisan axis: the seed pair is ('The Democrats', 'Donald J Trump'), identified by channel id.
k, nb_selected_pairs = 10, 10
partisan_seed = tuple(
    dict_channel_ind[channel_id]
    for channel_id in ('UClkO4MArT2WKWj32YDD_-Ew', 'UCAql2DyGU2un1Ei2nMYsqOA')
)

partisan_axis = compute_axis_vector_based_on_seed(path, k, partisan_seed, nb_selected_pairs)

             category   join_date  \
118464  Entertainment  2014-03-24   
140606         Comedy  2017-07-15   

                                                     link             name  \
118464  http://www.youtube.com/channel/UCHQ0GRAK3w_s8D...    Sin Ful The P   
140606  http://www.youtube.com/channel/UCBue71h1ewSx3v...  Monty Woodgrain   

        subscribers  videos                channel_id  
118464        43200     332  UCHQ0GRAK3w_s8Dm7WRzbT8g  
140606        17900     673  UCBue71h1ewSx3vLeuBnjvCw  
             category   join_date  \
33414  People & Blogs  2009-07-31   
95515       Education  2016-07-07   

                                                    link              name  \
33414  http://www.youtube.com/channel/UC4Ya1ej073TMCV...          Cerebral   
95515  http://www.youtube.com/channel/UC5ksavsuuri0tJ...  The Greatest Cyn   

       subscribers  videos                channel_id  
33414        10300     122  UC4Ya1ej073TMCVgRUo_G_6A  
95515        28800     138  

#### Gender axis: men vs women

In [13]:
# Alex Costa: channel about men's fashion, hairstyle, fitness & health, personal vlogs and tech.
channelcrawler.loc[channelcrawler['name'] == 'Alex Costa']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
70695,Howto & Style,2011-09-29,http://www.youtube.com/channel/UCZyCposXwcyopa...,Alex Costa,2120000,543,UCZyCposXwcyopaACep44maQ


In [14]:
# Shaaanxo: vlog channel about make-up, hair transformations, ring selections, ...
channelcrawler.loc[channelcrawler['name'] == 'Shaaanxo']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
38863,Howto & Style,2009-01-18,http://www.youtube.com/channel/UCMpOz2KEfkSdd5...,Shaaanxo,3215340,1422,UCMpOz2KEfkSdd5JeIJh_fxw


In [18]:
# Gender axis: the seed pair is ('Alex Costa', 'Shaaanxo'), identified by channel id.
k, nb_selected_pairs = 10, 10
path = '/dlabdata1/youtube_large/jouven/channel_embedding/limited_normalized_50/reduced_pca_50.npz'
gender_seed = tuple(
    dict_channel_ind[channel_id]
    for channel_id in ('UCZyCposXwcyopaACep44maQ', 'UCMpOz2KEfkSdd5JeIJh_fxw')
)

gender_axis = compute_axis_vector_based_on_seed(path, k, gender_seed, nb_selected_pairs)

             category   join_date  \
64049   Howto & Style  2009-08-22   
121125  Howto & Style  2012-11-10   

                                                     link            name  \
64049   http://www.youtube.com/channel/UCOWNJMtHeyhwsO...  Izzy Parkhurst   
121125  http://www.youtube.com/channel/UCPFvgBqzykc8Zk...     Emily Grace   

        subscribers  videos                channel_id  
64049        112000     219  UCOWNJMtHeyhwsOSRHcahDCA  
121125       124000     312  UCPFvgBqzykc8ZkrVJFWN6UA  
             category   join_date  \
11633   Howto & Style  2008-05-13   
105135  Howto & Style  2010-08-11   

                                                     link            name  \
11633   http://www.youtube.com/channel/UCDn2NrHBB4iAKf...       ulovemegz   
105135  http://www.youtube.com/channel/UCIVM3-CW8t3DWD...  Jennie Jenkins   

        subscribers  videos                channel_id  
11633        511000     806  UCDn2NrHBB4iAKf2jMPqsClA  
105135       655000     724  UCI

#### Age axis: kids vs. adults

In [23]:
# Kids Learning Tube: educates kids through music and animation, with a fun and unique approach to learning.
channelcrawler.loc[channelcrawler['name'] == 'Kids Learning Tube']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
83811,Education,2015-01-26,http://www.youtube.com/channel/UC7EFWpvc1wYuUw...,Kids Learning Tube,539000,278,UC7EFWpvc1wYuUwrtZ_BLi9A


In [26]:
# The Trading Channel: channel for learning how to trade.
channelcrawler.loc[channelcrawler['name'] == 'The Trading Channel']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
45804,Education,2014-04-24,http://www.youtube.com/channel/UCGL9ubdGcvZh_d...,The Trading Channel,217872,197,UCGL9ubdGcvZh_dvSV2z1hoQ


In [27]:
# Age axis: the seed pair is ('Kids Learning Tube', 'The Trading Channel'), identified by channel id.
k, nb_selected_pairs = 10, 10
age_seed = tuple(
    dict_channel_ind[channel_id]
    for channel_id in ('UC7EFWpvc1wYuUwrtZ_BLi9A', 'UCGL9ubdGcvZh_dvSV2z1hoQ')
)

age_axis = compute_axis_vector_based_on_seed(path, k, age_seed, nb_selected_pairs)

             category   join_date  \
83811       Education  2015-01-26   
132201  Entertainment  2017-04-23   

                                                     link                name  \
83811   http://www.youtube.com/channel/UC7EFWpvc1wYuUw...  Kids Learning Tube   
132201  http://www.youtube.com/channel/UC0wK7BoBSLLTi0...            Editator   

        subscribers  videos                channel_id  
83811        539000     278  UC7EFWpvc1wYuUwrtZ_BLi9A  
132201        94600      15  UC0wK7BoBSLLTi0oWunrXRug  
            category   join_date  \
14081  Entertainment  2017-05-10   
83811      Education  2015-01-26   

                                                    link                name  \
14081  http://www.youtube.com/channel/UC7CyQGwsqZC2_j...      Riddle Channel   
83811  http://www.youtube.com/channel/UC7EFWpvc1wYuUw...  Kids Learning Tube   

       subscribers  videos                channel_id  
14081       183000      25  UC7CyQGwsqZC2_jXEGVECEdg  
83811       5390