### Statistics on the number of comments per channel

In [1]:
import time
import pickle
import operator
import os
import sys

import zstandard as zstd
import pandas as pd
import numpy as np

from annoy import AnnoyIndex
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

scriptpath = "../../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

### Finding cultural axes in our channel embedding

In [2]:
# Lookup tables for the filtered channels, returned by the project helper module.
#   - dict_channel_ind: indexed later in this notebook by a YouTube channel id to obtain the
#     integer index of that channel (used as a row position in the embedding).
#   - dict_ind_channel: indexed later by such an integer index to get the channel id back.
#   - channels_id: not used in the visible part of this notebook (presumably the list of
#     channel ids -- TODO confirm against the helper).
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [3]:
# Channel metadata table (channelcrawler.csv). The channel id is taken to be whatever
# follows the last '/' of the channel link.
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")
channelcrawler['channel_id'] = channelcrawler['link'].str.rsplit('/', n=1).str[-1]
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,Mägo de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOMÁS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [4]:
def get_dataframe_in_embedding_space(file_path):
    '''
    Load the embedding produced by the dimensionality reduction step from a
    gzip-compressed CSV file.

    PARAMETER:
        - file_path: the path where the gzip-compressed CSV of the embedding is stored

    RETURN:
        - DataFrame holding the content of the file (presumably one row per channel and
          one column per component -- TODO confirm against the code that wrote the file)
    '''
    embedding_df = pd.read_csv(file_path, compression='gzip')
    return embedding_df

In [5]:
def get_annoy_index(df_embedding, n_trees=100, random_seed=None):
    '''
    Build an Annoy index (euclidean distance) over the rows of the channel embedding.

    Every row is stored under its DataFrame index label, so the Annoy item ids equal
    the row positions only when df_embedding carries the default 0..n-1 index.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding (one row per item,
          one column per vector component)
        - n_trees: number of trees built by Annoy (default 100, the value that used to be
          hard-coded)
        - random_seed: optional seed handed to Annoy before the items are added so that the
          approximate index is reproducible across runs; None keeps Annoy's default

    RETURN: The built annoy index
    '''
    index = AnnoyIndex(df_embedding.shape[1], "euclidean")  # Length of item vector that will be indexed
    if random_seed is not None:
        index.set_seed(random_seed)

    # Plain loop instead of DataFrame.apply: add_item is called only for its side effect,
    # so there is no point in letting pandas build a result Series full of None.
    for item_id, vector in zip(df_embedding.index, df_embedding.to_numpy()):
        index.add_item(item_id, vector)

    index.build(n_trees)
    return index

In [6]:
def get_annoy_index_from_array(array, other_point):
    '''
    Build a euclidean Annoy index (100 trees) over the vectors of `array` plus one extra point.

    PARAMETERS:
        - array: sequence of vectors; vector number i is stored under item id i
        - other_point: additional vector, stored under item id len(array)

    RETURN: The built annoy index
    '''
    dimension = len(array[0])  # Length of item vector that will be indexed
    annoy_index = AnnoyIndex(dimension, "euclidean")

    item_id = 0
    for vector in array:
        annoy_index.add_item(item_id, np.array(vector))
        item_id += 1

    # The extra point takes the first id that is not used by `array`.
    annoy_index.add_item(len(array), np.array(other_point))

    annoy_index.build(100)  # 100 trees
    return annoy_index

In [7]:
def create_pairs(channels_pairs, ref_channel, index, k):
    '''
    For the given ref_channel, compute its k nearest neighbors and append one
    (ref_channel, neighbor) pair per neighbor to channels_pairs.

    The reference channel itself is never paired with itself: a nearest neighbor query
    on an item returns that item among its own neighbors, which would otherwise produce
    a (ref_channel, ref_channel) pair whose difference vector is zero and leave only
    k-1 real neighbors.

    PARAMETERS:
        - channels_pairs: list of the pairs of channels already computed (extended in place)
        - ref_channel: the channel on which we compute the nearest neighbor search
        - index: the annoy index to do the k nearest neighbor search
        - k: the number of neighbors

    RETURN: None (channels_pairs is modified in place)
    '''
    # Ask for one extra result so that k neighbors remain once ref_channel is dropped.
    candidates = index.get_nns_by_item(ref_channel, k + 1)
    # The search is approximate, so ref_channel may be absent: filter, then cap at k.
    nearest_neighbors = [candidate for candidate in candidates if candidate != ref_channel][:k]
    channels_pairs.extend((ref_channel, neighbor_channel) for neighbor_channel in nearest_neighbors)

In [35]:
def channels_with_neighbors_pairs(df_embedding, k, n_comp, seed):
    '''
    Generate the list of all pairs (channel, neighbor) made of every channel and its
    k nearest neighbors, skipping the two channels of the seed pair as reference channels.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - k: the parameter of the nearest neighbor search
        - n_comp: the number of components after applying the dimensionality reduction
          (accepted but not used by this function)
        - seed: the seed pair; seed[0] and seed[1] are the indices of the two channels that
          are not used as reference channel
    RETURN:
        - list of channels tuple
    '''
    annoy_index = get_annoy_index(df_embedding)
    collected_pairs = []

    for ref_channel in range(len(df_embedding)):
        # The seed channels are never used as the reference of a pair.
        if ref_channel == seed[0] or ref_channel == seed[1]:
            continue
        create_pairs(collected_pairs, ref_channel, annoy_index, k)

    return collected_pairs

In [23]:
def compute_axis_vector_based_on_seed(path, k, seed, nb_selected_pairs):
    '''
    Creates the axis vector representing the desired cultural concept which is based on the seed pair.

    Every (channel, neighbor) pair is scored by the cosine similarity between its difference
    vector and the difference vector of the seed pair; the pairs, ranked from the highest to
    the lowest score, are then handed to cultural_concept_vector which builds the axis.

    PARAMETERS:
        - path: the path where the reduced matrix is saved
        - k: the number of nearest neighbors
        - seed: the seed pair representing the base of the axis (two row indices of the
          embedding); the seed difference is row seed[0] minus row seed[1]
        - nb_selected_pairs: number of selected pairs to create the axis
    RETURN:
        - The axis vector returned by cultural_concept_vector
    '''
    # DataFrame representing the embedding
    df_embedding = get_dataframe_in_embedding_space(path)

    n_comp = df_embedding.shape[1]

    channels_pairs = channels_with_neighbors_pairs(df_embedding, k, n_comp, seed)
    print(len(channels_pairs))

    # Work on one numpy matrix: indexing it with the pair indices computes all the
    # difference vectors at once instead of doing two DataFrame.iloc lookups per pair.
    embedding = df_embedding.to_numpy()
    vector_diff_seed = embedding[seed[0]] - embedding[seed[1]]

    if len(channels_pairs) == 0:
        # Nothing to rank (cosine_similarity rejects an empty input).
        return cultural_concept_vector(df_embedding, [], vector_diff_seed, nb_selected_pairs)

    pair_indices = np.asarray(channels_pairs)
    vector_diff_channels_pairs = embedding[pair_indices[:, 0]] - embedding[pair_indices[:, 1]]

    # compute cosine similarity score (one score per pair)
    similarity = cosine_similarity(vector_diff_channels_pairs, vector_diff_seed.reshape(1, -1)).ravel()

    # Stable descending order: pairs with equal scores keep their original relative order.
    ranking = np.argsort(-similarity, kind='stable')
    sorted_similarity_score = [channels_pairs[position] for position in ranking]

    return cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs)
    
    

In [24]:
def cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs):
    '''
    The first nb_selected_pairs-1 pairs of sorted_similarity_score are selected to end up with
    nb_selected_pairs pairs to create the axis (with the original seed pair).
    To create the axis, the difference vectors of all the selected pairs and of the seed pair are
    averaged together to obtain a single vector for the axis.

    The channelcrawler rows of both channels of every selected pair are printed; this relies on the
    notebook-level variables channelcrawler and dict_ind_channel.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - sorted_similarity_score: list of channel pairs ordered by their cosine similarity score
        - vector_diff_seed: vector difference between the seed pair
        - nb_selected_pairs: number of selected pairs to create the axis, seed pair included (>= 1)
    RETURN:
        - Vector for the axis that represents the desired cultural concept
    RAISES:
        - ValueError: if nb_selected_pairs is smaller than 1
    '''
    # With nb_selected_pairs == 0 the slice below would be [:-1], i.e. silently
    # select every pair but the last one instead of none.
    if nb_selected_pairs < 1:
        raise ValueError("nb_selected_pairs must be at least 1 (the seed pair is always included)")

    selected_channels_pairs = sorted_similarity_score[:nb_selected_pairs - 1]

    # Print the corresponding channels id
    for first_channel, second_channel in selected_channels_pairs:
        pair_channel_ids = [dict_ind_channel[first_channel], dict_ind_channel[second_channel]]
        print(channelcrawler[channelcrawler['channel_id'].isin(pair_channel_ids)])

    # Row lookups on the numpy matrix are positional, like the DataFrame.iloc lookups they replace.
    embedding = df_embedding.to_numpy()
    cultural_concept_vectors = [np.asarray(vector_diff_seed)]
    cultural_concept_vectors.extend(
        embedding[first_channel] - embedding[second_channel]
        for first_channel, second_channel in selected_channels_pairs
    )
    return np.array(cultural_concept_vectors).mean(axis=0)

#### Partisan axis: Democrats vs. Republicans

In [17]:
# First channel of the partisan seed pair.
# Note kept from the original cell: "global movement of one united voice to stand against
# Trump and his administration."
# Other candidate that was tried: channelcrawler[channelcrawler['name'] == 'The Democrats']
channelcrawler.loc[channelcrawler['name'].eq('A Blue Dot In Texas')]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
128076,News & Politics,2014-07-09,http://www.youtube.com/channel/UC9pXxdNqCc2zjg...,A Blue Dot In Texas,10600,646,UC9pXxdNqCc2zjgRXSoowNNg


In [12]:
# Second channel of the partisan seed pair.
channelcrawler.loc[channelcrawler['name'].eq('Donald J Trump')]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
151623,News & Politics,2015-03-17,http://www.youtube.com/channel/UCAql2DyGU2un1E...,Donald J Trump,189000,222,UCAql2DyGU2un1Ei2nMYsqOA


In [25]:
# Partisan axis. The seed pair is ('A Blue Dot In Texas', 'Donald J Trump'), given by channel id;
# the seed difference vector is the first channel minus the second one.
path = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/embedding0.csv.gz'
k = 10                  # neighbors requested per channel
nb_selected_pairs = 10  # pairs averaged into the axis, seed pair included
partisan_seed = tuple(
    dict_channel_ind[channel_id]
    for channel_id in ('UC9pXxdNqCc2zjgRXSoowNNg', 'UCAql2DyGU2un1Ei2nMYsqOA')
)

partisan_axis = compute_axis_vector_based_on_seed(path, k, partisan_seed, nb_selected_pairs)

1296150
                     category   join_date  \
10215   Nonprofits & Activism  2007-07-31   
137491        News & Politics  2007-01-22   

                                                     link  \
10215   http://www.youtube.com/channel/UCzp8QlVd_hDLfK...   
137491  http://www.youtube.com/channel/UC-XsHHcuGtyzQM...   

                           name  subscribers  videos                channel_id  
10215   The Heartland Instit...        19131    1260  UCzp8QlVd_hDLfK1LMLDu3dQ  
137491        Brexit Party MEPs        33500     392  UC-XsHHcuGtyzQMmuITIQCpg  
             category   join_date  \
128766  Entertainment  2014-07-10   
153624          Music  2010-06-08   

                                                     link             name  \
128766  http://www.youtube.com/channel/UC3NDCaHOWB6XyX...         ill phix   
153624  http://www.youtube.com/channel/UC_P2OEyemsRVFf...  philthyrichVEVO   

        subscribers  videos                channel_id  
128766        13050      1

#### Gender axis: men vs women

In [14]:
# Men-side channel of the gender seed pair. Channel description: "Here you will find everything
# about: Men's Fashion, Hairstyle, Fitness & Health, Personal Vlogs and Tech."
channelcrawler.loc[channelcrawler['name'].eq('Alex Costa')]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
70695,Howto & Style,2011-09-29,http://www.youtube.com/channel/UCZyCposXwcyopa...,Alex Costa,2120000,543,UCZyCposXwcyopaACep44maQ


In [15]:
# Women-side channel of the gender seed pair: vlog channel about make-up, hair transformation,
# ring selections, ...
channelcrawler.loc[channelcrawler['name'].eq('Shaaanxo')]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
38863,Howto & Style,2009-01-18,http://www.youtube.com/channel/UCMpOz2KEfkSdd5...,Shaaanxo,3215340,1422,UCMpOz2KEfkSdd5JeIJh_fxw


In [28]:
# Gender axis. The seed pair is ('Shaaanxo', 'Alex Costa'), given by channel id;
# the seed difference vector is the first channel minus the second one.
path = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/embedding0.csv.gz'
k = 10                  # neighbors requested per channel
nb_selected_pairs = 10  # pairs averaged into the axis, seed pair included
gender_seed = tuple(
    dict_channel_ind[channel_id]
    for channel_id in ('UCMpOz2KEfkSdd5JeIJh_fxw', 'UCZyCposXwcyopaACep44maQ')
)

gender_axis = compute_axis_vector_based_on_seed(path, k, gender_seed, nb_selected_pairs)

1296150
            category   join_date  \
16510      Education  2015-01-25   
70695  Howto & Style  2011-09-29   

                                                    link        name  \
16510  http://www.youtube.com/channel/UCOFCwvhDoUvYcf...     V Shred   
70695  http://www.youtube.com/channel/UCZyCposXwcyopa...  Alex Costa   

       subscribers  videos                channel_id  
16510      1330000     414  UCOFCwvhDoUvYcfpD7RJKQwA  
70695      2120000     543  UCZyCposXwcyopaACep44maQ  
             category   join_date  \
70695   Howto & Style  2011-09-29   
81811  People & Blogs  2011-11-14   

                                                    link         name  \
70695  http://www.youtube.com/channel/UCZyCposXwcyopa...   Alex Costa   
81811  http://www.youtube.com/channel/UCU7J3bxynfGRz-...  Rob Lipsett   

       subscribers  videos                channel_id  
70695      2120000     543  UCZyCposXwcyopaACep44maQ  
81811       442912     432  UCU7J3bxynfGRz--EFZ9-Dvw  
    

#### Age axis: kids vs. adults

In [30]:
# Kids-side channel of the age seed pair.
# NOTE(review): the description previously written here ("Kids Learning Tube educates kids
# through music and animation in a fun and unique approach to learning.") names a different
# channel than the one looked up below -- confirm which seed is intended.
# Other candidate that was tried: channelcrawler[channelcrawler['name'] == 'TED-Ed']
channelcrawler.loc[channelcrawler['name'].eq("Magpiepony")]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
7,Film and Animation,2013-12-17,http://www.youtube.com/channel/UCISF5OGuAtSLNF...,Magpiepony,736000,304,UCISF5OGuAtSLNF24TKTnXag


In [31]:
# Adult-side channel of the age seed pair.
channelcrawler.loc[channelcrawler['name'].eq('James Bond 007')]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
5561,Film and Animation,2005-11-27,http://www.youtube.com/channel/UCwTkM6CvIsYFaF...,James Bond 007,115000,407,UCwTkM6CvIsYFaFiMKIKCqHw


In [36]:
# Age axis. The seed pair is ('Magpiepony', 'James Bond 007'), given by channel id;
# the seed difference vector is the first channel minus the second one.
path = '/dlabdata1/youtube_large/jouven/word2vec_pytorch/run_channels_more_300/CONTEXT_True_20_SUBSAMPLING_False_0.0043/models/embedding0.csv.gz'
k = 10                  # neighbors requested per channel
nb_selected_pairs = 10  # pairs averaged into the axis, seed pair included
age_seed = tuple(
    dict_channel_ind[channel_id]
    for channel_id in ('UCISF5OGuAtSLNF24TKTnXag', 'UCwTkM6CvIsYFaFiMKIKCqHw')
)

age_axis = compute_axis_vector_based_on_seed(path, k, age_seed, nb_selected_pairs)

1296140
                 category   join_date  \
5561   Film and Animation  2005-11-27   
61312       Entertainment  2013-07-26   

                                                    link            name  \
5561   http://www.youtube.com/channel/UCwTkM6CvIsYFaF...  James Bond 007   
61312  http://www.youtube.com/channel/UCOmQRii-h3rpG4...    TheLastPrime   

       subscribers  videos                channel_id  
5561        115000     407  UCwTkM6CvIsYFaFiMKIKCqHw  
61312        55100     558  UCOmQRii-h3rpG4Ruiw07ENQ  
                 category   join_date  \
5561   Film and Animation  2005-11-27   
77057           Education  2012-02-21   

                                                    link            name  \
5561   http://www.youtube.com/channel/UCwTkM6CvIsYFaF...  James Bond 007   
77057  http://www.youtube.com/channel/UCTjNwImtwynnj7...     JayVinFoong   

       subscribers  videos                channel_id  
5561        115000     407  UCwTkM6CvIsYFaFiMKIKCqHw  
77057      