### Statistics on the number of comments per channel

In [2]:
import time
import pickle
import operator
import sys
import os

import zstandard as zstd
import pandas as pd
import numpy as np

from annoy import AnnoyIndex
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

scriptpath = "../../"
sys.path.append(os.path.abspath(scriptpath))
from helpers.helpers_channels_more_300 import *

### Finding cultural axes in our channel embedding

In [3]:
# Load the selected channels and the id <-> index mappings from the project helpers:
#   - dict_channel_ind: channel id -> index (used below to build the seed pairs)
#   - dict_ind_channel: index -> channel id (used when printing the selected pairs)
#   - channels_id: collection of the selected channel ids (used to filter channelcrawler)
dict_channel_ind, dict_ind_channel, channels_id = filtered_channels_index_id_mapping()

In [11]:
# Channel metadata (category, join date, link, name, subscribers, videos)
channelcrawler = pd.read_csv("/dlabdata1/youtube_large/channelcrawler.csv")
# The channel id is the last path component of the channel link
channelcrawler['channel_id'] = channelcrawler['link'].str.split('/').str[-1]
# Keep only the selected channels. `isin` performs the membership test in one
# vectorised pass instead of calling a Python lambda for every row.
channelcrawler = channelcrawler[channelcrawler['channel_id'].isin(channels_id)]
channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
5,Comedy,2017-01-15,http://www.youtube.com/channel/UCxJWPpPED-J24z...,Magog of Morskar,29400,158,UCxJWPpPED-J24znoKyKZYjg
6,Gaming,2017-05-06,http://www.youtube.com/channel/UCNNLaOkE-rcthx...,magoogala gaming,113000,106,UCNNLaOkE-rcthxNssSHET2A
7,Film and Animation,2013-12-17,http://www.youtube.com/channel/UCISF5OGuAtSLNF...,Magpiepony,736000,304,UCISF5OGuAtSLNF24TKTnXag


In [19]:
def get_dataframe_in_embedding_space(
    file_path,
    index_ordering_path='/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_20/channel_embeddingindex_ordering.npy',
):
    '''
    Load the array obtained by applying the dimensionality reduction algorithm
    and return it as a DataFrame indexed by channel index.
    graph_matrix: SHAPE: (channels, n_comp)

    PARAMETERS:
        - file_path: the path where the embedding graph is stored (.npy)
        - index_ordering_path: the path of the .npy array giving, for each row
          of the embedding matrix, the channel index it belongs to. Defaults to
          the ordering file stored next to the channel embedding.

    RETURN:
        - df: DataFrame representing the graph in the embedding space, with
          columns 'dr0', 'dr1', ... and rows sorted by (and indexed on) 'index'
    '''
    graph_matrix = np.load(file_path)
    index_ordering = np.load(index_ordering_path)

    # Columns are named after their position in the embedding: dr0, dr1, ...
    df = pd.DataFrame(graph_matrix).add_prefix('dr')
    # pandas raises a ValueError here if the ordering and the matrix disagree in length
    df['index'] = index_ordering
    df = df.sort_values(by=['index'])
    return df.set_index('index')

In [5]:
def get_annoy_index(df_embedding, n_comp, n_trees=100, random_seed=None):
    '''
    Build an Annoy index (euclidean metric) over the rows of the channel embedding.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding; each row
          is one channel and the row label is used as the Annoy item id
        - n_comp: number of components of each embedding vector
        - n_trees: number of trees built by Annoy (default: 100)
        - random_seed: optional seed for Annoy's random number generator, to
          make the built index reproducible (default: None, no seed is set)

    RETURN: The annoy index
    '''
    index = AnnoyIndex(n_comp, "euclidean")  # Length of item vector that will be indexed

    # The seed only influences tree construction, so it is set before adding items
    if random_seed is not None:
        index.set_seed(random_seed)

    # Explicit loop: items are added for their side effect only, so there is
    # no reason to go through DataFrame.apply and build a throw-away result
    for item_id, vector in zip(df_embedding.index, df_embedding.to_numpy()):
        index.add_item(int(item_id), vector)

    index.build(n_trees)
    return index

In [6]:
def create_pairs(channels_pairs, ref_channel, index, k):
    '''
    For the given ref_channel, compute its k nearest neighbors and append the
    pairs (ref_channel, neighbor) to channels_pairs (modified in place).

    PARAMETERS:
        - channels_pairs: list of the channel pairs already computed
        - ref_channel: the channel on which we run the nearest neighbor search
        - index: the annoy index used for the k nearest neighbor search
        - k: the number of neighbors
    '''
    # Annoy counts the query item among its own nearest neighbors. Ask for one
    # extra result and drop ref_channel, so that we get k real neighbors and
    # never emit a (ref_channel, ref_channel) pair with a zero difference vector.
    candidates = index.get_nns_by_item(ref_channel, k + 1)
    neighbors = [candidate for candidate in candidates if candidate != ref_channel][:k]

    channels_pairs.extend((ref_channel, neighbor_channel) for neighbor_channel in neighbors)

In [7]:
def channels_with_neighbors_pairs(df_embedding, k, n_comp, seed):
    '''
    Build the list of all (channel, neighbor) pairs obtained by running a
    k nearest neighbor search from every channel except the two seed channels.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - k: the parameter of the nearest neighbor search
        - n_comp: the number of components after applying the dimensionality reduction
        - seed: the seed pair; its two channels are skipped as reference channels
    RETURN:
        - list of channel tuples (reference channel, neighbor channel)

    NOTE(review): reference channels are enumerated as 0..len(df_embedding)-1,
    which presumably matches the item ids stored in the annoy index - confirm.
    '''
    annoy_index = get_annoy_index(df_embedding, n_comp)
    pairs = []

    for position in range(len(df_embedding)):
        # The seed channels never act as the reference of a candidate pair
        if position == seed[0] or position == seed[1]:
            continue
        create_pairs(pairs, position, annoy_index, k)

    return pairs

In [8]:
def compute_axis_vector_based_on_seed(path, k, seed, nb_selected_pairs):
    '''
    Creates the axis vector representing the desired cultural concept which is based on the seed pair.

    PARAMETERS:
        - path: the path where the reduced matrix is saved
        - k: the number of nearest neighbors
        - seed: the seed pair (row positions in the embedding) representing the base of the axis
        - nb_selected_pairs: number of selected pairs to create the axis
    RETURN:
        - The axis vector returned by cultural_concept_vector, built from the
          seed pair and the candidate pairs ranked by cosine similarity
          (from higher to lower) with the seed difference vector
    '''
    # DataFrame representing the embedding
    df_embedding = get_dataframe_in_embedding_space(path)
    n_comp = df_embedding.shape[1]

    channels_pairs = channels_with_neighbors_pairs(df_embedding, k, n_comp, seed)

    embedding = df_embedding.to_numpy()
    vector_diff_seed = embedding[seed[0]] - embedding[seed[1]]

    if channels_pairs:
        # Gather both sides of every pair in one shot instead of one .iloc
        # lookup per pair; the subtraction is done in place on the gathered
        # copy to avoid a third pairs-sized temporary.
        pair_positions = np.asarray(channels_pairs)
        vector_diff_channels_pairs = embedding[pair_positions[:, 0]]
        vector_diff_channels_pairs -= embedding[pair_positions[:, 1]]

        # compute cosine similarity score of every pair against the seed pair
        similarity = cosine_similarity(vector_diff_channels_pairs, vector_diff_seed.reshape(1, -1)).ravel()

        # Plain floats as values, so the ranking compares scalars
        dict_channel_similarity = dict(zip(channels_pairs, similarity.tolist()))
        sorted_similarity_score = sorted(dict_channel_similarity, key=dict_channel_similarity.get, reverse=True)
    else:
        # No candidate pair (e.g. k == 0): the axis is built from the seed pair alone
        sorted_similarity_score = []

    return cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs)
    
    

In [9]:
def cultural_concept_vector(df_embedding, sorted_similarity_score, vector_diff_seed, nb_selected_pairs):
    '''
    The nb_selected_pairs-1 best pairs are selected based on the cosine similarity score to end up with
    nb_selected_pairs pairs to create the axis (with the original seed pair).
    To create the axis, the difference vectors of all selected pairs are averaged together to obtain a
    single vector for the axis that robustly represents the desired cultural concept.

    The channelcrawler rows of each selected pair are printed as a side effect.

    PARAMETERS:
        - df_embedding: DataFrame representing the channel embedding
        - sorted_similarity_score: list of channel pairs ordered by their cosine similarity score
        - vector_diff_seed: vector difference between the seed pair
        - nb_selected_pairs: number of selected pairs to create the axis
    RETURN:
        - Vector for the axis that represents the desired cultural concept
    '''
    # Clamp at 0: with nb_selected_pairs <= 0 a negative stop would slice from
    # the end and select almost every pair instead of none
    nb_extra_pairs = max(nb_selected_pairs - 1, 0)
    selected_channels_pairs = list(sorted_similarity_score[:nb_extra_pairs])

    # Print the corresponding channels id
    for first_channel, second_channel in selected_channels_pairs:
        pair_channel_ids = [dict_ind_channel[first_channel], dict_ind_channel[second_channel]]
        print(channelcrawler[channelcrawler['channel_id'].isin(pair_channel_ids)])

    embedding = df_embedding.to_numpy()
    # The seed pair always contributes; the selected pairs are added after it
    cultural_concept_vectors = [vector_diff_seed]
    cultural_concept_vectors.extend(
        embedding[first_channel] - embedding[second_channel]
        for first_channel, second_channel in selected_channels_pairs
    )
    return np.array(cultural_concept_vectors).mean(axis=0)

#### Partisan axis: Democrats vs Republicans

In [12]:
# Look up the 'The Democrats' channel; its channel_id is the first element of partisan_seed.
# global movement of one united voice to stand against Trump and his administration.
# NOTE(review): the description above presumably refers to this channel - confirm.
channelcrawler[channelcrawler['name'] == 'The Democrats']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
5362,News & Politics,2006-07-27,http://www.youtube.com/channel/UClkO4MArT2WKWj...,The Democrats,12600,510,UClkO4MArT2WKWj32YDD_-Ew


In [17]:
# Look up a channel directly by its id. In the recorded run this returned no rows,
# i.e. the id is absent from the filtered channelcrawler table.
channelcrawler[channelcrawler['channel_id'] == 'UCLRYsOHrkk5qcIhtq033bLQ']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id


In [14]:
# Look up the 'Donald J Trump' channel; its channel_id is the second element of partisan_seed.
channelcrawler[channelcrawler['name'] == 'Donald J Trump']
# Alternative lookup left disabled:
#channelcrawler[channelcrawler['name'] == 'Liberal Democrats']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
151623,News & Politics,2015-03-17,http://www.youtube.com/channel/UCAql2DyGU2un1E...,Donald J Trump,189000,222,UCAql2DyGU2un1Ei2nMYsqOA


In [20]:
# Number of nearest neighbours queried per channel when generating candidate pairs
k = 30
# Channel embedding matrix (.npy) loaded by get_dataframe_in_embedding_space
path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channels_more_300/size_200_sub_0043_neg_35_threshold_20/channel_embedding.npy'
# Seed pair ('The Democrats', 'Donald J Trump'), mapped from channel id to embedding index
partisan_seed = (dict_channel_ind['UClkO4MArT2WKWj32YDD_-Ew'], dict_channel_ind['UCAql2DyGU2un1Ei2nMYsqOA'])
# Total number of pairs averaged into the axis, seed pair included
nb_selected_pairs = 10

# Builds the axis vector; the channelcrawler rows of each selected pair are printed as a side effect
partisan_axis = compute_axis_vector_based_on_seed(path, k, partisan_seed, nb_selected_pairs)

             category   join_date  \
131685  Howto & Style  2013-04-22   
149830  Entertainment  2015-03-30   

                                                     link           name  \
131685  http://www.youtube.com/channel/UCMYpyd5l-qep26...  Aneeqa Beauty   
149830  http://www.youtube.com/channel/UCB9V55-U4cZWcS...   Wilbert Ross   

        subscribers  videos                channel_id  
131685        15700     103  UCMYpyd5l-qep26Q1zRgE-yQ  
149830       210989      40  UCB9V55-U4cZWcSky65cuhCw  
       category   join_date  \
31468     Music  2016-01-21   
134771    Music  2012-08-13   

                                                     link          name  \
31468   http://www.youtube.com/channel/UCmNSoJCouaKT_c...    Skrt Video   
134771  http://www.youtube.com/channel/UCQFicwEyxv3P9Z...  SniizahMusic   

        subscribers  videos                channel_id  
31468         25200      17  UCmNSoJCouaKT_cgxipipzGw  
134771        55433     322  UCQFicwEyxv3P9ZkpglSZd0g  
   

#### Gender axis: men vs women

In [26]:
# Look up the 'Alex Costa' channel; its channel_id is the first element of gender_seed.
# Channel content: Men's Fashion, Hairstyle, Fitness & Health, Personal Vlogs and Tech.
channelcrawler[channelcrawler['name'] == 'Alex Costa']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
70695,Howto & Style,2011-09-29,http://www.youtube.com/channel/UCZyCposXwcyopa...,Alex Costa,2120000,543,UCZyCposXwcyopaACep44maQ


In [27]:
# Look up the 'Shaaanxo' channel; its channel_id is the second element of gender_seed.
# Vlog channel about make-up, hair transformations, ring selections, ...
channelcrawler[channelcrawler['name'] == 'Shaaanxo']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
38863,Howto & Style,2009-01-18,http://www.youtube.com/channel/UCMpOz2KEfkSdd5...,Shaaanxo,3215340,1422,UCMpOz2KEfkSdd5JeIJh_fxw


In [21]:
# Number of nearest neighbours queried per channel when generating candidate pairs
k = 30
# NOTE(review): `path` is not assigned in this cell; the value set in the partisan-axis
# cell is reused, so that cell must have been run first. An older embedding path is
# left disabled below.
#path = '/dlabdata1/youtube_large/jouven/word2vecf_preprocessing/channel_embedding.npy'
# Seed pair ('Alex Costa', 'Shaaanxo'), mapped from channel id to embedding index
gender_seed = (dict_channel_ind['UCZyCposXwcyopaACep44maQ'], dict_channel_ind['UCMpOz2KEfkSdd5JeIJh_fxw'])
# Total number of pairs averaged into the axis, seed pair included
nb_selected_pairs = 10

# Builds the axis vector; the channelcrawler rows of each selected pair are printed as a side effect
gender_axis = compute_axis_vector_based_on_seed(path, k, gender_seed, nb_selected_pairs)

              category   join_date  \
110842           Music  2014-04-14   
153979  People & Blogs  2014-12-26   

                                                     link           name  \
110842  http://www.youtube.com/channel/UCa_pp2zu1q971V...  Gaëtan Piolot   
153979  http://www.youtube.com/channel/UCchN_MaC3-cBvj...      It'sMisty   

        subscribers  videos                channel_id  
110842        24600      12  UCa_pp2zu1q971VM2yPzHEhg  
153979        19300      87  UCchN_MaC3-cBvjvejCiMtmw  
              category   join_date  \
90080   People & Blogs  2018-05-07   
157811   Howto & Style  2006-04-04   

                                                     link         name  \
90080   http://www.youtube.com/channel/UCaKE-yVmrJkmsO...         JEPH   
157811  http://www.youtube.com/channel/UCnDqSqP8Lbn0_K...  metaspencer   

        subscribers  videos                channel_id  
90080         14600      24  UCaKE-yVmrJkmsOXkvFTbHZg  
157811        25200     242  UCnDqSqP8

#### Age axis: kids vs adults

In [11]:
# Look up the 'Magpiepony' channel; its channel_id is the first element of age_seed.
# Alternative lookup left disabled:
#channelcrawler[channelcrawler['name'] == "TED-Ed"]
channelcrawler[channelcrawler['name'] == "Magpiepony"]

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
7,Film and Animation,2013-12-17,http://www.youtube.com/channel/UCISF5OGuAtSLNF...,Magpiepony,736000,304,UCISF5OGuAtSLNF24TKTnXag


In [12]:
# Look up the 'James Bond 007' channel (category 'Film and Animation' in the crawl);
# its channel_id is the second element of age_seed.
channelcrawler[channelcrawler['name'] == 'James Bond 007']

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
5561,Film and Animation,2005-11-27,http://www.youtube.com/channel/UCwTkM6CvIsYFaF...,James Bond 007,115000,407,UCwTkM6CvIsYFaFiMKIKCqHw


In [22]:
# Number of nearest neighbours queried per channel when generating candidate pairs
k = 30
# Seed pair ('Magpiepony', 'James Bond 007'), mapped from channel id to embedding index
age_seed = (dict_channel_ind['UCISF5OGuAtSLNF24TKTnXag'], dict_channel_ind['UCwTkM6CvIsYFaFiMKIKCqHw'])
# Total number of pairs averaged into the axis, seed pair included
nb_selected_pairs = 10

# NOTE(review): `path` is not assigned in this cell; the value set in the partisan-axis
# cell is reused, so that cell must have been run first.
# Builds the axis vector; the channelcrawler rows of each selected pair are printed as a side effect
age_axis = compute_axis_vector_based_on_seed(path, k, age_seed, nb_selected_pairs)

      category   join_date                                               link  \
46771   Sports  2009-12-05  http://www.youtube.com/channel/UCM4GEVa1Qi1CTd...   
54898    Music  2007-06-14  http://www.youtube.com/channel/UCsq3H3cTxvDU1M...   

                   name  subscribers  videos                channel_id  
46771  overtimeathletes       221000     716  UCM4GEVa1Qi1CTdJg0ljdoow  
54898       Dr. Peacock       319585     307  UCsq3H3cTxvDU1Mktk1DQVkA  
             category   join_date  \
7110    Entertainment  2018-04-07   
163921          Music  2016-09-12   

                                                     link            name  \
7110    http://www.youtube.com/channel/UC_QhMn47x9IS3v...  العاب سيمب�...   
163921  http://www.youtube.com/channel/UC3CQLAsPXQ04pN...     MikeReyesTv   

        subscribers  videos                channel_id  
7110         321000     242  UC_QhMn47x9IS3vpwQ27OhMA  
163921        13200      22  UC3CQLAsPXQ04pNJ8uxJY2Hw  
               category  