In [None]:
import json
import pickle
import numpy as np
import pandas as pd
import random
random.seed(30)
import copy
import csv
from utils import Utils
from operator import itemgetter
from tweet import Tweet
from video import Video
import location
from scipy.stats import pearsonr, spearmanr, ttest_ind, mannwhitneyu, kruskal, ks_2samp, spearmanr, pearsonr, sem, t
import matplotlib.pyplot as plt
import csv

## Run configuration for this notebook.
## `campaign` selects which dataset is analysed; accepted values: [abo | gun | blm]
campaign = 'abo'

# The three settings below are passed, together with `campaign`, straight to Utils(...).
# NOTE(review): bin_size looks like a window length in days (7 = weekly) -- confirm against Utils.
bin_size = 7
connection_type = 'followers'
year = 2018

# Helper object built from the configuration above.
util = Utils(campaign, bin_size, connection_type, year)

# Tweet accessor; the next cell calls getAvailableTweets / assignVideoLeaningLabels on it
# and reads its `videos_path` attribute.
tweet = Tweet(util)

In [None]:
## Load the tweets and the per-video leaning probabilities.
# NOTE(review): the meaning of the 0.2 argument is defined by Tweet.getAvailableTweets -- confirm.
ea_tweets = tweet.getAvailableTweets(0.2)

# Global display option: do not truncate rows when DataFrames are printed.
pd.set_option('display.max_rows', None)

# Indexed below as video_leanings_probs[video_id]['right'].
video_leanings_probs = tweet.assignVideoLeaningLabels(ea_tweets)

# Read the pickle inside a context manager so the file handle is closed right away
# (the bare pickle.load(open(...)) form leaves it open until garbage collection).
# SECURITY: pickle.load can execute arbitrary code -- only load files produced by this project.
with open(tweet.videos_path, 'rb') as videos_file:
    videos = pickle.load(videos_file)

## Load the Recfluence channel table; its 'LR' column is used below as the ground-truth leaning.
recfluence = pd.read_csv('data/media_bias/recfluence.csv')

# Print the number of missing ids / labels and the row count, before and after
# dropping rows that lack either a channel id or a leaning label.
print(recfluence['CHANNEL_ID'].isna().sum(), recfluence['LR'].isna().sum(), len(recfluence))
recfluence = recfluence.dropna(subset=['CHANNEL_ID', 'LR'])
print(recfluence['CHANNEL_ID'].isna().sum(), recfluence['LR'].isna().sum(), len(recfluence))

# Keep only the columns this analysis needs.
recfluence = recfluence[['CHANNEL_ID', 'CHANNEL_TITLE', 'LR']]
print(recfluence.head())
print(recfluence.dtypes)

# Set of labelled channel ids, for fast intersection with the topic dataset.
recfluence_channel_ids = set(recfluence['CHANNEL_ID'])

# Group the 'right' probability of every scored video under the channel that published it.
cid2vid_dict = {}
for vid in video_leanings_probs:
    cid = videos[vid]['_source']['snippet']['channelId']
    p_right = video_leanings_probs[vid]['right']
    cid2vid_dict.setdefault(cid, []).append(p_right)

# Size of the topic dataset: number of channels and of videos.
n_topic_channels = len(cid2vid_dict)
n_topic_videos = sum(len(probs) for probs in cid2vid_dict.values())
print('#channels: {}, #videos: {}'.format(n_topic_channels, n_topic_videos))

# Channels present in both the Recfluence table and the topic dataset,
# and the number of topic videos those channels account for.
common_yt_channel_ids = recfluence_channel_ids & set(cid2vid_dict)
n_common_videos = sum(len(cid2vid_dict[common_cid]) for common_cid in common_yt_channel_ids)
print('#channels from rec. n dataset: {}, #videos from rec. n dataset: {}'.format(len(common_yt_channel_ids), n_common_videos))


# One row per shared channel: its id, the mean 'right' probability over its videos,
# and how many of its videos were scored.
shared_channel_order = list(common_yt_channel_ids)
inferred_leaning_probs = pd.DataFrame({
    'CHANNEL_ID': shared_channel_order,
    'avg_cid_probs': [np.mean(cid2vid_dict[shared_cid]) for shared_cid in shared_channel_order],
    'num_videos_per_cid': [len(cid2vid_dict[shared_cid]) for shared_cid in shared_channel_order],
})
print(inferred_leaning_probs.head())
print(inferred_leaning_probs.dtypes)

## Inner join of both datasets: keep only channels that appear in BOTH the
## Recfluence table and the per-channel averages computed from the topic dataset.
df_union = (
    recfluence
    .merge(inferred_leaning_probs, on='CHANNEL_ID', how='inner')
    .dropna(subset=['avg_cid_probs'])
    .set_index('CHANNEL_ID')
)
print(len(df_union))
print(df_union.head())

# NOTE: despite its name this list holds *channel* ids (the index of df_union).
# The name is kept unchanged in case other cells refer to it.
common_yt_video_ids = list(df_union.index)

## Ground-truth leaning ('LR' column, as a string) for every topic channel that has one.
# The channel -> label mapping is built once so that:
#   * each membership test in the comprehension is a dict lookup instead of a scan
#     of the common_yt_video_ids list (which made the loop quadratic), and
#   * every label is a scalar even if a channel id occurs more than once in
#     Recfluence (df_union.loc[cid, 'LR'] would return a Series in that case;
#     here the last occurrence wins).
ground_truth_by_cid = df_union['LR'].astype(str).to_dict()
# Iterate cid2vid_dict so the insertion order matches the topic dataset.
channel_political_ideologies = {
    cid: ground_truth_by_cid[cid]
    for cid in cid2vid_dict
    if cid in ground_truth_by_cid
}
print('#cids with ground_truth: {}'.format(len(channel_political_ideologies)))

# Pool the per-video 'right' probabilities by the ground-truth label
# ('L', 'R' or 'C') of the channel that published each video.
vid_probs = {'L': [], 'R': [], 'C': []}
for cid, gt_label in channel_political_ideologies.items():
    vid_probs[gt_label].extend(cid2vid_dict[cid])

import seaborn as sns

# Figure styling: slightly larger fonts, a compact 5.5 x 4 figure, white background, no grid.
sns.set(font_scale=1.2, rc={'figure.figsize': (5.5, 4)})
sns.set_style("white")
plt.grid(False)

# One KDE curve (with rug marks, no histogram) per ground-truth label, all on the same axes.
# The first call also sets the x-axis label and hands back the axes object.
ax = sns.distplot(
    vid_probs['L'], rug=True, hist=False, color='blue',
    axlabel='Leaning Score', label='Left', kde_kws=dict(linewidth=1),
)
for label_key, curve_color, legend_name in (('R', 'red', 'Right'), ('C', 'orange', 'Center')):
    sns.distplot(
        vid_probs[label_key], rug=True, hist=False, color=curve_color,
        label=legend_name, kde_kws=dict(linewidth=1),
    )

ax.set(ylabel='Density')
plt.legend(loc='upper left')
plt.show()

## Find cut-off points
from scipy import stats

# Per-label arrays of video 'right' probabilities.
l_vid_probs = np.array(vid_probs['L'])
r_vid_probs = np.array(vid_probs['R'])
n_vid_probs = np.array(vid_probs['C'])

# First / third quartile and inter-quartile range of each group.
l_q1, l_q3 = np.quantile(l_vid_probs, [0.25, 0.75])
l_iqr = l_q3 - l_q1
r_q1, r_q3 = np.quantile(r_vid_probs, [0.25, 0.75])
r_iqr = r_q3 - r_q1
n_q1, n_q3 = np.quantile(n_vid_probs, [0.25, 0.75])
n_iqr = n_q3 - n_q1

# 1.5 x IQR fences used to trim outliers.
l_upper_fence = l_q3 + 1.5*l_iqr
r_lower_fence = r_q1 - 1.5*r_iqr
n_lower_fence = n_q1 - 1.5*n_iqr
n_upper_fence = n_q3 + 1.5*n_iqr

# Left group: only values above the upper fence are dropped (fence itself kept).
l_vid_probs = l_vid_probs[l_vid_probs <= l_upper_fence]
# Right group: only values below the lower fence are dropped (fence itself kept).
r_vid_probs = r_vid_probs[r_vid_probs >= r_lower_fence]
# Center group: keep values strictly between both fences.
n_inside_fences = (n_vid_probs > n_lower_fence) & (n_vid_probs < n_upper_fence)
n_vid_probs = n_vid_probs[n_inside_fences]

# Same density plot as before, now on the trimmed samples.
bx = sns.distplot(
    l_vid_probs, rug=True, hist=False, color='blue',
    axlabel='Leaning Score', label='Left', kde_kws=dict(linewidth=1),
)
for trimmed_probs, curve_color, legend_name in ((r_vid_probs, 'red', 'Right'), (n_vid_probs, 'orange', 'Center')):
    sns.distplot(
        trimmed_probs, rug=True, hist=False, color=curve_color,
        label=legend_name, kde_kws=dict(linewidth=1),
    )
bx.set(ylabel='Density')
plt.legend(loc='upper left')

# Share of the trimmed videos that falls in each group (left / right / center).
n_trimmed_total = len(l_vid_probs) + len(r_vid_probs) + len(n_vid_probs)
p_l = float(len(l_vid_probs)) / n_trimmed_total
p_r = float(len(r_vid_probs)) / n_trimmed_total
p_n = float(len(n_vid_probs)) / n_trimmed_total

# Normal distribution per group, parameterised by the sample mean and the
# population standard deviation (ddof=0) of the trimmed probabilities.
dist_l = stats.norm(loc=l_vid_probs.mean(), scale=l_vid_probs.std())
dist_r = stats.norm(loc=r_vid_probs.mean(), scale=r_vid_probs.std())
dist_n = stats.norm(loc=n_vid_probs.mean(), scale=n_vid_probs.std())

# Half-width of the small interval around each candidate probability over which
# the fitted distributions' probability mass is compared.
const = 0.0001


def _weighted_mass(weight, dist, prob):
    """Return `weight` times the mass `dist` puts on [prob - const, prob + const]."""
    return weight*(dist.cdf(prob+const)-dist.cdf(prob-const))


def _closest_crossing(candidates, weight_a, dist_a, weight_b, dist_b):
    """Scan `candidates` for the point where the two weighted masses are closest.

    Returns a `(cut_off, min_diff)` pair. On ties the later candidate wins
    (comparison is `<=`); if no candidate qualifies the result is `(0., inf)`.
    """
    best_prob = 0.
    smallest_diff = np.inf
    for prob in candidates:
        diff = np.abs(_weighted_mass(weight_a, dist_a, prob) - _weighted_mass(weight_b, dist_b, prob))
        if diff <= smallest_diff:
            smallest_diff = diff
            best_prob = prob
    return best_prob, smallest_diff


# Left vs. center cut-off, searched on 1000 evenly spaced points in [0.4, 0.7].
cut_off_left, min_diff = _closest_crossing(np.linspace(0.4, 0.7, 1000), p_l, dist_l, p_n, dist_n)
print(cut_off_left, min_diff)

# Right vs. center cut-off, searched on 1000 evenly spaced points in [0.55, 0.85].
cut_off_right, min_diff = _closest_crossing(np.linspace(0.55, 0.85, 1000), p_r, dist_r, p_n, dist_n)
print(cut_off_right, min_diff)


## count how many left, right and neutral video in total
# "predefined": the video's channel has a ground-truth label, which is used as-is.
# "inferred":   no ground-truth label, so the video's 'right' probability is
#               compared with the two cut-offs found above.
predefined_tally = {'L': 0, 'R': 0, 'C': 0}
inferred_tally = {'left': 0, 'right': 0, 'neutral': 0}

for vid in video_leanings_probs:
    cid = videos[vid]['_source']['snippet']['channelId']

    if cid in channel_political_ideologies:
        channel_label = channel_political_ideologies[cid]
        # Labels other than L / R / C are not counted anywhere.
        if channel_label in predefined_tally:
            predefined_tally[channel_label] += 1
        continue

    video_right_prob = video_leanings_probs[vid]['right']
    if video_right_prob < cut_off_left:
        inferred_tally['left'] += 1
    elif video_right_prob > cut_off_right:
        inferred_tally['right'] += 1
    else:
        inferred_tally['neutral'] += 1

# Re-bind the individual counter names.
left_predefined = predefined_tally['L']
right_predefined = predefined_tally['R']
neutral_predefined = predefined_tally['C']
left_inferred = inferred_tally['left']
right_inferred = inferred_tally['right']
neutral_inferred = inferred_tally['neutral']

print('#videos (L predefined): {}'.format(left_predefined))
print('#videos (R predefined): {}'.format(right_predefined))
print('#videos (N predefined): {}'.format(neutral_predefined))
print('#videos with predefined leaning: {}'.format(left_predefined + right_predefined + neutral_predefined))

print('#videos (L inferred): {}'.format(left_inferred))
print('#videos (R inferred): {}'.format(right_inferred))
print('#videos (N inferred): {}'.format(neutral_inferred))
print('#videos with inferred leaning: {}'.format(left_inferred + right_inferred + neutral_inferred))
    