In [None]:
import json
import pickle
import numpy as np
import pandas as pd
import gzip
import math
import csv
import datetime
import copy
import dateutil.parser
import random
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import pylab as pl
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, ttest_ind, mannwhitneyu, kruskal, ks_2samp
from statsmodels.distributions.empirical_distribution import ECDF
import location
from utils import Utils
from tweet_ea import Tweet
import re
import pingouin as pg

def exponent_fmt(x, pos):
    """Format a tick value as a LaTeX power of ten, e.g. 3 -> '$10^{3}$'.

    x is the tick value (rendered with zero decimals); pos is the tick
    position required by the FuncFormatter signature and is not used.
    """
    exponent = format(x, '.0f')
    return '$10^{' + exponent + '}$'

# Shared settings handed to every Utils instance below.
bin_size, connection_type, year = 7, 'followers', 2018

# One Utils object per topic key, created in abo -> gun -> blm order.
util_abo, util_gun, util_blm = (
    Utils(topic_key, bin_size, connection_type, year)
    for topic_key in ('abo', 'gun', 'blm')
)

# One Tweet object per topic, each wrapping the matching Utils object.
tweet_abo, tweet_gun, tweet_blm = (
    Tweet(topic_util) for topic_util in (util_abo, util_gun, util_blm)
)


In [None]:
# Fetch the available tweets for each topic and report how many entries came back.
# NOTE(review): the meaning of the 0.2 argument is not visible here -- confirm in Tweet.getAvailableTweets.
available_tweets = {}
for topic_tag, tweet_obj in (('ABO', tweet_abo), ('GUN', tweet_gun), ('BLM', tweet_blm)):
    available_tweets[topic_tag] = tweet_obj.getAvailableTweets(0.2)
    print('{}: {}'.format(topic_tag, len(available_tweets[topic_tag].keys())))

ea_tweets_abo = available_tweets['ABO']
ea_tweets_gun = available_tweets['GUN']
ea_tweets_blm = available_tweets['BLM']


In [None]:
# For each topic: label the videos, split them by leaning, and print only the
# group sizes (the full per-leaning structure is not dumped to the output).
leaning_results = {}
for topic_tag, tweet_obj, ea_tweets in (
    ('ABO', tweet_abo, ea_tweets_abo),
    ('GUN', tweet_gun, ea_tweets_gun),
    ('BLM', tweet_blm, ea_tweets_blm),
):
    leaning_probs = tweet_obj.assignVideoLeaningLabels(ea_tweets)
    vids_by_leaning = tweet_obj.separateVideosByLeaning(leaning_probs)
    print(">>> {} --> L: {}, N: {}, R: {}".format(
        topic_tag,
        len(vids_by_leaning['L']),
        len(vids_by_leaning['N']),
        len(vids_by_leaning['R']),
    ))
    leaning_results[topic_tag] = (leaning_probs, vids_by_leaning)

video_leanings_probs_abo, vids_abo = leaning_results['ABO']
video_leanings_probs_gun, vids_gun = leaning_results['GUN']
video_leanings_probs_blm, vids_blm = leaning_results['BLM']

## STRUCTURAL MEASURE OPERATIONS ##

In [None]:
# Network-structure measures per topic, computed from the available tweets.
structural_measures_abo, structural_measures_gun, structural_measures_blm = (
    tweet_obj.getAllNetworkStructureMeasures(ea_tweets)
    for tweet_obj, ea_tweets in (
        (tweet_abo, ea_tweets_abo),
        (tweet_gun, ea_tweets_gun),
        (tweet_blm, ea_tweets_blm),
    )
)

In [None]:
measure_type = 'nw_density'


def _is_usable_measure(value):
    """Return True when `value` is present and finite (not None, NaN or +/-inf)."""
    return value is not None and math.isfinite(value)


# Compare left- vs right-leaning videos per topic on the selected structural
# measure, and collect the long-format columns used for the violin plot.
# NOTE(review): `tail='one-sided'` is the older pingouin spelling; newer
# releases appear to expect `alternative=` -- confirm against the installed version.
structural_groups = {}
leanings, topics, measures = [], [], []
for topic_tag, topic_name, measures_by_vid, vids_by_leaning in (
    ('ABO', 'Abortion', structural_measures_abo, vids_abo),
    ('GUN', 'Gun control', structural_measures_gun, vids_gun),
    ('BLM', 'BLM', structural_measures_blm, vids_blm),
):
    print(">>> {} >>>".format(topic_tag))
    # Videos whose measure is missing or non-finite are dropped from both groups.
    left_values = [
        measures_by_vid[vid][measure_type]
        for vid in vids_by_leaning['L']
        if _is_usable_measure(measures_by_vid[vid][measure_type])
    ]
    right_values = [
        measures_by_vid[vid][measure_type]
        for vid in vids_by_leaning['R']
        if _is_usable_measure(measures_by_vid[vid][measure_type])
    ]
    print(mannwhitneyu(left_values, right_values, alternative='less'))
    print(mannwhitneyu(left_values, right_values, alternative='two-sided'))
    print(ks_2samp(left_values, right_values))
    print(pg.mwu(left_values, right_values, tail='one-sided'))
    print('Left -- mean: {}, median: {}'.format(np.mean(left_values), np.median(left_values)))
    print('Right -- mean: {}, median: {}'.format(np.mean(right_values), np.median(right_values)))

    structural_groups[topic_tag] = (left_values, right_values)
    leanings += ['Left'] * len(left_values) + ['Right'] * len(right_values)
    topics += [topic_name] * (len(left_values) + len(right_values))
    measures += left_values + right_values

measures_left_abo, measures_right_abo = structural_groups['ABO']
measures_left_gun, measures_right_gun = structural_groups['GUN']
measures_left_blm, measures_right_blm = structural_groups['BLM']


# Size-like measures are log10-transformed after flooring each value at 1e-3.
if measure_type in ('nw_size', 'nw_max_indegree'):
    floored_measures = np.array([max(value, 1e-3) for value in measures])
    measures = np.log10(floored_measures)

# Swap the raw measure key for its axis label; keys without an entry are kept as-is.
structural_axis_labels = {
    'nw_size': 'Network size (Log)',
    'nw_in_degree_centrality_gini': 'Gini coef of indegree centrality',
    'nw_closeness_centrality_gini': 'Gini coef of closeness centrality',
    'global_efficiency': 'Global efficiency',
    'nw_density': 'Network density',
}
measure_type = structural_axis_labels.get(measure_type, measure_type)

# Long-format frame: one row per video with its leaning, topic and measure value.
data_dict = {'Leaning': leanings, 'Topic': topics, measure_type: measures}
df = pd.DataFrame(data_dict)

#df.to_csv('siqi_plot/{}.csv'.format('nw_global_efficiency'), index=False)


plt.rcParams['figure.dpi'] = 600

# Font sizes for this figure.
rc = {'axes.labelsize': 16, 'legend.fontsize': 22,
      'axes.titlesize': 20, 'xtick.labelsize': 18, 'ytick.labelsize': 20}

sns.set(rc=rc)
sns.set_style(style='white')

# Split violins: left- and right-leaning distributions share one violin per topic.
ax = sns.violinplot(x="Topic", y=measure_type, hue="Leaning", data=df,
                    palette={"Right": "#e06666", "Left": "#6d9eeb"}, split=True, inner="quartile")

# Both Gini axis labels start with 'Gini'; matching on the prefix keeps this
# limit in effect whichever wording of the label is in use.
if measure_type.startswith('Gini'):
    ax.set(ylim=(0, 1.1))

ax.set(xlabel=None)

# Only the network-size label is paired with the power-of-ten tick formatter.
if measure_type == 'Network size (Log)':
    ax.yaxis.set_major_formatter(FuncFormatter(exponent_fmt))

# Build the legend once, with both the entries and their placement.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels, loc='lower center', frameon=False, fontsize=16)
#ax.spines['center'].set_visible(False)
#ax.spines['top'].set_visible(False)

#ax.get_legend().set_visible(False)

#plt.savefig("nw_indegree_centrality_gini.pdf", bbox_inches = 'tight', pad_inches = 0, dpi=600)


## TEMPORAL MEASURE OPERATIONS ##

In [None]:
# Temporal measures per topic, computed from the available tweets.
temporal_measures_abo, temporal_measures_gun, temporal_measures_blm = (
    tweet_obj.getAllTemporalMeasures(ea_tweets)
    for tweet_obj, ea_tweets in (
        (tweet_abo, ea_tweets_abo),
        (tweet_gun, ea_tweets_gun),
        (tweet_blm, ea_tweets_blm),
    )
)

In [None]:
measure_type = 'nw_temporal_diff_between_pairs_mean'

# Compare left- vs right-leaning videos per topic on the selected temporal
# measure and collect the long-format columns for the plot.
temporal_groups = {}
leanings, topics, measures = [], [], []
for topic_tag, topic_name, measures_by_vid, vids_by_leaning in (
    ('ABO', 'Abortion', temporal_measures_abo, vids_abo),
    ('GUN', 'Gun control', temporal_measures_gun, vids_gun),
    ('BLM', 'BLM', temporal_measures_blm, vids_blm),
):
    print(">>> {} >>>".format(topic_tag))
    left_values = [measures_by_vid[vid][measure_type] for vid in vids_by_leaning['L']]
    right_values = [measures_by_vid[vid][measure_type] for vid in vids_by_leaning['R']]
    print(mannwhitneyu(left_values, right_values, alternative='less'))
    print(mannwhitneyu(left_values, right_values, alternative='two-sided'))
    print(ks_2samp(left_values, right_values))
    print(pg.mwu(left_values, right_values, tail='one-sided'))
    print('Left -- mean: {}, median: {}'.format(np.mean(left_values), np.median(left_values)))
    print('Right -- mean: {}, median: {}'.format(np.mean(right_values), np.median(right_values)))

    temporal_groups[topic_tag] = (left_values, right_values)
    leanings += ['Left'] * len(left_values) + ['Right'] * len(right_values)
    topics += [topic_name] * (len(left_values) + len(right_values))
    measures += left_values + right_values

measures_left_abo, measures_right_abo = temporal_groups['ABO']
measures_left_gun, measures_right_gun = temporal_groups['GUN']
measures_left_blm, measures_right_blm = temporal_groups['BLM']

# Floor every value at 1e-3, then move to a log10 scale.
floored_measures = np.array([max(value, 1e-3) for value in measures])
measures = np.log10(floored_measures)

# Swap the raw measure key for its axis label; keys without an entry are kept as-is.
# NOTE(review): the label for 'nw_temporal_diff_between_max_indegree_user' repeats
# the "source users" wording of the entry above it -- confirm it is intended.
temporal_axis_labels = {
    'nw_temporal_diff_wrt_first_tweet_mean': 'Time delay (Log)',
    'nw_temporal_diff_between_pairs_mean': 'Inter-arrival time (Log)',
    'nw_life_time': 'Lifetime (Log)',
    'nw_temporal_diff_between_first_tweets_of_source_users_mean': 'Lag b/w first tweets \n of source users (Log)',
    'nw_temporal_diff_between_max_indegree_user': 'Lag between the first tweets of source users (Log)',
}
measure_type = temporal_axis_labels.get(measure_type, measure_type)

# Long-format frame: one row per video with its leaning, topic and measure value.
data_dict = {'Leaning': leanings, 'Topic': topics, measure_type: measures}
df = pd.DataFrame(data_dict)

#df.to_csv('siqi_plot/{}.csv'.format('t_inter_arrival_time'), index=False)

plt.rcParams['figure.dpi'] = 600

# Font sizes for this figure.
rc = {
    'axes.labelsize': 20,
    'legend.fontsize': 22,
    'axes.titlesize': 22,
    'xtick.labelsize': 18,
    'ytick.labelsize': 20,
}
sns.set(rc=rc)
sns.set_style(style='white')

# Split violins: left- and right-leaning distributions share one violin per topic.
leaning_palette = {"Right": "#e06666", "Left": "#6d9eeb"}
ax = sns.violinplot(data=df, x="Topic", y=measure_type, hue="Leaning",
                    palette=leaning_palette, split=True, inner="quartile")

ax.set(xlabel=None)

# Rebuild the legend from the plotted handles; it is hidden again at the end.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels)

# Show y ticks as powers of ten.
ax.yaxis.set_major_formatter(FuncFormatter(exponent_fmt))

ax.get_legend().set_visible(False)

#plt.savefig("life_time.pdf", bbox_inches = 'tight', pad_inches = 0, dpi=600)


## ENGAGEMENT MEASURE OPERATIONS ##

In [None]:
# Engagement measures per topic, computed from the available tweets.
engagement_measures_abo, engagement_measures_gun, engagement_measures_blm = (
    tweet_obj.getAllEngagementMeasures(ea_tweets)
    for tweet_obj, ea_tweets in (
        (tweet_abo, ea_tweets_abo),
        (tweet_gun, ea_tweets_gun),
        (tweet_blm, ea_tweets_blm),
    )
)

In [None]:
measure_type = 'num_tweets'

# Compare left- vs right-leaning videos per topic on the selected engagement
# measure and collect the long-format columns for the plot.
engagement_groups = {}
leanings, topics, measures = [], [], []
for topic_tag, topic_name, measures_by_vid, vids_by_leaning in (
    ('ABO', 'Abortion', engagement_measures_abo, vids_abo),
    ('GUN', 'Gun control', engagement_measures_gun, vids_gun),
    ('BLM', 'BLM', engagement_measures_blm, vids_blm),
):
    print(">>> {} >>>".format(topic_tag))
    left_values = [measures_by_vid[vid][measure_type] for vid in vids_by_leaning['L']]
    right_values = [measures_by_vid[vid][measure_type] for vid in vids_by_leaning['R']]
    print(mannwhitneyu(left_values, right_values, alternative='less'))
    print(mannwhitneyu(left_values, right_values, alternative='two-sided'))
    print(ks_2samp(left_values, right_values))
    print(pg.mwu(left_values, right_values, tail='one-sided'))

    engagement_groups[topic_tag] = (left_values, right_values)
    leanings += ['Left'] * len(left_values) + ['Right'] * len(right_values)
    topics += [topic_name] * (len(left_values) + len(right_values))
    measures += left_values + right_values

measures_left_abo, measures_right_abo = engagement_groups['ABO']
measures_left_gun, measures_right_gun = engagement_groups['GUN']
measures_left_blm, measures_right_blm = engagement_groups['BLM']

# Floor every count at 1e-1, then move to a log10 scale.
floored_measures = np.array([max(value, 1e-1) for value in measures])
measures = np.log10(floored_measures)

# Swap the raw measure key for its axis label; keys without an entry are kept as-is.
engagement_axis_labels = {
    'num_tweets': '#total tweets (Log)',
    'num_original_tweets': '#original tweets (Log)',
    'num_quoted_tweets': '#quoted tweets (Log)',
    'num_retweets': '#retweets (Log)',
    'num_replies': '#replies (Log)',
}
measure_type = engagement_axis_labels.get(measure_type, measure_type)

# Long-format frame: one row per video with its leaning, topic and measure value.
data_dict = {'Leaning': leanings, 'Topic': topics, measure_type: measures}
df = pd.DataFrame(data_dict)

plt.rcParams['figure.dpi'] = 600

# Font sizes for this figure.
rc = {
    'axes.labelsize': 16,
    'legend.fontsize': 16,
    'axes.titlesize': 12,
    'xtick.labelsize': 16,
    'ytick.labelsize': 12,
}
sns.set(rc=rc)
sns.set_style(style='white')

# Split violins: left- and right-leaning distributions share one violin per topic.
leaning_palette = {"Right": "#e06666", "Left": "#6d9eeb"}
ax = sns.violinplot(data=df, x="Topic", y=measure_type, hue="Leaning",
                    palette=leaning_palette, split=True, inner="quartile")

ax.set(xlabel=None)

# Start the y axis at 0 and leave the upper bound automatic.
ax.set_ylim(0, None)

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels)

# Show y ticks as powers of ten.
ax.yaxis.set_major_formatter(FuncFormatter(exponent_fmt))

ax.legend(loc='upper right', frameon=False, fontsize=16)
for spine_side in ('right', 'top'):
    ax.spines[spine_side].set_visible(False)

# The legend is configured above but not shown on this panel.
ax.get_legend().set_visible(False)

#plt.savefig("num_replies.pdf", bbox_inches = 'tight', pad_inches = 0, dpi=600)


## FOLLOWERS ANALYSIS

In [None]:
def _load_pickle_file(path):
    """Unpickle the object stored at `path`, closing the file once it is read."""
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Full tweet and user dumps per topic, read from the paths exposed by each Tweet object.
all_tweets_abo = _load_pickle_file(tweet_abo.tweets_path)
users_abo = _load_pickle_file(tweet_abo.users_path)
all_tweets_gun = _load_pickle_file(tweet_gun.tweets_path)
users_gun = _load_pickle_file(tweet_gun.users_path)
all_tweets_blm = _load_pickle_file(tweet_blm.tweets_path)
users_blm = _load_pickle_file(tweet_blm.users_path)

print(len(users_abo.keys()), len(users_gun.keys()), len(users_blm.keys()))

In [None]:
# measure_type: one of max | median | mean
measure_type = "max"

# user_type: "EA" selects the ea_tweets_* collections, anything else the all_tweets_* ones
user_type = "All"
tweets_abo, tweets_gun, tweets_blm = (
    (ea_tweets_abo, ea_tweets_gun, ea_tweets_blm)
    if user_type == "EA"
    else (all_tweets_abo, all_tweets_gun, all_tweets_blm)
)

# For every video id, map each user who shared it (original tweet, retweet or
# quote) to that user's follower count.
followers_abo, followers_gun, followers_blm = {}, {}, {}
for topic_tweets, topic_users, followers_by_vid in (
    (tweets_abo, users_abo, followers_abo),
    (tweets_gun, users_gun, followers_gun),
    (tweets_blm, users_blm, followers_blm),
):
    for tid in topic_tweets:
        tweet_source = topic_tweets[tid]['_source']
        uid = tweet_source['user_id_str']
        num_followers = int(topic_users[uid]['_source']['followers_count'])
        shared_vids = set(
            tweet_source['original_vids'].split(';')
            + tweet_source['retweeted_vids'].split(';')
            + tweet_source['quoted_vids'].split(';')
        )
        # The entry 'N' is removed before the ids are used as video keys.
        for vid in [candidate for candidate in shared_vids if candidate != 'N']:
            followers_by_vid.setdefault(vid, {})[uid] = num_followers


# Reducer applied to the follower counts of the users who shared each video.
follower_aggregators = {'max': max, 'median': np.median, 'mean': np.mean}
if measure_type not in follower_aggregators:
    # Fail loudly: otherwise the measures_* lists from an earlier cell would be reused.
    raise ValueError(
        "measure_type must be 'max', 'median' or 'mean', got {!r}".format(measure_type)
    )
aggregate_followers = follower_aggregators[measure_type]

# Compare left- vs right-leaning videos per topic on the aggregated follower
# counts and collect the long-format columns for the plot.
follower_groups = {}
leanings, topics, measures = [], [], []
for topic_tag, topic_name, followers_by_vid, vids_by_leaning in (
    ('ABO', 'Abortion', followers_abo, vids_abo),
    ('GUN', 'Gun control', followers_gun, vids_gun),
    ('BLM', 'BLM', followers_blm, vids_blm),
):
    print(">>> {} >>>".format(topic_tag))
    left_values = [
        aggregate_followers(list(followers_by_vid[vid].values()))
        for vid in vids_by_leaning['L']
    ]
    right_values = [
        aggregate_followers(list(followers_by_vid[vid].values()))
        for vid in vids_by_leaning['R']
    ]
    print(mannwhitneyu(left_values, right_values, alternative='less'))
    print(mannwhitneyu(left_values, right_values, alternative='two-sided'))
    print(ks_2samp(left_values, right_values))

    follower_groups[topic_tag] = (left_values, right_values)
    leanings += ['Left'] * len(left_values) + ['Right'] * len(right_values)
    topics += [topic_name] * (len(left_values) + len(right_values))
    measures += left_values + right_values

measures_left_abo, measures_right_abo = follower_groups['ABO']
measures_left_gun, measures_right_gun = follower_groups['GUN']
measures_left_blm, measures_right_blm = follower_groups['BLM']

# Floor every value at 1e-3, then move to a log10 scale.
floored_measures = np.array([max(value, 1e-3) for value in measures])
measures = np.log10(floored_measures)

# Swap the aggregation key for its axis label; other values are kept as-is.
follower_axis_labels = {
    'max': '#Followers (Max.)',
    'median': '#Followers (Median)',
    'mean': '#Followers (Mean)',
}
measure_type = follower_axis_labels.get(measure_type, measure_type)

# Long-format frame: one row per video with its leaning, topic and measure value.
data_dict = {'Leaning': leanings, 'Topic': topics, measure_type: measures}
df = pd.DataFrame(data_dict)

plt.rcParams['figure.dpi'] = 600

# Font sizes for this figure.
rc = {
    'axes.labelsize': 16,
    'legend.fontsize': 16,
    'axes.titlesize': 12,
    'xtick.labelsize': 16,
    'ytick.labelsize': 12,
}
sns.set(rc=rc)
sns.set_style(style='white')

# Split violins: left- and right-leaning distributions share one violin per topic.
leaning_palette = {"Right": "#e06666", "Left": "#6d9eeb"}
ax = sns.violinplot(data=df, x="Topic", y=measure_type, hue="Leaning",
                    palette=leaning_palette, split=True, inner="quartile")

ax.set(xlabel=None)

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels)

# Show y ticks as powers of ten.
ax.yaxis.set_major_formatter(FuncFormatter(exponent_fmt))

ax.legend(loc='upper right', frameon=False, fontsize=11)
for spine_side in ('right', 'top'):
    ax.spines[spine_side].set_visible(False)

#plt.savefig("max_followers.pdf", bbox_inches = 'tight', pad_inches = 0, dpi=600)



## VIDEO POLARITY, INTENSITY, DIVISIVENESS and POPULARITY ANALYSIS ##

In [None]:
measure_type = "fraction_of_likes"

# For every video id, keep the smallest timestamp_ms among the tweets that
# shared it (as an original tweet, a retweet or a quote).
video_first_share_time_abo, video_first_share_time_gun, video_first_share_time_blm = {}, {}, {}
for topic_tweets, first_share_by_vid in (
    (ea_tweets_abo, video_first_share_time_abo),
    (ea_tweets_gun, video_first_share_time_gun),
    (ea_tweets_blm, video_first_share_time_blm),
):
    for tid in topic_tweets:
        tweet_source = topic_tweets[tid]['_source']
        timestamp = int(tweet_source['timestamp_ms'])
        shared_vids = set(
            tweet_source['original_vids'].split(';')
            + tweet_source['retweeted_vids'].split(';')
            + tweet_source['quoted_vids'].split(';')
        )
        for vid in shared_vids:
            # The entry 'N' is skipped rather than recorded as a video id.
            if vid == 'N':
                continue
            if vid not in first_share_by_vid or timestamp < first_share_by_vid[vid]:
                first_share_by_vid[vid] = timestamp

# Load the video records per topic and derive per-video scores from their
# like/dislike/view counts and first share time.
# NOTE(review): the abortion topic reads `videos_path` while the other two read
# `all_videos_from_annotated_videos_path` -- confirm the difference is intended.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
loaded_videos = {}
computed_props = {}
for topic_key, tweet_obj, videos_path, leaning_probs, first_share_by_vid in (
    ('abo', tweet_abo, tweet_abo.videos_path,
     video_leanings_probs_abo, video_first_share_time_abo),
    ('gun', tweet_gun, tweet_gun.all_videos_from_annotated_videos_path,
     video_leanings_probs_gun, video_first_share_time_gun),
    ('blm', tweet_blm, tweet_blm.all_videos_from_annotated_videos_path,
     video_leanings_probs_blm, video_first_share_time_blm),
):
    # The context manager closes the file handle as soon as the object is read.
    with open(videos_path, 'rb') as videos_file:
        topic_videos = pickle.load(videos_file)

    topic_props = {}
    for vid in leaning_probs:
        statistics = topic_videos[vid]['_source']['statistics']
        like_count = int(statistics['likeCount'])
        dislike_count = int(statistics['dislikeCount'])
        view_count = int(statistics['viewCount'])
        first_share_time = first_share_by_vid[vid]
        polarity, intensity, hostility, popularity = tweet_obj.util.calculateVideoScores(
            like_count, dislike_count, view_count, first_share_time)
        topic_props[vid] = {
            'like_count': like_count,
            'dislike_count': dislike_count,
            'view_count': view_count,
            'polarity': polarity,
            'intensity': intensity,
            'hostility': hostility,
            'popularity': popularity,
            # Derived directly from the hostility score returned above.
            'fraction_of_likes': (1 - (hostility + 0.5)),
        }

    loaded_videos[topic_key] = topic_videos
    computed_props[topic_key] = topic_props

videos_abo, video_props_abo = loaded_videos['abo'], computed_props['abo']
videos_gun, video_props_gun = loaded_videos['gun'], computed_props['gun']
videos_blm, video_props_blm = loaded_videos['blm'], computed_props['blm']


def _print_left_right_comparison(left_values, right_values):
    """Print the Left-vs-Right tests and summary statistics for one topic.

    Runs, in order: a one-sided ('less') and a two-sided Mann-Whitney U test,
    a two-sample Kolmogorov-Smirnov test, pingouin's Mann-Whitney U report,
    and finally the mean/median of each sample.
    """
    for alternative in ('less', 'two-sided'):
        print(mannwhitneyu(left_values, right_values, alternative=alternative))
    print(ks_2samp(left_values, right_values))
    # NOTE(review): newer pingouin releases appear to take `alternative=`
    # instead of `tail=` -- confirm against the installed version.
    print(pg.mwu(left_values, right_values, tail='one-sided'))
    print('Left -- mean: {}, median: {}'.format(np.mean(left_values), np.median(left_values)))
    print('Right -- mean: {}, median: {}'.format(np.mean(right_values), np.median(right_values)))


# For every topic, collect the selected measure for the left- and
# right-leaning videos, then report the comparison.
print(">>> ABO >>>")
measures_left_abo = [video_props_abo[video_id][measure_type] for video_id in vids_abo['L']]
measures_right_abo = [video_props_abo[video_id][measure_type] for video_id in vids_abo['R']]
_print_left_right_comparison(measures_left_abo, measures_right_abo)

print(">>> GUN >>>")
measures_left_gun = [video_props_gun[video_id][measure_type] for video_id in vids_gun['L']]
measures_right_gun = [video_props_gun[video_id][measure_type] for video_id in vids_gun['R']]
_print_left_right_comparison(measures_left_gun, measures_right_gun)

print(">>> BLM >>>")
measures_left_blm = [video_props_blm[video_id][measure_type] for video_id in vids_blm['L']]
measures_right_blm = [video_props_blm[video_id][measure_type] for video_id in vids_blm['R']]
_print_left_right_comparison(measures_left_blm, measures_right_blm)

# Flatten the six samples into long-format columns: for each topic the
# left-leaning values come first, followed by the right-leaning ones.
leanings, topics, measures = [], [], []
for topic_name, left_vals, right_vals in (
        ('Abortion', measures_left_abo, measures_right_abo),
        ('Gun control', measures_left_gun, measures_right_gun),
        ('BLM', measures_left_blm, measures_right_blm)):
    leanings.extend(['Left'] * len(left_vals))
    leanings.extend(['Right'] * len(right_vals))
    topics.extend([topic_name] * (len(left_vals) + len(right_vals)))
    measures.extend(left_vals)
    measures.extend(right_vals)

# Replace the internal measure key with its display label; any other value
# is left untouched.
measure_type = {
    'polarity': 'Polarity',
    'intensity': 'Intensity',
    'hostility': 'Hostility',
    'fraction_of_likes': 'Fraction of likes',
}.get(measure_type, measure_type)

data_dict = {'Leaning': leanings, 'Topic': topics, measure_type: measures}
df = pd.DataFrame(data_dict)

# NOTE(review): the output file name is fixed to 'yt_fraction_of_likes'
# whatever measure_type is -- confirm this is intended.
df.to_csv('siqi_plot/{}.csv'.format('yt_fraction_of_likes'), index=False)

# Render the figure at 600 dpi.
plt.rcParams['figure.dpi'] = 600

# Font sizes for the figure. A smaller alternative used previously:
# labels / legend / x-ticks 16, title / y-ticks 12.
rc = {'axes.labelsize': 22, 'legend.fontsize': 22,
      'axes.titlesize': 22, 'xtick.labelsize': 22, 'ytick.labelsize': 20}

sns.set(rc=rc)
sns.set_style(style='white')

# One split violin per topic: one half per leaning, quartiles drawn inside.
ax = sns.violinplot(x="Topic", y=measure_type, hue="Leaning", data=df,
                    palette={"Right": "#e06666", "Left": "#6d9eeb"}, split=True, inner="quartile")

ax.set(xlabel=None)

# Panel label. `pad` is in points (72 per inch), so the negative value moves
# the title 3.7 inches down from the top of the axes.
ax.set_title('(e)', pad=-3.7 * 72, y=1)

# y-axis limits per display label. The keys must match the labels assigned to
# `measure_type` before the DataFrame is built -- note the lowercase 'l' in
# 'Fraction of likes'; a differently-cased key would silently never match.
y_limits = {
    'Hostility': (-0.55, 0.55),
    'Fraction of likes': (0, 1.1),
    'Intensity': (0, None),
    'Polarity': (0, 1.1),
}
if measure_type in y_limits:
    ax.set(ylim=y_limits[measure_type])

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[0:], labels=labels[0:])

#ax.legend(loc='upper right', frameon=False, fontsize=11)
#ax.spines['top'].set_visible(False)
#ax.spines['right'].set_visible(False)

# The legend is rebuilt above and then hidden for this panel.
ax.get_legend().set_visible(False)

#plt.savefig("youtube_reactions_{}.pdf".format(measure_type), bbox_inches = 'tight', pad_inches = 0, dpi=600)


## Identify locations of users and tweets at the state level

In [None]:
def _load_pickle(path):
    """Load one pickled object from `path` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Per topic: all tweets, locations of all users, locations of EA users.
all_tweets_abo = _load_pickle(tweet_abo.tweets_path)
all_users_locs_abo = _load_pickle(tweet_abo.users_locs_path)
ea_users_locs_abo = _load_pickle(tweet_abo.ea_users_locs_path)

all_tweets_gun = _load_pickle(tweet_gun.tweets_path)
all_users_locs_gun = _load_pickle(tweet_gun.users_locs_path)
ea_users_locs_gun = _load_pickle(tweet_gun.ea_users_locs_path)

all_tweets_blm = _load_pickle(tweet_blm.tweets_path)
all_users_locs_blm = _load_pickle(tweet_blm.users_locs_path)
ea_users_locs_blm = _load_pickle(tweet_blm.ea_users_locs_path)

states = location.getStates()
print(states)

In [None]:
def _location_coverage(tweets, users_locs, known_states):
    """Count tweets and users whose author location is one of `known_states`.

    Args:
        tweets: mapping of tweet id -> tweet record; the author id is read
            from record['_source']['user_id_str'].
        users_locs: mapping of user id -> location (or None).
        known_states: container of accepted location values.

    Returns:
        (located_tweets, located_users, all_users): the number of tweets with
        a located author, the set of located author ids, and the set of all
        author ids.

    Raises:
        KeyError: if an author id is absent from `users_locs`.
    """
    located_tweets = 0
    located_users = set()
    all_users = set()
    for tid in tweets:
        uid = tweets[tid]['_source']['user_id_str']
        loc = users_locs[uid]
        if loc is not None and loc in known_states:
            located_tweets += 1
            located_users.add(uid)
        all_users.add(uid)
    return located_tweets, located_users, all_users


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    return float(numerator) / denominator if denominator else float('nan')


def _print_location_coverage(label, located_tweets, n_tweets, located_users, all_users):
    """Print the located/total counts and ratios for one topic."""
    print(">>> {} >>>".format(label))
    print("{}/{} tweets {}/{} unique users with ratio {} and {}, respectively.".format(
        located_tweets,
        n_tweets,
        len(located_users),
        len(all_users),
        _ratio(located_tweets, n_tweets),
        _ratio(len(located_users), len(all_users))))


#user_type: [EA | All]
user_type = "All"
tweets_abo = ea_tweets_abo if user_type == "EA" else all_tweets_abo
tweets_gun = ea_tweets_gun if user_type == "EA" else all_tweets_gun
tweets_blm = ea_tweets_blm if user_type == "EA" else all_tweets_blm

users_locs_abo = ea_users_locs_abo if user_type == "EA" else all_users_locs_abo
users_locs_gun = ea_users_locs_gun if user_type == "EA" else all_users_locs_gun
users_locs_blm = ea_users_locs_blm if user_type == "EA" else all_users_locs_blm

abo_loc_tweets, abo_loc_users, abo_users = _location_coverage(tweets_abo, users_locs_abo, states)
_print_location_coverage("ABO", abo_loc_tweets, len(tweets_abo), abo_loc_users, abo_users)

gun_loc_tweets, gun_loc_users, gun_users = _location_coverage(tweets_gun, users_locs_gun, states)
_print_location_coverage("GUN", gun_loc_tweets, len(tweets_gun), gun_loc_users, gun_users)

blm_loc_tweets, blm_loc_users, blm_users = _location_coverage(tweets_blm, users_locs_blm, states)
_print_location_coverage("BLM", blm_loc_tweets, len(tweets_blm), blm_loc_users, blm_users)

print(len(abo_users), len(gun_users), len(blm_users))


## Hashtag Analysis in User Profiles.

In [None]:
def _read_hashtag_list(path):
    """Return the hashtags stored one per line in `path`, trailing whitespace stripped."""
    with open(path, 'r') as f:
        return [line.rstrip() for line in f]


def _count_profile_hashtags(user_ids, users, left_hts, right_hts):
    """Count hashtags in the profile descriptions of `user_ids`.

    Hashtags are lower-cased before de-duplication, so case variants of the
    same tag in one profile are counted once.

    Args:
        user_ids: iterable of user ids to inspect.
        users: mapping of user id -> record; the profile text is read from
            record['_source']['description'].
        left_hts, right_hts: lists of leaning hashtags (without '#').

    Returns:
        (leaning_counts, all_counts): per-user number of distinct leaning
        hashtags (only users with at least one) and per-user number of
        distinct hashtags (every user).
    """
    # Built once; membership tests against a set instead of two lists.
    leaning_tags = set(left_hts) | set(right_hts)
    leaning_counts = {}
    all_counts = {}
    for user_id in user_ids:
        user_desc = users[user_id]['_source']['description']
        # `or ''` tolerates a null description -- presumably possible for
        # empty profiles; verify against the data.
        user_tags = {tag.lower() for tag in re.findall(r"#(\w+)", user_desc or '')}
        all_counts[user_id] = len(user_tags)
        matched = len(user_tags & leaning_tags)
        if matched > 0:
            leaning_counts[user_id] = matched
    return leaning_counts, all_counts


def _mean_or_nan(values):
    """Arithmetic mean of `values`, or NaN when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else float('nan')


def _print_hashtag_summary(label, leaning_counts, all_counts):
    """Print the per-profile hashtag averages and the seed-user count for one topic."""
    print(">>> {} >>>".format(label))
    print("leaning hashtags per profile: {}, all hashtags per profile {}".format(
        _mean_or_nan(leaning_counts.values()),
        _mean_or_nan(all_counts.values())))
    print("#seed users: {}".format(len(leaning_counts)))


# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
with open(tweet_abo.ea_users_ids_path, 'rb') as f:
    user_ids_abo = pickle.load(f)
with open(tweet_abo.users_path, 'rb') as f:
    users_abo = pickle.load(f)

with open(tweet_gun.ea_users_ids_path, 'rb') as f:
    user_ids_gun = pickle.load(f)
with open(tweet_gun.users_path, 'rb') as f:
    users_gun = pickle.load(f)

with open(tweet_blm.ea_users_ids_path, 'rb') as f:
    user_ids_blm = pickle.load(f)
with open(tweet_blm.users_path, 'rb') as f:
    users_blm = pickle.load(f)


left_hts_abo = _read_hashtag_list(tweet_abo.left_extended_political_hashtags_path)
right_hts_abo = _read_hashtag_list(tweet_abo.right_extended_political_hashtags_path)
abo_users_leaning_hashtags, abo_users_all_hashtags = _count_profile_hashtags(
    user_ids_abo, users_abo, left_hts_abo, right_hts_abo)
_print_hashtag_summary("ABO", abo_users_leaning_hashtags, abo_users_all_hashtags)

left_hts_gun = _read_hashtag_list(tweet_gun.left_extended_political_hashtags_path)
right_hts_gun = _read_hashtag_list(tweet_gun.right_extended_political_hashtags_path)
gun_users_leaning_hashtags, gun_users_all_hashtags = _count_profile_hashtags(
    user_ids_gun, users_gun, left_hts_gun, right_hts_gun)
_print_hashtag_summary("GUN", gun_users_leaning_hashtags, gun_users_all_hashtags)

left_hts_blm = _read_hashtag_list(tweet_blm.left_extended_political_hashtags_path)
right_hts_blm = _read_hashtag_list(tweet_blm.right_extended_political_hashtags_path)
blm_users_leaning_hashtags, blm_users_all_hashtags = _count_profile_hashtags(
    user_ids_blm, users_blm, left_hts_blm, right_hts_blm)
_print_hashtag_summary("BLM", blm_users_leaning_hashtags, blm_users_all_hashtags)
