In [None]:
import json
import pickle
import numpy as np
import pandas as pd
import random
random.seed(30)
import copy
import csv
from utils import Utils
from operator import itemgetter
from tweet import Tweet
from video import Video
import location
from scipy.stats import pearsonr, spearmanr, ttest_ind, mannwhitneyu, kruskal, ks_2samp, spearmanr, pearsonr, sem, t
import matplotlib.pyplot as plt
import os

## Campaign to analyse; one of: abo | gun | blm
campaign = 'abo'

# Remaining parameters handed to Utils together with the campaign name.
bin_size, connection_type, year = 7, 'followers', 2018

util = Utils(campaign, bin_size, connection_type, year)

# Tweet and video accessors are built on top of the same Utils instance.
tweet_obj = Tweet(util)
video_obj = Video(util)

# Directory receiving the CSV exports written further down; created on first run.
campaign_root = 'data/social_media/{}/'.format(campaign)
data_dir = os.path.join(campaign_root, 'online_offline_analysis_data')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

## Summary of the data

In [None]:
# Pickled inputs are opened in `with` blocks so every file handle is closed
# right after loading (the bare pickle.load(open(...)) form leaves it open).
# NOTE(review): pickle can execute arbitrary code on load; only project-generated files belong here.
with open(tweet_obj.initial_tweets_path, 'rb') as pickle_file:
    initial_tweets = pickle.load(pickle_file)
num_initial_tweets = len(initial_tweets)
# One entry per tweet (duplicates kept); distinct users are counted through a set.
initial_users = [initial_tweets[tid]['_source']['user_id_str'] for tid in initial_tweets]
num_initial_users = len(set(initial_users))
print('#initial_tweets: {}, #initial_users: {}'.format(num_initial_tweets, num_initial_users))

with open(tweet_obj.tweets_path, 'rb') as pickle_file:
    all_tweets_from_filtered_videos = pickle.load(pickle_file)
with open(tweet_obj.users_path, 'rb') as pickle_file:
    all_users_from_filtered_videos = pickle.load(pickle_file)
num_mid_tweets = len(all_tweets_from_filtered_videos)
mid_users = [all_tweets_from_filtered_videos[tid]['_source']['user_id_str']
             for tid in all_tweets_from_filtered_videos]
num_mid_users = len(set(mid_users))
print('#mid_tweets: {}, #mid_users: {}'.format(num_mid_tweets, num_mid_users))
# Cross-check: user count taken from the separately stored users mapping.
print('#mid_users from users: {}'.format(len(all_users_from_filtered_videos)))

# Load the video records inside a `with` block so the file handle is closed after reading.
with open(tweet_obj.videos_path, 'rb') as pickle_file:
    all_videos_from_annotated_videos = pickle.load(pickle_file)
filtered_video_ids = tweet_obj.getFilteredVideoIds()
num_videos = len(filtered_video_ids)
# Not used in this cell; kept because it is a notebook-level name other cells may rely on.
videos = {}
# viewCount is passed through int() before summing over the filtered videos.
total_view_count = sum(
    int(all_videos_from_annotated_videos[vid]['_source']['statistics']['viewCount'])
    for vid in filtered_video_ids
)
print('#videos: {}, #totalView: {}'.format(num_videos, total_view_count))

# EA tweets; 0.2 is forwarded unchanged to getAvailableTweets
# (NOTE(review): presumably a threshold - confirm against Tweet.getAvailableTweets).
ea_tweets = tweet_obj.getAvailableTweets(0.2)
num_ea_tweets = len(ea_tweets.keys())
ea_users = [ea_tweets[tid]['_source']['user_id_str'] for tid in ea_tweets]
num_ea_users = len(set(ea_users))
print('#ea_tweets: {}, #ea_users: {}'.format(num_ea_tweets, num_ea_users))

# Volume distribution of the EA tweets, as returned by the Tweet helper.
ea_tweets_counts = tweet_obj.getTweetVolumeDistribution(ea_tweets)
#util.plotLineChart(available_tweets_counts, "#tweets", "weeks")

# Deep copies, so the loaded video records are not shared with the dict handed to the helper.
relevant_videos = {
    vid: copy.deepcopy(all_videos_from_annotated_videos[vid])
    for vid in filtered_video_ids
}
relevant_videos_counts = video_obj.getVideoVolumeDistribution(relevant_videos)
#util.plotLineChart(available_tweets_counts, "#videos", "weeks")

# Both distributions go to a single plotting call.
util.plotTweetVideoChart(ea_tweets_counts, relevant_videos_counts)



## Prepare video information


In [None]:
videos_info = {}

# One tally per filtered video: a counter per tweet type plus the (non-unique) list of
# sharing users. Each video gets its own dict and its own 'users' list.
ea_videos_tweet_info = {
    vid: {'original': 0, 'retweet': 0, 'quoted': 0, 'reply': 0, 'users': []}
    for vid in filtered_video_ids
}
all_videos_tweet_info = {
    vid: {'original': 0, 'retweet': 0, 'quoted': 0, 'reply': 0, 'users': []}
    for vid in filtered_video_ids
}

video_leanings_probs = tweet_obj.assignVideoLeaningLabels(ea_tweets)

# Pickles are read inside `with` blocks so the file handles are closed after loading.
with open(tweet_obj.videos_path, 'rb') as pickle_file:
    all_videos_from_annotated_videos = pickle.load(pickle_file)

# Indexed below with the keys 'R', 'L' and 'N'.
vids = tweet_obj.separateVideosByLeaning(video_leanings_probs)

# Got this from Siqi
with open(video_obj.view_120_path, 'rb') as pickle_file:
    videos_views_120 = pickle.load(pickle_file)

def _classify_tweet(source):
    """Return 'retweet', 'quoted', 'reply' or 'original' for a tweet `_source` record.

    A field counts as set when it is neither None nor the placeholder 'N'.
    Precedence is retweet > quoted > reply; a tweet with none of them set is 'original'.
    """
    # All three fields are read up front so a record missing any of them still raises KeyError.
    retweeted_tweet_id_str = source['retweeted_tweet_id_str']
    quoted_tweet_id_str = source['quoted_tweet_id_str']
    reply_user_id_str = source['reply_user_id_str']

    if retweeted_tweet_id_str is not None and retweeted_tweet_id_str != 'N':
        return 'retweet'
    # Reaching this point already implies "not a retweet", so it is not re-tested.
    if quoted_tweet_id_str is not None and quoted_tweet_id_str != 'N':
        return 'quoted'
    if reply_user_id_str is not None and reply_user_id_str != 'N':
        return 'reply'
    return 'original'


def _shared_video_ids(source):
    """Return the distinct video ids referenced by a tweet `_source` record.

    The three ';'-separated id fields are merged and the 'N' placeholder is dropped.
    """
    video_ids = set(source['original_vids'].split(';')
                    + source['retweeted_vids'].split(';')
                    + source['quoted_vids'].split(';'))
    video_ids.discard('N')
    return video_ids


def _tally_video_shares(tweets, tweet_info, wanted_video_ids, first_share_time=None):
    """Add every tweet in `tweets` to the per-video tallies in `tweet_info`.

    tweets           -- mapping tweet id -> record holding a '_source' dict
    tweet_info       -- mapping video id -> tally dict (tweet-type counters and 'users' list);
                        updated in place
    wanted_video_ids -- set of video ids to count; all other ids are ignored
    first_share_time -- optional dict, updated in place with the smallest 'timestamp_ms'
                        seen per video; 'timestamp_ms' is only read when this is given
    """
    for tid in tweets:
        source = tweets[tid]['_source']
        uid = source['user_id_str']
        tweet_type = _classify_tweet(source)
        timestamp = int(source['timestamp_ms']) if first_share_time is not None else None

        for vid in _shared_video_ids(source):
            if vid not in wanted_video_ids:
                continue
            if first_share_time is not None:
                if vid not in first_share_time or timestamp < first_share_time[vid]:
                    first_share_time[vid] = timestamp
            tweet_info[vid][tweet_type] += 1
            tweet_info[vid]['users'].append(uid)


# Set membership keeps the per-tweet filter cheap whatever container getFilteredVideoIds returned.
filtered_video_id_set = set(filtered_video_ids)

# EA tweets: per-video tallies plus the earliest share timestamp of each video.
video_first_share_time = {}
_tally_video_shares(ea_tweets, ea_videos_tweet_info, filtered_video_id_set,
                    first_share_time=video_first_share_time)

# All tweets: per-video tallies only.
_tally_video_shares(all_tweets_from_filtered_videos, all_videos_tweet_info, filtered_video_id_set)
    
for vid in filtered_video_ids:
    video_source = all_videos_from_annotated_videos[vid]['_source']
    statistics = video_source['statistics']

    vid_leaning_prob = video_leanings_probs[vid]['right']
    # First leaning bucket that contains the video, tested in the order R, L, N; None if in none.
    vid_leaning_label = next((label for label in ('R', 'L', 'N') if vid in vids[label]), None)

    ## May 2018 measures
    view_count = int(statistics['viewCount'])
    like_count = int(statistics['likeCount'])
    dislike_count = int(statistics['dislikeCount'])
    # commentCount is the only statistic treated as optional; 'NA' marks a missing value.
    comment_count = int(statistics['commentCount']) if 'commentCount' in statistics else 'NA'

    # Requires that the video was shared in at least one EA tweet (KeyError otherwise).
    first_share_time = video_first_share_time[vid]
    polarity, intensity, divisiveness, popularity = util.calculateVideoScores(
        like_count, dislike_count, view_count, first_share_time)

    ## first 120 days measures
    # NOTE(review): the factor 60 presumably converts minutes to seconds (the CSV column
    # is labelled "in sec.") - confirm the unit of insights['avgWatch'].
    avg_watch = 60 * video_source['insights']['avgWatch']
    duration = util.convertDurationToSeconds(video_source['contentDetails']['duration'])
    avg_watch_perc = float(avg_watch)/duration

    view120 = int(videos_views_120[vid]['total_view120'])

    videos_info[vid] = {
        'id': vid,
        'link': 'https://www.youtube.com/watch?v={}'.format(vid),
        'leaning_label': vid_leaning_label,
        'leaning_prob': vid_leaning_prob,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'comment_count': comment_count,
        'polarity': polarity,
        'intensity': intensity,
        'divisiveness': divisiveness,
        'popularity': popularity,
        'avg_watch': avg_watch,
        'duration': duration,
        'avg_watch_perc': avg_watch_perc,
        'view120': view120,
    }
    
# Every pickle below is read inside a `with` block so its file handle is closed after loading.

## read structural measures
with open(tweet_obj.ea_network_structural_measures_path, 'rb') as pickle_file:
    structural_measures = pickle.load(pickle_file)

## read temporal measures
with open(tweet_obj.ea_temporal_measures_path, 'rb') as pickle_file:
    temporal_measures = pickle.load(pickle_file)

## read language (LIWC) measures
with open(tweet_obj.ea_language_liwc_measures_path, 'rb') as pickle_file:
    language_liwc_calcs = pickle.load(pickle_file)
language_liwc_measures = {}
for vid in filtered_video_ids:
    liwc_calcs = language_liwc_calcs[vid]
    language_liwc_measures[vid] = {}
    for measure_type in ['p1', 'p2', 'p3', 'sad', 'anger', 'anx', 'posemo', 'negemo', 'negate', 'wc', 'dic_wc']:
        # 'wc' and 'dic_wc' are used as stored numbers; every other measure is a
        # collection whose length is used.
        if measure_type in ('wc', 'dic_wc'):
            num_tweets_with_measure = liwc_calcs[measure_type]
        else:
            num_tweets_with_measure = len(liwc_calcs[measure_type])
        # Normalised by the 'tc' entry of the same video
        # (NOTE(review): presumably the tweet count; a zero 'tc' raises ZeroDivisionError).
        language_liwc_measures[vid][measure_type] = float(num_tweets_with_measure) / liwc_calcs['tc']

## read cascade measures
with open(tweet_obj.ea_cascade_measures_path, 'rb') as pickle_file:
    cascades = pickle.load(pickle_file)
cascade_measures = {}
for vid in filtered_video_ids:
    # Per-user 'min' values of this video, collected once and reused for all four statistics.
    min_values = [cascades[vid][uid]['min'] for uid in cascades[vid]]
    cascade_measures[vid] = {
        'mean': np.mean(min_values),
        'median': np.median(min_values),
        'max': np.amax(min_values),
        # Number of users whose 'min' value equals 1.
        'num_sources': len([value for value in min_values if value == 1]),
    }


# Totals over all videos, printed one number per line: the four tweet types for the
# EA tallies first, then the same four for the all-tweets tallies.
for per_video_info in (ea_videos_tweet_info, all_videos_tweet_info):
    for counter_name in ('original', 'retweet', 'quoted', 'reply'):
        print(sum(per_video_info[video_id][counter_name] for video_id in per_video_info))

# Column layout of ea_measures.csv. Where a CSV header differs from the key it is read
# from, the two are kept side by side as (header, key) pairs in output order.
tweet_type_keys = ['original', 'retweet', 'quoted', 'reply']
structural_columns = [
    ('nw_size', 'nw_size'),
    ('nw_max_indegree', 'nw_max_indegree'),
    ('nw_density', 'nw_density'),
    ('nw_in_degree_centrality_gini', 'nw_in_degree_centrality_gini'),
    ('nw_closeness_centrality_gini', 'nw_closeness_centrality_gini'),
    ('nw_betweenness_centrality_gini', 'nw_betweenness_centrality_gini'),
    ('nw_global_efficiency', 'global_efficiency'),
    ('nw_degree_assortativity (in-in)', 'nw_degree_assortativity'),
    ('nw_assortativity_leaning_probs_abs_nonreciprocal', 'nw_assortativity_leaning_probs_abs_nw1'),
    ('nw_assortativity_leaning_probs_abs_reciprocal', 'nw_assortativity_leaning_probs_abs_nw2'),
    ('nw_assortativity_leaning_labels', 'nw_assortativity_leaning_labels'),
    ('nw_assortativity_num_tweets', 'nw_assortativity_num_tweets'),
]
liwc_keys = ['p1', 'p2', 'p3', 'sad', 'anger', 'anx', 'posemo', 'negemo', 'negate', 'wc', 'dic_wc']
temporal_columns = [
    ('temp_diff_wrt_first_tweet_mean', 'nw_temporal_diff_wrt_first_tweet_mean'),
    ('temp_diff_wrt_first_tweet_median', 'nw_temporal_diff_wrt_first_tweet_median'),
    ('temp_diff_between_pairs_mean', 'nw_temporal_diff_between_pairs_mean'),
    ('temp_diff_between_pairs_median', 'nw_temporal_diff_between_pairs_median'),
    ('temp_life_time', 'nw_life_time'),
    ('temp_diff_between_first_tweets_of_source_users_mean', 'nw_temporal_diff_between_first_tweets_of_source_users_mean'),
    ('temp_diff_between_first_tweets_of_source_users_median', 'nw_temporal_diff_between_first_tweets_of_source_users_median'),
    ('temp_diff_between_max_indegree_user', 'nw_temporal_diff_between_max_indegree_user'),
]
cascade_columns = [
    ('cascade_mean', 'mean'),
    ('cascade_median', 'median'),
    ('cascade_max(depth)', 'max'),
    ('cascade_num_sources', 'num_sources'),
]

# NOTE(review): this cell used to read the pairs-median value under the key on the right,
# which looks like a paste-corrupted form of the key on the left. The well-formed key is
# preferred and the old spelling is used only when the well-formed one is absent, so pickles
# written with either spelling keep working. Confirm against the code that produces
# ea_temporal_measures and drop the fallback once known.
temporal_key_fallbacks = {
    'nw_temporal_diff_between_pairs_median': 'nw_diff_speed_mnw_temporal_diff_between_pairs_median',
}


def _temporal_value(measures, key):
    """Return measures[key], trying the registered fallback spelling when `key` is absent."""
    if key not in measures and key in temporal_key_fallbacks:
        return measures[temporal_key_fallbacks[key]]
    return measures[key]


def _tweet_info_cells(info):
    """Return [#distinct users, original, retweet, quoted, reply, total] for one video tally."""
    counts = [info[tweet_type] for tweet_type in tweet_type_keys]
    return [len(set(info['users']))] + counts + [sum(counts)]


ea_measures_header = (
    ['ID', 'Link', 'Leaning Label', 'Leaning Prob']
    + ['#users (EAs)', '#original_tweets (EAs)', '#retweets (EAs)', '#quoted_tweets (EAs)', '#replies (EAs)', '#total_tweets (EAs)']
    + ['#users (All)', '#original_tweets (All)', '#retweets (All)', '#quoted_tweets (All)', '#replies (All)', '#total_tweets (All)']
    + ['#views by May 2018', '#views first 120 days', '#likes by May 2018', '#dislikes by May 2018', '#comments by May 2018']
    # The 'Hostility' column is filled from the 'divisiveness' score.
    + ['Polarity', 'Intensity', 'Hostility', 'Popularity']
    + ['AVG(watch time in sec.) for first 120 days', 'AVG(watch percentage) for first 120 days']
    + [header for header, key in structural_columns]
    + ['lang_{}'.format(key) for key in liwc_keys]
    + [header for header, key in temporal_columns]
    + [header for header, key in cascade_columns]
)

# Tab-separated export with one row per filtered video.
with open(os.path.join(data_dir, 'ea_measures.csv'), 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='\t')
    csv_writer.writerow(ea_measures_header)
    for vid in filtered_video_ids:
        info = videos_info[vid]
        row = [info['id'], info['link'], info['leaning_label'], info['leaning_prob']]
        row += _tweet_info_cells(ea_videos_tweet_info[vid])
        row += _tweet_info_cells(all_videos_tweet_info[vid])
        row += [info['view_count'], info['view120'], info['like_count'], info['dislike_count'], info['comment_count']]
        row += [info['polarity'], info['intensity'], info['divisiveness'], info['popularity']]
        row += [info['avg_watch'], info['avg_watch_perc']]
        row += [structural_measures[vid][key] for header, key in structural_columns]
        row += [language_liwc_measures[vid][key] for key in liwc_keys]
        row += [_temporal_value(temporal_measures[vid], key) for header, key in temporal_columns]
        row += [cascade_measures[vid][key] for header, key in cascade_columns]
        csv_writer.writerow(row)




## Prepare offline data

In [None]:
states = sorted(location.getStates().keys())


def _read_json(path):
    """Return the decoded content of the JSON file at `path`."""
    with open(path) as json_file:
        return json.load(json_file)


# Every statistic is loaded regardless of the selected campaign; the export below
# indexes each one by state.

# gun statistics
gun_ownership_2015 = _read_json('data/offline_statistics/gun_ownership_2015.json')
gun_per_capita_2017 = _read_json('data/offline_statistics/gun_per_capita_2017.json')
gun_per_capita_2018 = _read_json('data/offline_statistics/gun_per_capita_2018.json')
fatal_injury_by_firearms_all_2017 = _read_json('data/offline_statistics/fatal_injury_by_firearms_all_2017.json')
fatal_injury_by_firearms_violence_2017 = _read_json('data/offline_statistics/fatal_injury_by_firearms_violence_2017.json')
fatal_injury_by_firearms_violence_2018 = _read_json('data/offline_statistics/fatal_injury_by_firearms_violence_2018.json')
fatal_injury_by_firearms_violence_2007_2016 = _read_json('data/offline_statistics/fatal_injury_by_firearms_violence_2007_2016.json')
federal_firearm_licensees_2017 = _read_json('data/offline_statistics/federal_firearm_licensees_2017.json')

# abortion statistics
abo_public_opinion_illegal_2014 = _read_json('data/offline_statistics/abo_public_opinion_illegal_2014.json')
abo_public_opinion_legal_2014 = _read_json('data/offline_statistics/abo_public_opinion_legal_2014.json')
abo_providing_facilities_2017 = _read_json('data/offline_statistics/abo_providing_facilities_2017.json')
abo_num_abortions_per_1000_women_2017 = _read_json('data/offline_statistics/abo_num_abortions_per_1000_women_2017.json')
abo_by_state_of_residence_2014 = _read_json('data/offline_statistics/abo_by_state_of_residence_2014.json')

# BLM statistics: protests per year
blm_protests_2015 = _read_json('data/offline_statistics/blm_protests_2015.json')
blm_protests_2016 = _read_json('data/offline_statistics/blm_protests_2016.json')
blm_protests_2017 = _read_json('data/offline_statistics/blm_protests_2017.json')
blm_protests_2018 = _read_json('data/offline_statistics/blm_protests_2018.json')
blm_protests_2019 = _read_json('data/offline_statistics/blm_protests_2019.json')
blm_protests_2020 = _read_json('data/offline_statistics/blm_protests_2020.json')

# BLM statistics: support
blm_support_apr_2017 = _read_json('data/offline_statistics/blm_support_Apr_2017.json')
blm_support_aug_2017 = _read_json('data/offline_statistics/blm_support_Aug_2017.json')
blm_support_jan_2018 = _read_json('data/offline_statistics/blm_support_Jan_2018.json')
blm_support_jul_2018 = _read_json('data/offline_statistics/blm_support_Jul_2018.json')
blm_support_jan_2019 = _read_json('data/offline_statistics/blm_support_Jan_2019.json')
blm_support_jan_2020 = _read_json('data/offline_statistics/blm_support_Jan_2020.json')

# BLM statistics: opposition
blm_oppose_apr_2017 = _read_json('data/offline_statistics/blm_oppose_Apr_2017.json')
blm_oppose_aug_2017 = _read_json('data/offline_statistics/blm_oppose_Aug_2017.json')
blm_oppose_jan_2018 = _read_json('data/offline_statistics/blm_oppose_Jan_2018.json')
blm_oppose_jul_2018 = _read_json('data/offline_statistics/blm_oppose_Jul_2018.json')
blm_oppose_jan_2019 = _read_json('data/offline_statistics/blm_oppose_Jan_2019.json')
blm_oppose_jan_2020 = _read_json('data/offline_statistics/blm_oppose_Jan_2020.json')


# Per campaign: the (CSV header, per-state statistic) pairs to export, in column order.
# Header strings are load-bearing for anything that reads the CSV and are kept verbatim.
offline_stat_columns = {
    'abo': [
        ('Public Opinion (Should be Legal) (2014)', abo_public_opinion_legal_2014),
        ('Public Opinion (Should be Ilegal) (2014)', abo_public_opinion_illegal_2014),
        ('Abortion Providing Facilities (2017)', abo_providing_facilities_2017),
        ('Abortions per 1000 Women (2017)', abo_num_abortions_per_1000_women_2017),
        ('Abortions by State of Residence (2014)', abo_by_state_of_residence_2014),
    ],
    'gun': [
        ('Gun Ownership (2015)', gun_ownership_2015),
        ('Guns per capita (2017)', gun_per_capita_2017),
        ('Fatal Injuries by Firearms (Violence) (2017)', fatal_injury_by_firearms_violence_2017),
        ('Federal Firearm Licensees (2017)', federal_firearm_licensees_2017),
        ('Guns per capita (2018)', gun_per_capita_2018),
        ('Fatal Injuries by Firearms (Violence) (2018)', fatal_injury_by_firearms_violence_2018),
        ('Fatal Injuries by Firearms (Violence) (2007-2016)', fatal_injury_by_firearms_violence_2007_2016),
    ],
    'blm': [
        ('BLM Protests (2015)', blm_protests_2015),
        ('BLM Protests (2016)', blm_protests_2016),
        ('BLM Protests (2017)', blm_protests_2017),
        ('BLM Protests (2018)', blm_protests_2018),
        ('BLM Protests (2019)', blm_protests_2019),
        ('BLM Protests (2020)', blm_protests_2020),
        ('BLM Support (Apr 2017)', blm_support_apr_2017),
        ('BLM Support (Aug 2017)', blm_support_aug_2017),
        ('BLM Support (Jan 2018)', blm_support_jan_2018),
        ('BLM Support (Jul 2018)', blm_support_jul_2018),
        ('BLM Support (Jan 2019)', blm_support_jan_2019),
        ('BLM Support (Jan 2020)', blm_support_jan_2020),
        ('BLM Oppose (Apr 2017)', blm_oppose_apr_2017),
        ('BLM Oppose (Aug 2017)', blm_oppose_aug_2017),
        ('BLM Oppose (Jan 2018)', blm_oppose_jan_2018),
        ('BLM Oppose (Jul 2018)', blm_oppose_jul_2018),
        ('BLM Oppose (Jan 2019)', blm_oppose_jan_2019),
        ('BLM Oppose (Jan 2020)', blm_oppose_jan_2020),
    ],
}

# Tab-separated export: header row, then one row per state. For a campaign outside
# abo/gun/blm the file is still created but stays empty.
with open(os.path.join(data_dir, 'offline_stats.csv'), 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='\t')
    if campaign in offline_stat_columns:
        campaign_columns = offline_stat_columns[campaign]
        csv_writer.writerow(['State'] + [title for title, stat_by_state in campaign_columns])
        for state in states:
            csv_writer.writerow([state] + [stat_by_state[state] for title, stat_by_state in campaign_columns])


## By-state analysis

In [None]:
#user_type: [EA | All]
user_type = "EA"

# User-location pickles are read inside `with` blocks so the file handles are closed after loading.
with open(tweet_obj.ea_users_locs_path, 'rb') as pickle_file:
    ea_locs = pickle.load(pickle_file)
with open(tweet_obj.users_locs_path, 'rb') as pickle_file:
    all_users_locs = pickle.load(pickle_file)
states = sorted(location.getStates().keys())

# Tweet set and user-location mapping analysed below, selected by user_type.
if user_type == "EA":
    tweets, users_locs = ea_tweets, ea_locs
else:
    tweets, users_locs = all_tweets_from_filtered_videos, all_users_locs
# Leaning labels are derived from the EA tweets for both user types.
video_leanings_probs = tweet_obj.assignVideoLeaningLabels(ea_tweets)

print(len(tweets))

vids = tweet_obj.separateVideosByLeaning(video_leanings_probs)

# Virality score per video: product of the 2nd and 3rd columns of the virality CSV.
# Every row is parsed, so the file must not contain a header row.
virality_scores = {}
with open(video_obj.virality_path) as csv_file:
    for row in csv.reader(csv_file, delimiter=','):
        virality_scores[row[0]] = float(row[1]) * float(row[2])

# Videos left out of the virality sums accumulated further down in this cell.
virality_outlier_vids = ['eUd6Z_zyXZM', 'nJjTpQchohs', 'WULYEegtTGc', 'wZKZ6hpCQqo']


def _new_state_tally():
    """Return a fresh, zeroed tally: tweet-type counts, user list and virality sums."""
    return {'original': 0, 'retweet': 0, 'quoted': 0, 'reply': 0, 'total': 0, 'users': [],
            'virality_original': 0, 'virality_retweet': 0, 'virality_quoted': 0, 'virality_reply': 0,
            'virality_total': 0}


def _record_share(tally, tweet_type, uid, vid):
    """Add one share of video `vid` by user `uid` to `tally` (modified in place).

    Counts are always updated; the virality sums are skipped for videos listed in
    virality_outlier_vids. Raises KeyError when a non-outlier video has no virality score.
    """
    tally[tweet_type] += 1
    tally['total'] += 1
    tally['users'].append(uid)
    if vid not in virality_outlier_vids:
        tally['virality_{}'.format(tweet_type)] += virality_scores[vid]
        tally['virality_total'] += virality_scores[vid]


# Per-state tallies: c_1_k counts shares of videos in vids['L'], c_2_k those in vids['R'],
# and c_all_k every filtered video, including those in neither group.
c_1_k = {}
c_2_k = {}
c_all_k = {}
for state in states:
    c_1_k[state] = _new_state_tally()
    c_2_k[state] = _new_state_tally()
    c_all_k[state] = _new_state_tally()

# Set membership keeps the per-tweet filter cheap whatever container filtered_video_ids is.
filtered_video_id_set = set(filtered_video_ids)

for tid in tweets:
    source = tweets[tid]['_source']
    uid = source['user_id_str']
    user_loc = users_locs[uid]
    # Only users whose location is one of the known states are counted.
    if user_loc not in states:
        continue

    retweeted_tweet_id_str = source['retweeted_tweet_id_str']
    quoted_tweet_id_str = source['quoted_tweet_id_str']
    reply_user_id_str = source['reply_user_id_str']

    # Distinct videos referenced by the tweet; 'N' is the "no video" placeholder.
    video_ids = set(source['original_vids'].split(';')
                    + source['retweeted_vids'].split(';')
                    + source['quoted_vids'].split(';'))
    video_ids.discard('N')

    # Precedence: retweet > quoted > reply > original. A field is set when it is
    # neither None nor 'N'.
    if retweeted_tweet_id_str is not None and retweeted_tweet_id_str != 'N':
        tweet_type = 'retweet'
    elif quoted_tweet_id_str is not None and quoted_tweet_id_str != 'N':
        tweet_type = 'quoted'
    elif reply_user_id_str is not None and reply_user_id_str != 'N':
        tweet_type = 'reply'
    else:
        tweet_type = 'original'

    for vid in video_ids:
        if vid not in filtered_video_id_set:
            continue
        if vid in vids['L']:
            _record_share(c_1_k[user_loc], tweet_type, uid, vid)
        elif vid in vids['R']:
            _record_share(c_2_k[user_loc], tweet_type, uid, vid)
        _record_share(c_all_k[user_loc], tweet_type, uid, vid)

# Measures computed per state, in the key order of the result dicts.
proportion_keys = ['original', 'retweet', 'quoted', 'reply', 'total', 'users',
                   'virality_original', 'virality_retweet', 'virality_quoted', 'virality_reply',
                   'virality_total']


def _ratio_or_nan(numerator, denominator):
    """Return numerator / denominator as a float, or NaN unless denominator > 0.

    A NaN denominator also yields NaN, because the comparison `nan > 0` is False.
    """
    return float(numerator) / denominator if denominator > 0 else np.nan


def _tally_value(tally, key):
    """Return the number used for `key`: distinct-user count for 'users', else the stored value."""
    return len(set(tally[key])) if key == 'users' else tally[key]


# p_1_k / p_2_k: share of c_1_k / c_2_k within c_all_k per state and measure.
# p_0_k: ratio p_1_k / p_2_k per state and measure.
p_1_k = {}
p_2_k = {}
p_0_k = {}
for state in states:
    state_totals = c_all_k[state]
    p_1_k[state] = {
        key: _ratio_or_nan(_tally_value(c_1_k[state], key), _tally_value(state_totals, key))
        for key in proportion_keys
    }
    p_2_k[state] = {
        key: _ratio_or_nan(_tally_value(c_2_k[state], key), _tally_value(state_totals, key))
        for key in proportion_keys
    }
    p_0_k[state] = {
        key: _ratio_or_nan(p_1_k[state][key], p_2_k[state][key])
        for key in proportion_keys
    }


# Dump the per-state measures to a tab-separated file: for every measure the
# p_0_k (= p_1_k / p_2_k), p_1_k and p_2_k values, plus the distinct-user counts.
# Each pair is (key inside p_*_k[state], label used in the column name), in column order.
tweet_measure_columns = [('original', 'original_tweets'),
                         ('retweet', 'retweets'),
                         ('quoted', 'quoted_tweets'),
                         ('reply', 'replies'),
                         ('total', 'total_tweets'),
                         ('users', 'users')]
virality_measure_columns = [('virality_original', 'virality_original'),
                            ('virality_retweet', 'virality_retweet'),
                            ('virality_quoted', 'virality_quoted'),
                            ('virality_reply', 'virality_reply'),
                            ('virality_total', 'virality_total')]

# Column-name suffix per supported user type; any other user_type gets no header row.
header_suffixes = {'EA': 'EAs', 'All': 'All'}

by_state_csv_path = os.path.join(data_dir, 'by_state_online_measures_{}.csv'.format(user_type.lower()))
with open(by_state_csv_path, 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='\t')

    suffix = header_suffixes.get(user_type)
    if suffix is not None:
        header = ['State']
        for measure_key, label in tweet_measure_columns:
            for level in range(3):
                header.append('p_{}_k_{} ({})'.format(level, label, suffix))
        # Raw distinct-user counts sit between the tweet measures and the virality measures.
        for group in ['1', '2', 'all']:
            header.append('#users_{}_k ({})'.format(group, suffix))
        for measure_key, label in virality_measure_columns:
            for level in range(3):
                header.append('p_{}_k_{} ({})'.format(level, label, suffix))
        csv_writer.writerow(header)

    for state in states:
        row = [state]
        for measure_key, label in tweet_measure_columns:
            row.extend([p_0_k[state][measure_key], p_1_k[state][measure_key], p_2_k[state][measure_key]])
        # 'users' lists may repeat a uid, so count distinct uids.
        row.extend([len(set(c_1_k[state]['users'])),
                    len(set(c_2_k[state]['users'])),
                    len(set(c_all_k[state]['users']))])
        for measure_key, label in virality_measure_columns:
            row.extend([p_0_k[state][measure_key], p_1_k[state][measure_key], p_2_k[state][measure_key]])
        csv_writer.writerow(row)



## by STATE, PARTY analysis

In [None]:
# Load the inputs of the by-state / by-party analysis.
# NOTE(review): pickle.load can execute code embedded in the file -- only load trusted pickles.
# The files are opened in `with` blocks so the handles are closed right after loading.
with open(tweet_obj.ea_users_inferred_leanings_scores_path, 'rb') as pickle_file:
    user_leanings_scores = pickle.load(pickle_file)
tweets = ea_tweets
with open(tweet_obj.ea_users_locs_path, 'rb') as pickle_file:
    users_locs = pickle.load(pickle_file)
states = sorted(list(location.getStates().keys()))
video_leanings_probs = tweet_obj.assignVideoLeaningLabels(ea_tweets)
print(len(tweets.keys()))

# `vids` is indexed below with 'L' and 'R' to decide which tally a shared video goes into.
vids = tweet_obj.separateVideosByLeaning(video_leanings_probs)

# Virality score per video id: the product of the 2nd and 3rd CSV columns.
virality_scores = {}
with open(video_obj.virality_path) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for a blank line; skip it instead of failing on row[0].
            continue
        vid = row[0]
        virality = float(row[1]) * float(row[2])
        virality_scores[vid] = virality

# Videos whose scores are left out of the virality sums accumulated later in this cell.
virality_outlier_vids = ['eUd6Z_zyXZM', 'nJjTpQchohs', 'WULYEegtTGc', 'wZKZ6hpCQqo']

def _empty_tally():
    """Return a fresh tally: zeroed tweet counters, an empty uid list and zeroed virality sums."""
    tally = {}
    for counter in ['original', 'retweet', 'quoted', 'reply', 'total']:
        tally[counter] = 0
    # A new list on every call so that no two tallies share the same 'users' object.
    tally['users'] = []
    for counter in ['virality_original', 'virality_retweet', 'virality_quoted', 'virality_reply', 'virality_total']:
        tally[counter] = 0
    return tally


# c_1_k / c_2_k: state -> party ('L', 'R', 'N') -> tally.
c_1_k = {}
c_2_k = {}
for state in states:
    c_1_k[state] = {}
    c_2_k[state] = {}
    for party_label in ['L', 'R', 'N']:
        c_1_k[state][party_label] = _empty_tally()
        c_2_k[state][party_label] = _empty_tally()

# Walk over every tweet and add it to the tally of its author's (state, party):
# c_1_k for shared videos found in vids['L'], c_2_k for those found in vids['R'].
for tid in tweets:
    tweet_source = tweets[tid]['_source']
    uid = tweet_source['user_id_str']
    user_loc = users_locs[uid]

    # Party label from the share of 'right' in the user's left+right leaning scores;
    # users whose two scores sum to zero get no label and are skipped.
    leaning_scores = user_leanings_scores[uid]
    leaning_total = leaning_scores['left'] + leaning_scores['right']
    if leaning_total == 0:
        user_party = None
    else:
        right_prob = float(leaning_scores['right']) / leaning_total
        if right_prob > tweet_obj.predefined_video_leaning_thr['right']:
            user_party = 'R'
        elif right_prob < tweet_obj.predefined_video_leaning_thr['left']:
            user_party = 'L'
        else:
            user_party = 'N'

    if user_party is None or user_loc not in states:
        continue

    retweeted_tweet_id_str = tweet_source['retweeted_tweet_id_str']
    quoted_tweet_id_str = tweet_source['quoted_tweet_id_str']
    reply_user_id_str = tweet_source['reply_user_id_str']

    # Distinct video ids referenced by the tweet itself, its retweeted tweet and its quoted tweet;
    # the literal 'N' entry is dropped from the list.
    referenced_vids = (tweet_source['original_vids'].split(';')
                       + tweet_source['retweeted_vids'].split(';')
                       + tweet_source['quoted_vids'].split(';'))
    video_ids = [video_id for video_id in list(set(referenced_vids)) if video_id != 'N']

    # First matching kind wins: retweet, then quote, then reply, otherwise an original tweet.
    # A field equal to None or 'N' counts as unset.
    if retweeted_tweet_id_str not in (None, 'N'):
        tweet_type = 'retweet'
    elif quoted_tweet_id_str not in (None, 'N'):
        tweet_type = 'quoted'
    elif reply_user_id_str not in (None, 'N'):
        tweet_type = 'reply'
    else:
        tweet_type = 'original'

    virality_type = 'virality_' + tweet_type

    for vid in video_ids:
        if vid not in filtered_video_ids:
            continue
        if vid in vids['L']:
            tally = c_1_k[user_loc][user_party]
        elif vid in vids['R']:
            tally = c_2_k[user_loc][user_party]
        else:
            # Videos in neither leaning group are not tallied.
            continue

        tally[tweet_type] += 1
        tally['total'] += 1
        tally['users'].append(uid)
        # Outlier videos still count as tweets but add nothing to the virality sums.
        if vid not in virality_outlier_vids:
            video_virality = virality_scores[vid]
            tally[virality_type] += video_virality
            tally['virality_total'] += video_virality

# Measures reported per (state, party), in the key order of the result dictionaries.
share_measures = ['original', 'retweet', 'quoted', 'reply', 'total', 'users',
                  'virality_original', 'virality_retweet', 'virality_quoted', 'virality_reply', 'virality_total']


def _tally_value(tally, measure):
    """Return the raw value of `measure` in one tally; 'users' is the number of distinct uids."""
    if measure == 'users':
        return len(set(tally['users']))
    return tally[measure]


def _party_share(state_tallies, party, measure):
    """Return `party`'s fraction of `measure` summed over all parties of a state.

    The result is np.nan when that sum is not positive.
    """
    state_sum = sum([_tally_value(state_tallies[p], measure) for p in state_tallies])
    if state_sum > 0:
        return float(_tally_value(state_tallies[party], measure)) / state_sum
    return np.nan


# p_1_k / p_2_k: party shares computed from c_1_k / c_2_k.
# p_0_k: the p_1_k / p_2_k ratio, np.nan wherever the p_2_k share is not positive.
p_1_k = {}
p_2_k = {}
p_0_k = {}
for state in states:
    p_1_k[state] = {}
    p_2_k[state] = {}
    p_0_k[state] = {}
    for party in ["L", "R", "N"]:
        shares_1 = {}
        shares_2 = {}
        for measure in share_measures:
            shares_1[measure] = _party_share(c_1_k[state], party, measure)
            shares_2[measure] = _party_share(c_2_k[state], party, measure)
        p_1_k[state][party] = shares_1
        p_2_k[state][party] = shares_2

        ratios = {}
        for measure in share_measures:
            if shares_2[measure] > 0:
                ratios[measure] = float(shares_1[measure]) / shares_2[measure]
            else:
                ratios[measure] = np.nan
        p_0_k[state][party] = ratios



# Dump the per-state measures of the 'L' and 'R' parties to a tab-separated file
# (the 'N' party is not exported). For every measure the columns are
# p_0_k (L), p_0_k (R), p_1_k (L), p_1_k (R), p_2_k (L), p_2_k (R).
# Each pair is (key inside p_*_k[state][party], label used in the column name), in column order.
tweet_measure_columns = [('original', 'original_tweets'),
                         ('retweet', 'retweets'),
                         ('quoted', 'quoted_tweets'),
                         ('reply', 'replies'),
                         ('total', 'total_tweets'),
                         ('users', 'users')]
virality_measure_columns = [('virality_original', 'virality_original'),
                            ('virality_retweet', 'virality_retweet'),
                            ('virality_quoted', 'virality_quoted'),
                            ('virality_reply', 'virality_reply'),
                            ('virality_total', 'virality_total')]
exported_parties = ['L', 'R']

with open(os.path.join(data_dir, 'by_state_by_party_online_measures.csv'), 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='\t')

    header = ['State']
    for measure_key, label in tweet_measure_columns:
        for level in range(3):
            for party in exported_parties:
                header.append('p_{}_k_{} ({})'.format(level, label, party))
    for group in ['1', '2', 'all']:
        for party in exported_parties:
            header.append('#users_{}_k ({})'.format(group, party))
    for measure_key, label in virality_measure_columns:
        for level in range(3):
            for party in exported_parties:
                header.append('p_{}_k_{} ({})'.format(level, label, party))
    csv_writer.writerow(header)

    for state in states:
        row = [state]
        for measure_key, label in tweet_measure_columns:
            for shares in (p_0_k, p_1_k, p_2_k):
                for party in exported_parties:
                    row.append(shares[state][party][measure_key])

        # Distinct-user counts: users in c_1_k, users in c_2_k, then users in either.
        for party in exported_parties:
            row.append(len(set(c_1_k[state][party]['users'])))
        for party in exported_parties:
            row.append(len(set(c_2_k[state][party]['users'])))
        for party in exported_parties:
            # Union of the two uid sets: a user present in both tallies is counted once.
            # Adding the two distinct counts would count such a user twice.
            users_in_either = set(c_1_k[state][party]['users']) | set(c_2_k[state][party]['users'])
            row.append(len(users_in_either))

        for measure_key, label in virality_measure_columns:
            for shares in (p_0_k, p_1_k, p_2_k):
                for party in exported_parties:
                    row.append(shares[state][party][measure_key])
        csv_writer.writerow(row)

    
