In [1]:
import pandas as pd
import numpy as np
import os
import scipy.stats
import matplotlib.pyplot as plt
from functools import reduce


In [2]:
%matplotlib inline

# Load the data

In [3]:
# Directory holding the per-bot time-series CSV files, relative to this notebook.
data_root = os.path.join(os.pardir, 'data', 'time_series')

In [4]:
# File-name pattern of one time-series CSV. The loading cell fills the four
# placeholders, in order, with (method, seed, user_type, key).
file_name_template = '{}_{}_sliced_{}_tl_bot{}.csv'

In [5]:
# Bot id -> seed name. Each seed owns three consecutive bot ids, so the table is
# generated from the first id of every seed instead of being spelled out.
mapping = {
    bot_id: seed_name
    for first_bot_id, seed_name in [
        (1, 'thenation'),
        (4, 'washingtonpost'),
        (7, 'USATODAY'),
        (10, 'WSJ'),
        (13, 'BreitbartNews'),
    ]
    for bot_id in range(first_bot_id, first_bot_id + 3)
}

In [6]:
# Timeline kinds loaded for every bot; each value is the third placeholder of the
# CSV file name. The alignment cell later subtracts 'friend_usr' from 'home'.
user_type_list = ['home', 'friend_usr']
# Scoring methods; each value is the first placeholder of the CSV file name and
# the prefix of the '<method>_mean_x' / '<method>_mean_y' columns read after merging.
methods = ['hashtag', 'url']

In [7]:
# Read every CSV once and index the frames as
# drifter_df_dict[bot_id]['dfs'][method][user_type]; the seed is stored next to them.
drifter_df_dict = {}
for key, seed in mapping.items():
    drifter_df_dict[key] = {
        'seed': seed,
        'dfs': {
            method: {
                user_type: pd.read_csv(
                    os.path.join(
                        data_root,
                        file_name_template.format(method, seed, user_type, key),
                    )
                )
                for user_type in user_type_list
            }
            for method in methods
        },
    }

# T-test for different groups

In [8]:
# Seed name -> list of the three consecutive bot ids that were started from it.
reverse_mapping = {
    seed_name: list(range(first_bot_id, first_bot_id + 3))
    for seed_name, first_bot_id in [
        ('thenation', 1),
        ('washingtonpost', 4),
        ('USATODAY', 7),
        ('WSJ', 10),
        ('BreitbartNews', 13),
    ]
}

In [9]:
# Seed name -> display label used as column / group name in the result tables.
# The pair order fixes the row order of the t-test summaries.
name_mapping = dict([
    ('thenation', 'Left'),
    ('washingtonpost', 'C. Left'),
    ('USATODAY', 'Center'),
    ('WSJ', 'C. Right'),
    ('BreitbartNews', 'Right'),
])

For each bot, calculate the difference between the political alignment score of its home timeline and that of its friends’ user timelines (home minus friends), matching the two series by date.

In [10]:
# alignment_diffs[method] is a list with one single-column frame per seed; the
# column (named after the seed) holds home-minus-friend scores of all its bots.
alignment_diffs = {}
for method in methods:
    # After the merge, pandas suffixes the shared score column: _x = home, _y = friend_usr.
    home_col = '{}_mean_x'.format(method)
    friend_col = '{}_mean_y'.format(method)

    per_seed_diffs = []
    for seed, drifter_ids in reverse_mapping.items():
        # Pair each bot's home and friend timelines on matching dates.
        merged_frames = [
            drifter_df_dict[drifter_id]['dfs'][method]['home'].merge(
                drifter_df_dict[drifter_id]['dfs'][method]['friend_usr'], on='date'
            )
            for drifter_id in drifter_ids
        ]
        combined_df = pd.concat(merged_frames)
        seed_diff = combined_df[home_col] - combined_df[friend_col]
        # Discard the repeated per-bot index so rows are numbered 0..n-1.
        per_seed_diffs.append(seed_diff.reset_index(drop=True).to_frame(name=seed))
    alignment_diffs[method] = per_seed_diffs

In [11]:
def concat_dfs(dfs):
    """Join the frames in ``dfs`` side by side and relabel their columns.

    The frames are combined pairwise, left to right, with an outer join on the
    index, so shorter frames are padded with NaN. Columns whose names are keys
    of the notebook-level ``name_mapping`` are renamed to the mapped label.
    """
    def _outer_join_on_index(left, right):
        return left.merge(right, how='outer', left_index=True, right_index=True)

    joined = reduce(_outer_join_on_index, dfs)
    return joined.rename(columns=name_mapping)

In [12]:
# Home-minus-friend differences for the 'url' method, one column per group label.
alignment_diffs_url = concat_dfs(alignment_diffs['url'])

In [13]:
# Optional export of the raw url differences for sharing (disabled; uncomment to write the CSV)
# alignment_diffs_url.to_csv("table_s3_url.csv", index=None)

In [14]:
# Home-minus-friend differences for the 'hashtag' method, one column per group label.
alignment_diffs_hashtag = concat_dfs(alignment_diffs['hashtag'])

In [15]:
# Optional export of the raw hashtag differences for sharing (disabled; uncomment to write the CSV)
# alignment_diffs_hashtag.to_csv("table_s3_hashtag.csv", index=None)

In [16]:
def do_t_test(samples, popmean=0):
    """Run a one-sample t-test on ``samples`` and report Cohen's d.

    Parameters
    ----------
    samples : array-like
        One-dimensional numeric observations (pandas Series, NumPy array or
        plain sequence). Pass data without missing values; the notebook calls
        ``.dropna()`` before handing a column to this function.
    popmean : float, optional
        Mean under the null hypothesis. Defaults to 0, i.e. "no difference".

    Returns
    -------
    t_stat : float
        Test statistic from ``scipy.stats.ttest_1samp``.
    pvalue : float
        p-value from ``scipy.stats.ttest_1samp``.
    cohen_d : float
        Absolute distance between the sample mean and ``popmean`` divided by
        the sample standard deviation (``ddof=1``).
    """
    t_stat, pvalue = scipy.stats.ttest_1samp(samples, popmean)
    # np.mean (rather than samples.mean()) lets lists and arrays through while
    # still delegating to the Series method for pandas input.
    cohen_d = abs(np.mean(samples) - popmean) / np.std(samples, ddof=1)
    return t_stat, pvalue, cohen_d

In [17]:
# One-sample t-test (against 0) of the url-based differences, one summary row per group.
url_t_test_results = []
for group_label in name_mapping.values():
    group_diffs = alignment_diffs_url[group_label]
    t_stat, pvalue, cohen_d = do_t_test(group_diffs.dropna())

    # Effect-size bucket: d < 0.5 -> small, d < 0.8 -> medium, anything else -> large.
    effect_size = 'small' if cohen_d < 0.5 else ('medium' if cohen_d < 0.8 else 'large')

    summary_row = [
        group_label,
        "link",
        t_stat,
        pvalue,
        pvalue < 0.05,
        pvalue < 0.01,
        cohen_d,
        effect_size,
        group_diffs.count(),  # number of non-missing differences
    ]
    url_t_test_results.append(summary_row)

url_t_test_results_df = pd.DataFrame(
    url_t_test_results,
    columns=[
        'group',
        'method',
        't_stat',
        'pvalue',
        'significant_05',
        'significant_01',
        'cohen_d',
        'effect_size',
        'n',
    ],
)


In [18]:
url_t_test_results_df

Unnamed: 0,group,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,n
0,Left,link,4.146558,4.144082e-05,True,True,0.2097,small,391
1,C. Left,link,-2.324818,0.02059695,True,False,0.117873,small,389
2,Center,link,-15.156787,3.611706e-41,True,True,0.764559,medium,393
3,C. Right,link,-4.092429,5.205458e-05,True,True,0.20857,small,385
4,Right,link,-4.954729,1.128585e-06,True,True,0.264088,small,352


In [19]:
# One-sample t-test (against 0) of the hashtag-based differences, one summary row per group.
hashtag_t_test_results = []
for group_label in name_mapping.values():
    group_diffs = alignment_diffs_hashtag[group_label]
    t_stat, pvalue, cohen_d = do_t_test(group_diffs.dropna())

    # Effect-size bucket: d < 0.5 -> small, d < 0.8 -> medium, anything else -> large.
    effect_size = 'small' if cohen_d < 0.5 else ('medium' if cohen_d < 0.8 else 'large')

    summary_row = [
        group_label,
        "hashtag",
        t_stat,
        pvalue,
        pvalue < 0.05,
        pvalue < 0.01,
        cohen_d,
        effect_size,
        group_diffs.count(),  # number of non-missing differences
    ]
    hashtag_t_test_results.append(summary_row)

hashtag_t_test_results_df = pd.DataFrame(
    hashtag_t_test_results,
    columns=[
        'group',
        'method',
        't_stat',
        'pvalue',
        'significant_05',
        'significant_01',
        'cohen_d',
        'effect_size',
        'n',
    ],
)


In [20]:
hashtag_t_test_results_df

Unnamed: 0,group,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,n
0,Left,hashtag,-6.033468,3.763578e-09,True,True,0.306698,small,387
1,C. Left,hashtag,2.573025,0.01045799,True,False,0.131475,small,383
2,Center,hashtag,1.797072,0.07309583,False,False,0.090766,small,392
3,C. Right,hashtag,4.669914,4.186471e-06,True,True,0.239247,small,381
4,Right,hashtag,-10.573009,7.251819e-23,True,True,0.563543,medium,352
