In [1]:
import pandas as pd
import numpy as np
import os
import scipy.stats
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

# Load the data

In [3]:
# Relative path of the directory the time-series CSV files are read from.
data_root = os.path.join(
    '..',
    'data',
    'time_series',
)

In [4]:
# CSV file-name pattern; the four placeholders are filled, in order, with
# the method, the seed, the user type and the drifter id.
file_name_template = (
    '{}_{}_sliced_{}_tl_bot{}.csv'
)

In [5]:
# Drifter id -> seed. Ids run from 1 to 15 and every seed owns three
# consecutive ids (1-3, 4-6, ...), in the order the seeds are listed here.
mapping = {
    3 * block + offset: seed_name
    for block, seed_name in enumerate(
        ['thenation', 'washingtonpost', 'USATODAY', 'WSJ', 'BreitbartNews']
    )
    for offset in (1, 2, 3)
}

In [6]:
# Values substituted into the CSV file names when loading the data.
# Measurement methods (also used as the column prefix '<method>_mean').
methods = ['hashtag', 'url']
# User-type slices; one CSV is loaded per (method, user type, drifter).
user_type_list = ['home', 'friend_usr']

In [7]:
# Load every CSV into a nested lookup:
#   drifter_df_dict[drifter_id] = {'seed': <seed>, 'dfs': {method: {user_type: DataFrame}}}
# Files are read method by method, and within a method user type by user type.
drifter_df_dict = {}
for key, seed in mapping.items():
    drifter_df_dict[key] = {
        'seed': seed,
        'dfs': {
            method: {
                user_type: pd.read_csv(
                    os.path.join(
                        data_root,
                        file_name_template.format(method, seed, user_type, key),
                    )
                )
                for user_type in user_type_list
            }
            for method in methods
        },
    }

# T-test for individual drifters

In [14]:
def do_t_test(df, method):
    """Run a one-sample t-test on the paired differences for ``method``.

    The sample is ``df['<method>_mean_x'] - df['<method>_mean_y']`` and is
    tested against a population mean of 0.

    Rows whose difference is NaN are dropped before anything is computed, so
    that the t statistic, the p-value and the effect size all describe the
    same observations. Without this, scipy propagates the NaN into the
    t statistic and p-value while the pandas mean/std silently skip it,
    yielding a NaN p-value next to a finite effect size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``<method>_mean_x`` and
        ``<method>_mean_y``.
    method : str
        Column prefix selecting which pair of columns to compare.

    Returns
    -------
    tuple
        ``(t_stat, pvalue, cohen_d)`` where ``t_stat`` and ``pvalue`` come
        from ``scipy.stats.ttest_1samp`` with its default settings, and
        ``cohen_d`` is ``|mean| / std`` of the differences using the sample
        standard deviation (``ddof=1``); it is always non-negative.
    """
    column_x = '{}_mean_x'.format(method)
    column_y = '{}_mean_y'.format(method)
    # Drop missing differences once so every statistic uses the same rows.
    samples = (df[column_x] - df[column_y]).dropna()
    t_stat, pvalue = scipy.stats.ttest_1samp(samples, 0)
    cohen_d = abs(samples.mean()) / samples.std(ddof=1)
    return t_stat, pvalue, cohen_d

In [15]:
# One-sample t-test for every (drifter, method) pair on the differences
# between the 'home' and 'friend_usr' series; one row of statistics is
# appended to `result` per pair.
result = []
for key in mapping:
    drifter_entry = drifter_df_dict[key]
    for method in methods:
        frames_by_user_type = drifter_entry['dfs'][method]
        # Join on 'date'; do_t_test reads the resulting *_x / *_y columns.
        paired_df = frames_by_user_type['home'].merge(
            frames_by_user_type['friend_usr'], on='date'
        )
        print(key, method, len(paired_df))

        t_stat, two_sided_p, cohen_d = do_t_test(paired_df, method)
        # NOTE(review): halving the two-sided p-value gives a one-sided
        # p-value only for the alternative pointing in the direction of the
        # observed t_stat; confirm the hypothesised direction was fixed in
        # advance, otherwise rows with either sign are tested at twice the
        # nominal level.
        pvalue = two_sided_p / 2

        # Label the effect size using the 0.5 / 0.8 cut-offs.
        effect_size = (
            'small' if cohen_d < 0.5
            else 'medium' if cohen_d < 0.8
            else 'large'
        )

        row = [
            key,
            drifter_entry['seed'],
            method,
            t_stat,
            pvalue,
            pvalue < 0.05,
            pvalue < 0.01,
            cohen_d,
            effect_size,
        ]
        result.append(row)

1 hashtag 128
1 url 131
2 hashtag 129
2 url 129
3 hashtag 130
3 url 130
4 hashtag 130
4 url 130
5 hashtag 129
5 url 132
6 hashtag 124
6 url 125
7 hashtag 131
7 url 124
8 hashtag 131
8 url 130
9 hashtag 130
9 url 130
10 hashtag 131
10 url 128
11 hashtag 128
11 url 128
12 hashtag 122
12 url 121
13 hashtag 110
13 url 108
14 hashtag 110
14 url 111
15 hashtag 132
15 url 132


In [16]:
# Tabulate the per-drifter statistics; the column order matches the order
# in which each row of `result` was assembled.
result_df = pd.DataFrame(
    data=result,
    columns=[
        'drifter_id', 'seed', 'method',
        't_stat', 'pvalue',
        'significant_05', 'significant_01',
        'cohen_d', 'effect_size',
    ],
)


In [17]:
# Per-drifter results restricted to the hashtag method.
result_df.query(expr='method == "hashtag"')

Unnamed: 0,drifter_id,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size
0,1,thenation,hashtag,-1.636851,0.05206862,False,False,0.144679,small
2,2,thenation,hashtag,-3.200864,0.0008640818,True,True,0.28182,small
4,3,thenation,hashtag,-12.953258,2.12058e-25,True,True,1.136076,large
6,4,washingtonpost,hashtag,-2.787586,0.003056749,True,True,0.244487,small
8,5,washingtonpost,hashtag,-2.938476,0.0019568,True,True,0.258718,small
10,6,washingtonpost,hashtag,10.294703,1.318026e-18,True,True,0.924492,large
12,7,USATODAY,hashtag,5.997944,9.277512e-09,True,True,0.524043,medium
14,8,USATODAY,hashtag,-2.107258,0.01850812,True,False,0.184112,small
16,9,USATODAY,hashtag,0.443771,0.3289757,False,False,0.038921,small
18,10,WSJ,hashtag,-8.628202,9.454273e-15,True,True,0.75385,medium


In [18]:
# Per-drifter results restricted to the url method.
result_df.query(expr='method == "url"')

Unnamed: 0,drifter_id,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size
1,1,thenation,url,9.732323,1.888615e-17,True,True,0.850317,large
3,2,thenation,url,13.273884,4.001802e-26,True,True,1.1687,large
5,3,thenation,url,23.331027,2.0055019999999998e-48,True,True,2.046266,large
7,4,washingtonpost,url,-2.394338,0.009044539,True,True,0.209997,small
9,5,washingtonpost,url,-3.633916,0.0001998253,True,True,0.316292,small
11,6,washingtonpost,url,2.309245,0.01129216,True,False,0.206545,small
13,7,USATODAY,url,-3.521834,0.0003010486,True,True,0.31627,small
15,8,USATODAY,url,-8.649845,8.779915e-15,True,True,0.758642,medium
17,9,USATODAY,url,0.810486,0.2095769,False,False,0.071084,small
19,10,WSJ,url,-16.369727,2.191129e-33,True,True,1.446893,large


# T-test for groups of drifters sharing the same seed

In [19]:
# Seed -> list of its three drifter ids (consecutive blocks 1-3, 4-6, ...),
# keyed in the order the seeds are listed here.
reverse_mapping = {
    seed_name: [3 * block + offset for offset in (1, 2, 3)]
    for block, seed_name in enumerate(
        ['thenation', 'washingtonpost', 'USATODAY', 'WSJ', 'BreitbartNews']
    )
}

In [24]:
# One-sample t-test per (seed, method) on the differences pooled over all
# drifters that share the seed; one row is appended to `combined_result`
# per pair.
combined_result = []
for seed, drifter_ids in reverse_mapping.items():
    for method in methods:
        # Join 'home' and 'friend_usr' on 'date' for each drifter, then stack
        # the joined frames in drifter order.
        paired_frames = [
            drifter_df_dict[drifter_id]['dfs'][method]['home'].merge(
                drifter_df_dict[drifter_id]['dfs'][method]['friend_usr'],
                on='date',
            )
            for drifter_id in drifter_ids
        ]
        combined_df = pd.concat(paired_frames)

        t_stat, two_sided_p, cohen_d = do_t_test(combined_df, method)
        # NOTE(review): halving the two-sided p-value gives a one-sided
        # p-value only for the alternative pointing in the direction of the
        # observed t_stat; confirm the hypothesised direction was fixed in
        # advance, otherwise rows with either sign are tested at twice the
        # nominal level.
        pvalue = two_sided_p / 2

        # Label the effect size using the 0.5 / 0.8 cut-offs.
        effect_size = (
            'small' if cohen_d < 0.5
            else 'medium' if cohen_d < 0.8
            else 'large'
        )

        row = [
            seed,
            method,
            t_stat,
            pvalue,
            pvalue < 0.05,
            pvalue < 0.01,
            cohen_d,
            effect_size,
        ]
        combined_result.append(row)

In [25]:
# Tabulate the pooled per-seed statistics; the column order matches the
# order in which each row of `combined_result` was assembled.
combined_result_df = pd.DataFrame(
    data=combined_result,
    columns=[
        'seed', 'method',
        't_stat', 'pvalue',
        'significant_05', 'significant_01',
        'cohen_d', 'effect_size',
    ],
)


In [26]:
# Pooled per-seed results restricted to the hashtag method.
combined_result_df.query(expr='method == "hashtag"')

Unnamed: 0,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size
0,thenation,hashtag,-6.291715,4.26199e-10,True,True,0.319826,small
2,washingtonpost,hashtag,2.570525,0.005266233,True,True,0.131348,small
4,USATODAY,hashtag,1.767523,0.03896028,True,False,0.089273,small
6,WSJ,hashtag,4.514519,4.238545e-06,True,True,0.231286,small
8,BreitbartNews,hashtag,-10.812261,5.178643e-24,True,True,0.576295,medium


In [27]:
# Pooled per-seed results restricted to the url method.
combined_result_df.query(expr='method == "url"')

Unnamed: 0,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size
1,thenation,url,19.614308,2.353891e-60,True,True,0.993209,large
3,washingtonpost,url,-1.983577,0.02400556,True,False,0.100831,small
5,USATODAY,url,-6.023337,2.005205e-09,True,True,0.307377,small
7,WSJ,url,-8.912869,1.0897770000000001e-17,True,True,0.459036,small
9,BreitbartNews,url,-17.529014,3.9760439999999996e-50,True,True,0.93563,large
