In [1]:
import pandas as pd
import numpy as np
import os
import scipy.stats
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the data

In [3]:
# Directory holding the time-series CSV files, one level above the notebook's
# working directory.
_data_root_parts = (os.pardir, 'data', 'time_series')
data_root = os.path.join(*_data_root_parts)

In [4]:
# File-name pattern of the per-drifter CSVs. The four placeholders are filled,
# in order, with: method, seed, user type and drifter id.
file_name_template = '{}_{}_sliced_{}_tl_bot{}.csv'

In [5]:
# Drifter id -> seed. Ids 1..15 are assigned in consecutive runs of three per
# seed, following the order of the list below.
_seeds_in_id_order = ['thenation', 'washingtonpost', 'USATODAY', 'WSJ', 'BreitbartNews']
mapping = {
    drifter_id: _seeds_in_id_order[(drifter_id - 1) // 3]
    for drifter_id in range(1, 3 * len(_seeds_in_id_order) + 1)
}

In [6]:
# Timeline variants loaded for every drifter; each value fills the user-type
# slot of the CSV file name.
user_type_list = ['home', 'friend_usr']
# Measures analysed; each value is both the leading part of the CSV file name
# and the prefix of the '<method>_mean_x' / '<method>_mean_y' columns that are
# read after the two timelines are merged.
methods = ['hashtag', 'url']

In [7]:
# Read one CSV per (drifter, method, user type) into a nested lookup:
#   drifter_df_dict[drifter_id] = {'seed': <seed>, 'dfs': {method: {user_type: DataFrame}}}
drifter_df_dict = {}
for key, seed in mapping.items():
    drifter_df_dict[key] = {
        'seed': seed,
        'dfs': {
            method: {
                user_type: pd.read_csv(
                    os.path.join(
                        data_root,
                        file_name_template.format(method, seed, user_type, key),
                    )
                )
                for user_type in user_type_list
            }
            for method in methods
        },
    }

# T-test for individual drifters

In [8]:
def do_t_test(df, method):
    """Run a one-sample t-test on the paired differences of two columns.

    The differences ``df['<method>_mean_x'] - df['<method>_mean_y']`` are
    tested against a population mean of 0, and Cohen's d is computed for the
    same differences.

    Rows whose difference is missing (NaN) are dropped before anything is
    computed. Previously the t-test propagated NaN while the mean and the
    standard deviation silently skipped it, so the three returned values could
    describe different samples. Callers that report degrees of freedom from
    ``len(df) - 1`` should be aware that dropped rows are not counted here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``'<method>_mean_x'`` and ``'<method>_mean_y'``
        columns.
    method : str
        Prefix used to build the two column names.

    Returns
    -------
    tuple
        ``(t_stat, pvalue, cohen_d)`` where ``t_stat`` and ``pvalue`` are the
        values returned by ``scipy.stats.ttest_1samp`` (two-sided by default)
        and ``cohen_d`` is ``|mean| / std`` of the differences, with the
        sample standard deviation (``ddof=1``).
    """
    x_column = '{}_mean_x'.format(method)
    y_column = '{}_mean_y'.format(method)
    # Drop missing differences once so the test and the effect size share one sample.
    samples = (df[x_column] - df[y_column]).dropna()
    t_stat, pvalue = scipy.stats.ttest_1samp(samples, 0)
    cohen_d = abs(samples.mean()) / samples.std(ddof=1)
    return t_stat, pvalue, cohen_d

In [9]:
# For every drifter and every method, compare the 'home' timeline with the
# 'friend_usr' timeline on matching dates and collect one summary row.
result = []
for key in mapping:
    dfs_by_method = drifter_df_dict[key]['dfs']
    for method in methods:
        # Merge on 'date'; with pandas' default suffixes the shared columns come
        # back as '<name>_x' (home) and '<name>_y' (friend_usr), which is what
        # do_t_test reads.
        temp_df = dfs_by_method[method]['home'].merge(dfs_by_method[method]['friend_usr'], on='date')
        print(key, method, len(temp_df))

        t_stat, pvalue, cohen_d = do_t_test(temp_df, method)
        # Halve the p-value from do_t_test to report a one-sided value.
        # NOTE(review): halving only yields a valid one-sided p-value when the
        # sign of t_stat matches the hypothesised direction; confirm which
        # direction is intended before relying on the significance flags.
        pvalue = pvalue / 2

        # Bucket Cohen's d with cut-offs at 0.5 and 0.8.
        effect_size = 'small' if cohen_d < 0.5 else 'medium' if cohen_d < 0.8 else 'large'

        row = [
            key, drifter_df_dict[key]['seed'], method,
            t_stat, pvalue, pvalue < 0.05, pvalue < 0.01,
            cohen_d, effect_size,
            len(temp_df) - 1,  # reported as degrees of freedom
        ]
        result.append(row)

1 hashtag 127
1 url 130
2 hashtag 129
2 url 128
3 hashtag 129
3 url 129
4 hashtag 129
4 url 129
5 hashtag 129
5 url 130
6 hashtag 124
6 url 125
7 hashtag 130
7 url 123
8 hashtag 130
8 url 129
9 hashtag 129
9 url 129
10 hashtag 130
10 url 127
11 hashtag 128
11 url 127
12 hashtag 121
12 url 120
13 hashtag 110
13 url 108
14 hashtag 110
14 url 111
15 hashtag 131
15 url 131


In [10]:
# Column labels, in the same order as the values stored in each row of `result`.
result_columns = [
    'drifter_id', 'seed', 'method',
    't_stat', 'pvalue', 'significant_05', 'significant_01',
    'cohen_d', 'effect_size', 'degree_freedom',
]
result_df = pd.DataFrame(data=result, columns=result_columns)


In [11]:
# Show only the rows computed with the hashtag method.
result_df[result_df['method'] == 'hashtag']

Unnamed: 0,drifter_id,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
0,1,thenation,hashtag,-1.626545,0.05316604,False,False,0.144333,small,126
2,2,thenation,hashtag,-3.267319,0.0006969685,True,True,0.287671,small,128
4,3,thenation,hashtag,-12.669569,1.206509e-24,True,True,1.115493,large,128
6,4,washingtonpost,hashtag,-2.730299,0.003609957,True,True,0.240389,small,128
8,5,washingtonpost,hashtag,-2.7804,0.003124516,True,True,0.244801,small,128
10,6,washingtonpost,hashtag,9.714708,3.330731e-17,True,True,0.872407,large,123
12,7,USATODAY,hashtag,6.244094,2.84482e-09,True,True,0.547643,medium,129
14,8,USATODAY,hashtag,-1.810612,0.03626411,True,False,0.158801,small,129
16,9,USATODAY,hashtag,1.323339,0.09404086,False,False,0.116513,small,128
18,10,WSJ,hashtag,-8.455488,2.571669e-14,True,True,0.741595,medium,129


In [12]:
# Show only the rows computed with the url method.
result_df[result_df['method'] == 'url']

Unnamed: 0,drifter_id,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
1,1,thenation,url,8.903309,2.143155e-15,True,True,0.780872,medium,129
3,2,thenation,url,11.889763,1.126322e-22,True,True,1.050917,large,127
5,3,thenation,url,19.717014,6.577809999999999e-41,True,True,1.735986,large,128
7,4,washingtonpost,url,-5.493706,1.019964e-07,True,True,0.483694,small,128
9,5,washingtonpost,url,-5.243816,3.128204e-07,True,True,0.459913,small,129
11,6,washingtonpost,url,-1.95108,0.02665137,True,False,0.17451,small,124
13,7,USATODAY,url,-4.261684,2.009482e-05,True,True,0.384263,small,122
15,8,USATODAY,url,-11.234018,4.217385e-21,True,True,0.9891,large,128
17,9,USATODAY,url,-2.158791,0.01636594,True,False,0.190071,small,128
19,10,WSJ,url,-18.152073,2.86816e-37,True,True,1.610736,large,126


# T-test for drifters pooled by seed

In [13]:
# Seed -> list of drifter ids, derived by inverting `mapping` (drifter id -> seed)
# instead of being maintained by hand, so the two lookups cannot drift apart.
# Iteration follows `mapping`'s insertion order, so seeds and ids keep their order.
reverse_mapping = {}
for drifter_id, seed in mapping.items():
    reverse_mapping.setdefault(seed, []).append(drifter_id)

In [18]:
# For every seed and every method, stack the merged home/friend_usr frames of
# all drifters in the group and run the same test on the stacked differences.
combined_result = []
for seed, drifter_ids in reverse_mapping.items():
    for method in methods:
        # One merged frame per drifter, joined on 'date' exactly as in the
        # per-drifter analysis.
        temp_dfs = [
            drifter_df_dict[drifter_id]['dfs'][method]['home'].merge(
                drifter_df_dict[drifter_id]['dfs'][method]['friend_usr'], on='date'
            )
            for drifter_id in drifter_ids
        ]
        combined_df = pd.concat(temp_dfs)

        t_stat, pvalue, cohen_d = do_t_test(combined_df, method)
        # Halve the p-value from do_t_test to report a one-sided value.
        # NOTE(review): halving only yields a valid one-sided p-value when the
        # sign of t_stat matches the hypothesised direction; confirm which
        # direction is intended before relying on the significance flags.
        pvalue = pvalue / 2

        # Bucket Cohen's d with cut-offs at 0.5 and 0.8.
        effect_size = 'small' if cohen_d < 0.5 else 'medium' if cohen_d < 0.8 else 'large'

        row = [
            seed, method,
            t_stat, pvalue, pvalue < 0.05, pvalue < 0.01,
            cohen_d, effect_size,
            len(combined_df) - 1,  # reported as degrees of freedom
        ]
        combined_result.append(row)

In [19]:
# Column labels, in the same order as the values stored in each row of
# `combined_result`.
combined_result_columns = [
    'seed', 'method',
    't_stat', 'pvalue', 'significant_05', 'significant_01',
    'cohen_d', 'effect_size', 'degree_freedom',
]
combined_result_df = pd.DataFrame(data=combined_result, columns=combined_result_columns)


In [20]:
# Show only the pooled rows computed with the hashtag method.
combined_result_df[combined_result_df['method'] == 'hashtag']

Unnamed: 0,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
0,thenation,hashtag,-6.47431,1.456901e-10,True,True,0.329961,small,384
2,washingtonpost,hashtag,2.513181,0.006188345,True,True,0.128586,small,381
4,USATODAY,hashtag,2.611772,0.004678492,True,True,0.132422,small,388
6,WSJ,hashtag,4.696627,1.853698e-06,True,True,0.24125,small,378
8,BreitbartNews,hashtag,-10.387262,1.645783e-22,True,True,0.554431,medium,350


In [21]:
# Show only the pooled rows computed with the url method.
combined_result_df[combined_result_df['method'] == 'url']

Unnamed: 0,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
1,thenation,url,17.751165,2.850981e-52,True,True,0.902342,large,386
3,washingtonpost,url,-7.067937,3.743983e-12,True,True,0.360684,small,383
5,USATODAY,url,-8.80429,2.35836e-17,True,True,0.451057,small,380
7,WSJ,url,-11.573221,5.6234960000000005e-27,True,True,0.598437,medium,373
9,BreitbartNews,url,-20.924612,7.021069e-64,True,True,1.118468,large,349
