In [1]:
import pandas as pd
import numpy as np
import os
import scipy.stats
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline in the notebook output, below the cell that draws them.
%matplotlib inline

# Load the data

In [3]:
# Directory the time-series CSV files are read from; the path is relative,
# so it resolves against the kernel's current working directory.
data_root = os.path.join('..', 'data', 'time_series')

In [4]:
# CSV file name pattern. The loading loop fills the four placeholders, in order,
# with: method, seed account name, user type, drifter id.
file_name_template = '{}_{}_sliced_{}_tl_bot{}.csv'

In [5]:
# Drifter id -> seed account name. Ids 1-15 are handed out in consecutive
# groups of three, one group per seed account, in the order listed here.
mapping = {
    drifter_id: seed_name
    for group_index, seed_name in enumerate(
        ['thenation', 'washingtonpost', 'USATODAY', 'WSJ', 'BreitbartNews']
    )
    for drifter_id in range(3 * group_index + 1, 3 * group_index + 4)
}

In [6]:
# Timeline variants loaded for every drifter; each string is used in the CSV file
# name and as a dictionary key for the loaded frame.
# NOTE(review): presumably 'home' is the drifter's home timeline and 'friend_usr'
# the user timelines of its friends — confirm against the data export.
user_type_list = ['home', 'friend_usr']
# Measures analysed; each string is used in the CSV file name and as the prefix of
# the '{method}_mean_x' / '{method}_mean_y' columns read by do_t_test.
methods = ['hashtag', 'url']

In [7]:
# Load one CSV per (drifter, method, user type) into a nested structure:
#   drifter_df_dict[drifter_id] = {'seed': <seed name>, 'dfs': {method: {user_type: DataFrame}}}
drifter_df_dict = {}
for drifter_id, seed_name in mapping.items():
    frames_by_method = {
        method: {
            user_type: pd.read_csv(
                os.path.join(
                    data_root,
                    file_name_template.format(method, seed_name, user_type, drifter_id),
                )
            )
            for user_type in user_type_list
        }
        for method in methods
    }
    drifter_df_dict[drifter_id] = {'seed': seed_name, 'dfs': frames_by_method}

# T-test for individual drifters

In [8]:
def do_t_test(df, method):
    """Run a one-sample t-test on the paired differences of two columns.

    The differences are ``df['{method}_mean_x'] - df['{method}_mean_y']``
    (the ``_x`` / ``_y`` names match what a pandas merge produces with its
    default suffixes), and they are tested against a population mean of 0.

    Parameters
    ----------
    df : DataFrame-like with the two ``{method}_mean_x`` / ``{method}_mean_y`` columns.
    method : str used to build the two column names.

    Returns
    -------
    tuple of (t statistic, p-value, Cohen's d), where Cohen's d is the absolute
    mean difference divided by the sample standard deviation (ddof=1), so it
    carries no sign.
    """
    left_col = '{}_mean_x'.format(method)
    right_col = '{}_mean_y'.format(method)
    differences = df[left_col] - df[right_col]

    test_outcome = scipy.stats.ttest_1samp(differences, 0)

    # Effect size relative to the null value 0; magnitude only.
    spread = np.std(differences, ddof=1)
    effect = abs(differences.mean()) / spread
    return test_outcome[0], test_outcome[1], effect

In [9]:
# Per-drifter analysis: for every drifter and method, align the 'home' and
# 'friend_usr' frames on 'date' and t-test the paired differences.
result = []
for drifter_id in mapping:
    drifter_entry = drifter_df_dict[drifter_id]
    for method in methods:
        frames = drifter_entry['dfs'][method]
        # 'home' is the left side of the merge and 'friend_usr' the right side.
        paired_df = frames['home'].merge(frames['friend_usr'], on='date')
        print(drifter_id, method, len(paired_df))
        t_stat, pvalue, cohen_d = do_t_test(paired_df, method)

        # Thresholds 0.5 and 0.8 split d into small / medium / large. A NaN d
        # fails both comparisons and therefore ends up labelled 'large'.
        effect_size = 'small' if cohen_d < 0.5 else ('medium' if cohen_d < 0.8 else 'large')

        row = [
            drifter_id,
            drifter_entry['seed'],
            method,
            t_stat,
            pvalue,
            pvalue < 0.05,
            pvalue < 0.01,
            cohen_d,
            effect_size,
            len(paired_df) - 1,  # degrees of freedom: number of merged rows minus one
        ]
        result.append(row)

1 hashtag 128
1 url 131
2 hashtag 129
2 url 130
3 hashtag 130
3 url 130
4 hashtag 130
4 url 130
5 hashtag 129
5 url 133
6 hashtag 124
6 url 126
7 hashtag 131
7 url 132
8 hashtag 131
8 url 131
9 hashtag 130
9 url 130
10 hashtag 131
10 url 131
11 hashtag 128
11 url 129
12 hashtag 122
12 url 125
13 hashtag 110
13 url 109
14 hashtag 110
14 url 111
15 hashtag 132
15 url 132


In [10]:
# Tabulate the per-drifter rows; column order matches the order in which each
# row list was filled.
result_df = pd.DataFrame(
    data=result,
    columns=[
        'drifter_id',       # key from `mapping`
        'seed',             # seed account name of the drifter
        'method',           # 'hashtag' or 'url'
        't_stat',           # one-sample t statistic of the paired differences
        'pvalue',
        'significant_05',   # pvalue < 0.05
        'significant_01',   # pvalue < 0.01
        'cohen_d',          # unsigned effect size
        'effect_size',      # 'small' / 'medium' / 'large' label derived from cohen_d
        'degree_freedom',   # number of merged rows minus one
    ],
)


In [11]:
# Display the per-drifter results for the 'hashtag' method only.
result_df.query('method == "hashtag"')

Unnamed: 0,drifter_id,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
0,1,thenation,hashtag,-1.522673,0.1303268,False,False,0.134587,small,127
2,2,thenation,hashtag,-3.048948,0.002791088,True,True,0.268445,small,128
4,3,thenation,hashtag,-12.606068,3.040132e-24,True,True,1.105625,large,129
6,4,washingtonpost,hashtag,-2.796928,0.005949444,True,True,0.245307,small,129
8,5,washingtonpost,hashtag,-2.889366,0.004535051,True,True,0.254394,small,128
10,6,washingtonpost,hashtag,10.316953,2.3282470000000002e-18,True,True,0.92649,large,123
12,7,USATODAY,hashtag,5.943683,2.408727e-08,True,True,0.519302,medium,130
14,8,USATODAY,hashtag,-2.046321,0.04274081,True,False,0.178788,small,130
16,9,USATODAY,hashtag,0.462284,0.6446567,False,False,0.040545,small,129
18,10,WSJ,hashtag,-8.390671,7.045338e-14,True,True,0.733096,medium,130


In [12]:
# Display the per-drifter results for the 'url' method only.
result_df.query('method == "url"')

Unnamed: 0,drifter_id,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
1,1,thenation,url,-0.460945,0.6456085,False,False,0.040273,small,130
3,2,thenation,url,1.264867,0.2081991,False,False,0.110936,small,129
5,3,thenation,url,13.083664,2.0277080000000002e-25,True,True,1.147513,large,129
7,4,washingtonpost,url,-1.855723,0.06577524,False,False,0.162758,small,129
9,5,washingtonpost,url,-3.247054,0.001478398,True,True,0.281555,small,132
11,6,washingtonpost,url,0.233938,0.8154157,False,False,0.020841,small,125
13,7,USATODAY,url,-9.477751,1.50977e-16,True,True,0.824932,large,131
15,8,USATODAY,url,-16.436262,1.547596e-33,True,True,1.436043,large,130
17,9,USATODAY,url,-3.041662,0.00285071,True,True,0.266771,small,129
19,10,WSJ,url,-17.89402,7.112384e-37,True,True,1.563408,large,130


# T-test for drifters pooled by seed account

In [13]:
# Seed account name -> list of the three drifter ids started from that account.
# Ids run consecutively from 1, three per seed, in the order listed here.
reverse_mapping = {
    seed_name: list(range(first_id, first_id + 3))
    for first_id, seed_name in zip(
        range(1, 16, 3),
        ['thenation', 'washingtonpost', 'USATODAY', 'WSJ', 'BreitbartNews'],
    )
}

In [14]:
# Group-level analysis: for every seed account and method, stack the
# date-aligned home/friend_usr frames of all drifters in the group and run
# the same t-test on the stacked rows as a single sample.
combined_result = []
for seed, drifter_ids in reverse_mapping.items():
    for method in methods:
        paired_frames = [
            drifter_df_dict[drifter_id]['dfs'][method]['home'].merge(
                drifter_df_dict[drifter_id]['dfs'][method]['friend_usr'], on='date'
            )
            for drifter_id in drifter_ids
        ]
        combined_df = pd.concat(paired_frames)
        t_stat, pvalue, cohen_d = do_t_test(combined_df, method)

        # Thresholds 0.5 and 0.8 split d into small / medium / large. A NaN d
        # fails both comparisons and therefore ends up labelled 'large'.
        effect_size = 'small' if cohen_d < 0.5 else ('medium' if cohen_d < 0.8 else 'large')

        row = [
            seed,
            method,
            t_stat,
            pvalue,
            pvalue < 0.05,
            pvalue < 0.01,
            cohen_d,
            effect_size,
            len(combined_df) - 1,  # degrees of freedom: number of stacked rows minus one
        ]
        combined_result.append(row)

In [15]:
# Tabulate the per-seed rows; column order matches the order in which each
# row list was filled.
combined_result_df = pd.DataFrame(
    data=combined_result,
    columns=[
        'seed',             # seed account name (key of `reverse_mapping`)
        'method',           # 'hashtag' or 'url'
        't_stat',           # one-sample t statistic of the stacked paired differences
        'pvalue',
        'significant_05',   # pvalue < 0.05
        'significant_01',   # pvalue < 0.01
        'cohen_d',          # unsigned effect size
        'effect_size',      # 'small' / 'medium' / 'large' label derived from cohen_d
        'degree_freedom',   # number of stacked rows minus one
    ],
)


In [16]:
# Display the per-seed results for the 'hashtag' method only.
combined_result_df.query('method == "hashtag"')

Unnamed: 0,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
0,thenation,hashtag,-6.033468,3.763578e-09,True,True,0.306698,small,386
2,washingtonpost,hashtag,2.573025,0.01045799,True,False,0.131475,small,382
4,USATODAY,hashtag,1.797072,0.07309583,False,False,0.090766,small,391
6,WSJ,hashtag,4.669914,4.186471e-06,True,True,0.239247,small,380
8,BreitbartNews,hashtag,-10.573009,7.251819e-23,True,True,0.563543,medium,351


In [17]:
# Display the per-seed results for the 'url' method only.
combined_result_df.query('method == "url"')

Unnamed: 0,seed,method,t_stat,pvalue,significant_05,significant_01,cohen_d,effect_size,degree_freedom
1,thenation,url,4.146558,4.144082e-05,True,True,0.2097,small,390
3,washingtonpost,url,-2.324818,0.02059695,True,False,0.117873,small,388
5,USATODAY,url,-15.156787,3.611706e-41,True,True,0.764559,medium,392
7,WSJ,url,-4.092429,5.205458e-05,True,True,0.20857,small,384
9,BreitbartNews,url,-4.954729,1.128585e-06,True,True,0.264088,small,351
