In [1]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import os

# Setting Data Up

In [2]:
data = pd.read_csv("step3_rules.csv")
data['simple_types'] = data.change_type.map({'added':'added',
 'deleted':'deleted',
 'changed':'changed',
 'change_added':'changed',
 'unchanged':'unchanged'})

In [3]:
metadata = pd.read_csv("metadata.csv")
metadata['log_subscribers'] = np.log(metadata.subscribers_1 + 2)
metadata.founding_date = pd.to_datetime(metadata.founding_date * 10**9)
metadata['age_in_months'] = \
        (pd.to_datetime('2021-12-11') - metadata.founding_date)/ np.timedelta64(1, 'M')
metadata.drop(columns=['subscribers_1', 'moderators_1', 'moderators_2', 'founding_date'], inplace=True)

In [4]:
relevant_columns = data[["communityID", "simple_types"]]
piv_df = pd.pivot_table(relevant_columns, index=['communityID'], columns=['simple_types'], aggfunc=len, fill_value=0)
piv_df['total'] = piv_df.added + piv_df.changed + piv_df.deleted + piv_df.unchanged

In [5]:
piv_df['delta'] = piv_df.added - piv_df.deleted

In [6]:
regression_data = piv_df[['delta']].join(metadata.set_index('communityID'))
regression_data['age_and_subs'] = regression_data.log_subscribers * regression_data.age_in_months
regression_data

Unnamed: 0_level_0,delta,subscribers_2,log_subscribers,age_in_months,age_and_subs
communityID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000somethingintheway,0,0,0.693147,33.246182,23.044497
007nightfire,0,91,4.248495,11.148441,47.364097
00games,0,3,1.386294,22.218503,30.801385
00saesthetics,0,2995,7.950855,28.997690,230.556424
00sbabies,1,298,5.710427,16.535742,94.426147
...,...,...,...,...,...
zyramains,0,11382,9.219003,83.447118,769.299207
zyrocrimson,3,2,1.386294,15.794000,21.895133
zywieczdrojhomies,0,1,1.098612,8.089328,8.887036
zyzz,0,11991,8.888343,124.417792,1105.867998


In [7]:
regression_data.replace([np.inf, -np.inf], np.nan, inplace=True)
regression_data.isna().any()

delta              False
subscribers_2      False
log_subscribers    False
age_in_months      False
age_and_subs       False
dtype: bool

# Regression Comparisons

In [8]:
m01 = ols('delta ~ subscribers_2 + log_subscribers + age_in_months + age_and_subs', data=regression_data).fit()
m02 = ols('delta ~ subscribers_2', data=regression_data).fit()
m03 = ols('delta ~ log_subscribers', data=regression_data).fit()
m04 = ols('delta ~ age_in_months', data=regression_data).fit()
m05 = ols('delta ~ age_and_subs', data=regression_data).fit()
anovaResults = anova_lm(m01, m02)
print(anovaResults)

   df_resid           ssr  df_diff     ss_diff           F  Pr(>F)
0   94036.0  84448.333498      0.0         NaN         NaN     NaN
1   94039.0  87119.998448     -3.0 -2671.66495  961.282158     NaN


In [9]:
anovaResults = anova_lm(m01, m03)
print(anovaResults)

   df_resid           ssr  df_diff      ss_diff           F  Pr(>F)
0   94036.0  84448.333498      0.0          NaN         NaN     NaN
1   94039.0  86404.454175     -3.0 -1956.120677  709.653355     NaN


In [10]:
anovaResults = anova_lm(m01, m04)
print(anovaResults)

   df_resid           ssr  df_diff      ss_diff            F  Pr(>F)
0   94036.0  84448.333498      0.0          NaN          NaN     NaN
1   94039.0  92085.684676     -3.0 -7637.351178  2599.784718     NaN


In [11]:
anovaResults = anova_lm(m01, m05)
print(anovaResults)

   df_resid           ssr  df_diff      ss_diff            F  Pr(>F)
0   94036.0  84448.333498      0.0          NaN          NaN     NaN
1   94039.0  88854.517932     -3.0 -4406.184434  1554.425472     NaN
