https://medium.com/analytics-vidhya/f-statistic-understanding-model-significance-using-python-c1371980b796

https://sites.duke.edu/bossbackup/files/2013/02/FTestTutorial.pdf

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

# Preparing Data

In [2]:
metadata = pd.read_csv('metadata.csv')
metadata.drop(columns=['moderators_1', 'moderators_2'], inplace=True)
metadata.head()

Unnamed: 0,communityID,subscribers_1,subscribers_2,founding_date
0,weeklydiscoveries,3,1,1596440000.0
1,elbowsafespace,3,3,1495570000.0
2,canadastudentsnetwork,4,6,1601373000.0
3,kermitcraftofficial,3,3,1601383000.0
4,modded_server,2,2,1601386000.0


In [3]:
df = pd.read_csv("step3_rules.csv")
df.sort_values(by=["communityID", "ruleID", "source"], inplace=True)

In [4]:
df['simple_types'] = df.change_type.map({'added':'added',
 'change':'changed',
 'deleted':'deleted',
 'change_added':'changed',
 'unchanged':'unchanged'})

In [5]:
relevant_columns = df[["communityID", "simple_types"]]
piv_df = pd.pivot_table(relevant_columns, index=['communityID'], columns=['simple_types'], aggfunc=len, fill_value=0)
piv_df['delta'] = piv_df['added'] - piv_df['deleted']
piv_df.head(5)

simple_types,added,changed,deleted,unchanged,delta
communityID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000somethingintheway,0,0,0,1,0
007nightfire,0,0,0,5,0
00games,0,0,0,4,0
00saesthetics,0,0,0,6,0
00sbabies,1,0,0,5,1


In [6]:
data = metadata.join(piv_df[['delta']], on='communityID', how='inner')
data.reset_index(inplace=True)

In [7]:
def age_in_months(start_date):
    start_date = datetime.fromtimestamp(start_date) 
    end_date = datetime.strptime("2021-12-10 00:00:00.000", "%Y-%m-%d %H:%M:%S.%f") 
    return (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)

In [8]:
age = [age_in_months(data.founding_date[i]) for i in range(len(data))]
data['age_in_months'] = age

In [9]:
data['log_subscribers'] = np.log(data.subscribers_2 + 1) # natural logarithm
data['subscribers_and_age'] = data.log_subscribers * data.age_in_months
data.dropna(inplace=True)
data.reset_index(inplace=True)
data.head(5)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,level_0,index,communityID,subscribers_1,subscribers_2,founding_date,delta,age_in_months,log_subscribers,subscribers_and_age
0,0,0,weeklydiscoveries,3,1,1596440000.0,0,16,0.693147,11.090355
1,1,1,elbowsafespace,3,3,1495570000.0,0,55,1.386294,76.24619
2,2,2,canadastudentsnetwork,4,6,1601373000.0,0,15,1.94591,29.188652
3,3,3,kermitcraftofficial,3,3,1601383000.0,0,15,1.386294,20.794415
4,4,4,modded_server,2,2,1601386000.0,0,15,1.098612,16.479184


In [10]:
data.replace([np.inf, -np.inf], np.nan, inplace=True) # for some reason some subs have negative number of subscribers
data.dropna(inplace=True)
data[['communityID', 'subscribers_2', 'log_subscribers', 'age_in_months', 'subscribers_and_age', 'delta']].to_csv('data_for_modelling.csv', index=False)

# Regression Comparisons

In [43]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [44]:
model_data =  data[['subscribers_2', 'log_subscribers', 'age_in_months', 'subscribers_and_age']]

In [46]:
m_age = ols('delta ~ age_in_months', data=subsection).fit()
m_all = ols('delta ~ subscribers_2 + log_subscribers + age_in_months + subscribers_and_age', data=subsection).fit()
anovaResults = anova_lm(m_age, m_all)
print('comparison with just age')
print(anovaResults)

comparison with just age
   df_resid           ssr  df_diff      ss_diff           F  Pr(>F)
0   93988.0  92077.798809      0.0          NaN         NaN     NaN
1   93985.0  84505.310222      3.0  7572.488587  2807.31999     0.0


In [47]:
m_subscribers = ols('delta ~ subscribers_2', data=subsection).fit()
anovaResults = anova_lm(m_subscribers, m_all)
print('comparison with just subscribers')
print(anovaResults)

comparison with just subscribers
   df_resid           ssr  df_diff      ss_diff           F  Pr(>F)
0   93988.0  87107.042734      0.0          NaN         NaN     NaN
1   93985.0  84505.310222      3.0  2601.732512  964.530432     0.0


In [48]:
m_l_subscribers = ols('delta ~ log_subscribers', data=subsection).fit()
anovaResults = anova_lm(m_l_subscribers, m_all)
print('comparison with just log(subscribers)')
print(anovaResults)

comparison with just log(subscribers)
   df_resid           ssr  df_diff      ss_diff           F  Pr(>F)
0   93988.0  86632.104900      0.0          NaN         NaN     NaN
1   93985.0  84505.310222      3.0  2126.794678  788.458529     0.0


In [49]:
m_age = ols('delta ~ age_in_months', data=subsection).fit()
anovaResults = anova_lm(m_age, m_all)
print('comparison with just age')
print(anovaResults)

comparison with just age
   df_resid           ssr  df_diff      ss_diff           F  Pr(>F)
0   93988.0  92077.798809      0.0          NaN         NaN     NaN
1   93985.0  84505.310222      3.0  7572.488587  2807.31999     0.0


In [50]:
m_subscribers_and_age = ols('delta ~ subscribers_and_age', data=subsection).fit()
anovaResults = anova_lm(m_subscribers_and_age, m_all)
print('comparison with just relationship between subscribers and age')
print(anovaResults)

comparison with just relationship between subscribers and age
   df_resid           ssr  df_diff      ss_diff            F  Pr(>F)
0   93988.0  88754.371710      0.0          NaN          NaN     NaN
1   93985.0  84505.310222      3.0  4249.061488  1575.238459     0.0


In [54]:
m01 = ols('delta ~ subscribers_2 + log_subscribers + age_in_months + subscribers_and_age', data=subsection).fit()
m02 = ols('delta ~ subscribers_2 + log_subscribers + age_in_months + subscribers_and_age', data=subsection).fit()
anovaResults = anova_lm(m01, m02)
print(anovaResults)

   df_resid           ssr  df_diff     ss_diff          F        Pr(>F)
0   93986.0  84667.397791      0.0         NaN        NaN           NaN
1   93985.0  84505.310222      1.0  162.087569  180.27033  4.616420e-41
