https://medium.com/analytics-vidhya/f-statistic-understanding-model-significance-using-python-c1371980b796

https://sites.duke.edu/bossbackup/files/2013/02/FTestTutorial.pdf

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

# Preparing Data

In [2]:
metadata = pd.read_csv('metadata.csv')
metadata.drop(columns=['moderators_1', 'moderators_2'], inplace=True)
metadata.head()

Unnamed: 0,communityID,subscribers_1,subscribers_2,founding_date
0,weeklydiscoveries,3,1,1596440000.0
1,elbowsafespace,3,3,1495570000.0
2,canadastudentsnetwork,4,6,1601373000.0
3,kermitcraftofficial,3,3,1601383000.0
4,modded_server,2,2,1601386000.0


In [3]:
df = pd.read_csv("step3_rules.csv")
df.sort_values(by=["communityID", "ruleID", "source"], inplace=True)

In [4]:
df['simple_types'] = df.change_type.map({'added':'added',
 'change':'changed',
 'deleted':'deleted',
 'change_added':'changed',
 'unchanged':'unchanged'})

In [5]:
relevant_columns = df[["communityID", "simple_types"]]
piv_df = pd.pivot_table(relevant_columns, index=['communityID'], columns=['simple_types'], aggfunc=len, fill_value=0)
piv_df['delta'] = piv_df['added'] - piv_df['deleted']
piv_df.head(5)

simple_types,added,changed,deleted,unchanged,delta
communityID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000somethingintheway,0,0,0,1,0
007nightfire,0,0,0,5,0
00games,0,0,0,4,0
00saesthetics,0,0,0,6,0
00sbabies,1,0,0,5,1


In [6]:
data = metadata.join(piv_df[['delta']], on='communityID', how='inner')
data.reset_index(inplace=True)

In [7]:
def age_in_months(start_date):
    start_date = datetime.fromtimestamp(start_date) 
    end_date = datetime.strptime("2021-12-10 00:00:00.000", "%Y-%m-%d %H:%M:%S.%f") 
    return (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)

In [8]:
age = [age_in_months(data.founding_date[i]) for i in range(len(data))]
data['age_in_months'] = age

In [9]:
data['log_subscribers'] = np.log(data.subscribers_2 + 1) # natural logarithm
data['subscribers_and_age'] = data.log_subscribers * data.age_in_months
data.dropna(inplace=True)
data.reset_index(inplace=True)
data.head(5)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,level_0,index,communityID,subscribers_1,subscribers_2,founding_date,delta,age_in_months,log_subscribers,subscribers_and_age
0,0,0,weeklydiscoveries,3,1,1596440000.0,0,16,0.693147,11.090355
1,1,1,elbowsafespace,3,3,1495570000.0,0,55,1.386294,76.24619
2,2,2,canadastudentsnetwork,4,6,1601373000.0,0,15,1.94591,29.188652
3,3,3,kermitcraftofficial,3,3,1601383000.0,0,15,1.386294,20.794415
4,4,4,modded_server,2,2,1601386000.0,0,15,1.098612,16.479184


In [10]:
data.replace([np.inf, -np.inf], np.nan, inplace=True) # for some reason some subs have negative number of subscribers
data.dropna(inplace=True)
data[['communityID', 'subscribers_2', 'log_subscribers', 'age_in_months', 'subscribers_and_age', 'delta']].to_csv('data_for_modelling.csv', index=False)

# Regression Comparisons

In [11]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [12]:
X = data[['subscribers_2', 'log_subscribers', 'age_in_months', 'subscribers_and_age']]
X = sm.add_constant(X) # add intercept
y = data.delta

  x = pd.concat(x[::order], 1)


In [25]:
def f_test(ssr1, ssr2):
    if ssr1 > ssr2:
        return (ssr1/ssr2)
    else:
        return(ssr2/ssr1)

## All Features

Critical value is 2.3719

In [16]:
results = sm.OLS(y, X).fit()

A = np.identity(len(results.params))
A = A[1:,:]
print(results.f_test(A))

<F test: F=array([[2703.9995492]]), p=0.0, df_denom=9.4e+04, df_num=4>


In [19]:
ss1 = results.ssr

In [14]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  delta   R-squared:                       0.103
Model:                            OLS   Adj. R-squared:                  0.103
Method:                 Least Squares   F-statistic:                     2704.
Date:                Thu, 05 Jan 2023   Prob (F-statistic):               0.00
Time:                        10:46:54   Log-Likelihood:            -1.2837e+05
No. Observations:               93990   AIC:                         2.567e+05
Df Residuals:                   93985   BIC:                         2.568e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.0133    

## Subscribers

Critical value for all of the following is 3.8415

In [20]:
subscribers_X = np.array(X.subscribers_2).reshape(-1, 1)

results = sm.OLS(y, subscribers_X).fit()

A = np.identity(len(results.params))
A = A[1:,:]

print(results.f_test(1))

<F test: F=array([[11504.91111045]]), p=0.0, df_denom=9.4e+04, df_num=1>


In [29]:
ss1

84505.31022178079

In [28]:
results.ssr

89201.15465382104

In [27]:
f_test(ss1, results.ssr)

0.9473566855691021

for calculating p value: https://www.statology.org/f-distribution-calculator/

p value here is 0.51352

## Log of Subscribers

In [30]:
log_subscribers_X = np.array(X.log_subscribers).reshape(-1, 1)

results = sm.OLS(y, log_subscribers_X).fit()

A = np.identity(len(results.params))
A = A[1:,:]

print(results.f_test(1))

<F test: F=array([[13935.95477599]]), p=0.0, df_denom=9.4e+04, df_num=1>


In [31]:
results.ssr

87191.8704949352

## Age in Months

In [32]:
age_X = np.array(X.age_in_months).reshape(-1, 1)

results = sm.OLS(y, age_X).fit()

A = np.identity(len(results.params))
A = A[1:,:]

print(results.f_test(1))

<F test: F=array([[8012.15383508]]), p=0.0, df_denom=9.4e+04, df_num=1>


In [33]:
results.ssr

92255.6101200039

## Relationship Between Age and Subscribers

In [34]:
both_X = np.array(X.subscribers_and_age).reshape(-1, 1)

results = sm.OLS(y, both_X).fit()

A = np.identity(len(results.params))
A = A[1:,:]

print(results.f_test(1))

<F test: F=array([[11603.16871187]]), p=0.0, df_denom=9.4e+04, df_num=1>


In [35]:
results.ssr

89118.14952562912