In [11]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Setting Data Up

In [2]:
regression_data = pd.read_csv("../output_data/seth/sub_level_data.csv")
regression_data['log_subscribers'] = np.log(regression_data.subscribers_1 + 2)
regression_data['age_and_subs'] = regression_data.log_subscribers * regression_data.age_in_months
regression_data

Unnamed: 0,communityID,added,changed,deleted,unchanged,subscribers_1,subscribers_2,rules_1,rules_2,timestamp_1,timestamp_2,founding_date,age_in_months,log_subscribers,age_and_subs
0,007_link,0.0,0.0,0.0,1.0,7,7,1,1,1.627687e+09,1.644941e+09,1.579930e+09,14.908671,2.197225,32.757698
1,007nightfire,0.0,0.0,0.0,5.0,68,91,5,5,1.625925e+09,1.643361e+09,1.609863e+09,3.526107,4.248495,14.980651
2,00games,0.0,0.0,0.0,4.0,2,3,4,4,1.630524e+09,1.646246e+09,1.580752e+09,14.596170,1.386294,20.234588
3,00saesthetics,0.0,0.0,0.0,6.0,2836,2995,6,6,1.624697e+09,1.642362e+09,1.562924e+09,21.375357,7.950855,169.952359
4,00sbabies,0.0,0.0,0.0,6.0,300,298,6,6,1.625180e+09,1.642880e+09,1.595696e+09,8.913409,5.710427,50.899370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130846,zyramains,0.0,0.0,0.0,9.0,10085,11382,9,9,1.624172e+09,1.642224e+09,1.419736e+09,75.824785,9.219003,699.028897
130847,zyxcomments,0.0,0.0,0.0,1.0,8,8,1,1,1.627579e+09,1.644794e+09,1.562897e+09,21.385681,2.302585,49.242349
130848,zyzz,1.0,0.0,1.0,1.0,7245,11991,2,2,1.624310e+09,1.642229e+09,1.311994e+09,116.795459,8.888343,1038.118088
130849,zztails,0.0,0.0,0.0,2.0,137,142,2,2,1.625469e+09,1.643165e+09,1.546838e+09,27.492575,4.934474,135.661394


In [3]:
regression_data.replace([np.inf, -np.inf], np.nan, inplace=True)

communityID        False
added              False
changed            False
deleted            False
unchanged          False
subscribers_1      False
subscribers_2      False
rules_1            False
rules_2            False
timestamp_1        False
timestamp_2        False
founding_date      False
age_in_months      False
log_subscribers    False
age_and_subs       False
dtype: bool

# Regression Comparisons

In [4]:
m01 = ols('added ~ subscribers_2 + log_subscribers + age_in_months + age_and_subs', data=regression_data).fit()
m02 = ols('added ~ subscribers_2', data=regression_data).fit()
m03 = ols('added ~ log_subscribers', data=regression_data).fit()
m04 = ols('added ~ age_in_months', data=regression_data).fit()
m05 = ols('added ~ age_and_subs', data=regression_data).fit()
m06 = ols('added ~ log_subscribers + age_in_months', data=regression_data).fit()

anovaResults = anova_lm(m01, m02)
print(anovaResults)

   df_resid           ssr  df_diff      ss_diff            F  Pr(>F)
0  130846.0  73521.001122      0.0          NaN          NaN     NaN
1  130849.0  76853.348823     -3.0 -3332.347701  1891.196549     NaN


In [5]:
print(m01.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.045
Method:                 Least Squares   F-statistic:                     1535.
Date:                Thu, 11 May 2023   Prob (F-statistic):               0.00
Time:                        12:30:41   Log-Likelihood:            -1.4795e+05
No. Observations:              130851   AIC:                         2.959e+05
Df Residuals:                  130846   BIC:                         2.960e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.0575      0.006     

In [6]:
print(m02.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     199.2
Date:                Thu, 11 May 2023   Prob (F-statistic):           3.31e-45
Time:                        12:30:45   Log-Likelihood:            -1.5085e+05
No. Observations:              130851   AIC:                         3.017e+05
Df Residuals:                  130849   BIC:                         3.017e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.1259      0.002     59.315

In [15]:
anovaResults = anova_lm(m01, m03)
print(anovaResults)

   df_resid           ssr  df_diff     ss_diff           F  Pr(>F)
0  130846.0  73521.001122      0.0         NaN         NaN     NaN
1  130849.0  73725.426845     -3.0 -204.425722  120.939286     NaN


In [7]:
print(m03.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     5759.
Date:                Thu, 11 May 2023   Prob (F-statistic):               0.00
Time:                        12:30:56   Log-Likelihood:            -1.4813e+05
No. Observations:              130851   AIC:                         2.963e+05
Df Residuals:                  130849   BIC:                         2.963e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.1405      0.004    -

In [16]:
anovaResults = anova_lm(m01, m04)
print(anovaResults)

   df_resid           ssr  df_diff      ss_diff            F  Pr(>F)
0  130846.0  73521.001122      0.0          NaN          NaN     NaN
1  130849.0  75966.366684     -3.0 -2445.365562  1404.014488     NaN


In [8]:
print(m04.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     1729.
Date:                Thu, 11 May 2023   Prob (F-statistic):               0.00
Time:                        12:31:07   Log-Likelihood:            -1.5009e+05
No. Observations:              130851   AIC:                         3.002e+05
Df Residuals:                  130849   BIC:                         3.002e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0396      0.003     13.252

In [17]:
anovaResults = anova_lm(m01, m05)
print(anovaResults)

   df_resid           ssr  df_diff      ss_diff           F  Pr(>F)
0  130846.0  73521.001122      0.0          NaN         NaN     NaN
1  130849.0  74787.299119     -3.0 -1266.297997  738.511434     NaN


In [9]:
print(m05.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     3820.
Date:                Thu, 11 May 2023   Prob (F-statistic):               0.00
Time:                        12:31:16   Log-Likelihood:            -1.4907e+05
No. Observations:              130851   AIC:                         2.981e+05
Df Residuals:                  130849   BIC:                         2.982e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.0416      0.003     16.551   

In [19]:
anovaResults = anova_lm(m01, m06)
print(anovaResults)

   df_resid           ssr  df_diff     ss_diff           F  Pr(>F)
0  130846.0  73521.001122      0.0         NaN         NaN     NaN
1  130848.0  73722.391941     -2.0 -201.390819  178.721723     NaN


In [10]:
print(m06.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     2882.
Date:                Thu, 11 May 2023   Prob (F-statistic):               0.00
Time:                        12:31:22   Log-Likelihood:            -1.4813e+05
No. Observations:              130851   AIC:                         2.963e+05
Df Residuals:                  130848   BIC:                         2.963e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.1411      0.004    -