In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn import metrics
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Setting Data Up

In [2]:
regression_data = pd.read_csv("../output_data/seth/sub_level_data.csv")
regression_data['log_subscribers'] = np.log(regression_data.subscribers_1 + 2)
regression_data['age_and_subs'] = regression_data.log_subscribers * regression_data.age_in_months
regression_data

Unnamed: 0,communityID,added,changed,deleted,unchanged,subscribers_1,subscribers_2,rules_1,rules_2,timestamp_1,timestamp_2,founding_date,age_in_months,log_subscribers,age_and_subs
0,007_link,0.0,0.0,0.0,1.0,7,7,1,1,1.627687e+09,1.644941e+09,1.579930e+09,14.908671,2.197225,32.757698
1,007nightfire,0.0,0.0,0.0,5.0,68,91,5,5,1.625925e+09,1.643361e+09,1.609863e+09,3.526107,4.248495,14.980651
2,00games,0.0,0.0,0.0,4.0,2,3,4,4,1.630524e+09,1.646246e+09,1.580752e+09,14.596170,1.386294,20.234588
3,00saesthetics,0.0,0.0,0.0,6.0,2836,2995,6,6,1.624697e+09,1.642362e+09,1.562924e+09,21.375357,7.950855,169.952359
4,00sbabies,0.0,0.0,0.0,6.0,300,298,6,6,1.625180e+09,1.642880e+09,1.595696e+09,8.913409,5.710427,50.899370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130846,zyramains,0.0,0.0,0.0,9.0,10085,11382,9,9,1.624172e+09,1.642224e+09,1.419736e+09,75.824785,9.219003,699.028897
130847,zyxcomments,0.0,0.0,0.0,1.0,8,8,1,1,1.627579e+09,1.644794e+09,1.562897e+09,21.385681,2.302585,49.242349
130848,zyzz,1.0,0.0,1.0,1.0,7245,11991,2,2,1.624310e+09,1.642229e+09,1.311994e+09,116.795459,8.888343,1038.118088
130849,zztails,0.0,0.0,0.0,2.0,137,142,2,2,1.625469e+09,1.643165e+09,1.546838e+09,27.492575,4.934474,135.661394


In [3]:
len(regression_data)

130851

In [4]:
len(regression_data[regression_data.added > 0])/len(regression_data)

0.049414983454463476

In [5]:
regression_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
added_rules = regression_data[regression_data.added > 0]

# Regression

In [7]:
X = regression_data[['age_in_months', 'log_subscribers', 'subscribers_1', 'age_and_subs', 'rules_1']].values.astype(np.ndarray)
y = np.array(regression_data.added)
# Split the data into training and testing sets with a 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# logistic regression
reg = LinearRegression().fit(X_train, y_train)
predictions = reg.predict(X_test)
print(f'mse: {metrics.mean_squared_error(y_test, predictions)}')
print(f'rmse: {metrics.mean_squared_error(y_test, predictions, squared=False)}')
print(f'r2: {metrics.r2_score(y_test, predictions)}')

mse: 0.5835074195334903
rmse: 0.7638765734943639
r2: 0.049288681600395456


Added rules only:

In [8]:
X = added_rules[['age_in_months', 'log_subscribers', 'subscribers_1', 'age_and_subs', 'rules_1']].values.astype(np.ndarray)
y = np.array(added_rules.added)
# Split the data into training and testing sets with a 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# logistic regression
reg = LinearRegression().fit(X_train, y_train)
predictions = reg.predict(X_test)
print(f'mse: {metrics.mean_squared_error(y_test, predictions)}')
print(f'rmse: {metrics.mean_squared_error(y_test, predictions, squared=False)}')
print(f'r2: {metrics.r2_score(y_test, predictions)}')

mse: 5.758446521037314
rmse: 2.3996763367248746
r2: 0.023299745227414292


# Regression Comparisons For All Subs

In [25]:
m01 = ols('added ~ subscribers_1 + log_subscribers + age_in_months + age_and_subs + rules_1', data=regression_data).fit()
m02 = ols('added ~ log_subscribers + age_in_months + rules_1', data=regression_data).fit()

anovaResults = anova_lm(m01, m02)
print(anovaResults)

   df_resid           ssr  df_diff      ss_diff            F  Pr(>F)
0  130845.0  73015.169198      0.0          NaN          NaN     NaN
1  130849.0  76875.951989     -4.0 -3860.782791  1642.840037     NaN


In [26]:
print(m01.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.051
Model:                            OLS   Adj. R-squared:                  0.051
Method:                 Least Squares   F-statistic:                     1418.
Date:                Sat, 13 May 2023   Prob (F-statistic):               0.00
Time:                        10:31:07   Log-Likelihood:            -1.4750e+05
No. Observations:              130851   AIC:                         2.950e+05
Df Residuals:                  130845   BIC:                         2.951e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.1405      0.007    -

# Regression Comparisons For Subs That Added Rules

In [10]:
added_rules

Unnamed: 0,communityID,added,changed,deleted,unchanged,subscribers_1,subscribers_2,rules_1,rules_2,timestamp_1,timestamp_2,founding_date,age_in_months,log_subscribers,age_and_subs
14,09axet,4.0,0.0,6.0,1.0,3,3,7,5,1.629840e+09,1.645974e+09,1.606428e+09,4.832483,1.609438,7.777581
28,0xpolygon,2.0,0.0,0.0,7.0,17427,36565,7,9,1.625983e+09,1.639596e+09,1.591701e+09,10.432636,9.765891,101.883981
80,10smusic,1.0,0.0,0.0,3.0,479,520,3,4,1.625024e+09,1.642713e+09,1.395641e+09,84.987359,6.175867,524.870650
170,18nsfw,3.0,0.0,2.0,4.0,298130,460645,6,7,1.623293e+09,1.639552e+09,1.513908e+09,40.014615,12.605292,504.395888
177,1900smusic,1.0,0.0,0.0,3.0,547,605,3,4,1.624990e+09,1.642671e+09,1.396133e+09,84.800176,6.308098,534.927859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130759,zoophiles,3.0,0.0,0.0,1.0,3,40,1,4,1.631929e+09,1.645843e+09,1.602081e+09,6.485242,1.609438,10.437594
130765,zootopiaporn,1.0,0.0,1.0,7.0,47626,60721,8,8,1.623371e+09,1.639580e+09,1.456233e+09,61.946129,10.771176,667.232665
130824,zutaranation,3.0,0.0,1.0,7.0,623,847,8,10,1.624990e+09,1.642597e+09,1.591449e+09,10.528511,6.437752,67.779942
130827,zvedatori,1.0,0.0,0.0,2.0,949,1040,2,3,1.624876e+09,1.642541e+09,1.541621e+09,29.476287,6.857514,202.134056


In [15]:
m01 = ols('added ~ subscribers_1 + log_subscribers + age_in_months + age_and_subs + rules_1', data=added_rules).fit()
m02 = ols('added ~ subscribers_1', data=added_rules).fit()
m03 = ols('added ~ log_subscribers', data=added_rules).fit()
m04 = ols('added ~ age_in_months', data=added_rules).fit()
m05 = ols('added ~ age_and_subs', data=added_rules).fit()
m02 = ols('added ~ log_subscribers + age_in_months + rules_1', data=added_rules).fit()

anovaResults = anova_lm(m01, m02)
print(anovaResults)

   df_resid           ssr  df_diff   ss_diff          F  Pr(>F)
0    6460.0  35103.610715      0.0       NaN        NaN     NaN
1    6464.0  35944.519314     -4.0 -840.9086  37.805716     NaN


In [20]:
print(m01.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     30.98
Date:                Sat, 13 May 2023   Prob (F-statistic):           2.89e-31
Time:                        10:27:46   Log-Likelihood:                -14644.
No. Observations:                6466   AIC:                         2.930e+04
Df Residuals:                    6460   BIC:                         2.934e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           2.1385      0.142     