In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn import metrics
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Setting Data Up

In [2]:
regression_data = pd.read_csv("../output_data/seth/sub_level_data.csv")
regression_data['log_subscribers'] = np.log(regression_data.subscribers_1 + 2)
regression_data['age_and_subs'] = regression_data.log_subscribers * regression_data.age_in_months
regression_data['rules_and_subs'] = regression_data.log_subscribers * regression_data.rules_1
regression_data['rules_and_age'] = regression_data.age_in_months * regression_data.rules_1
regression_data

Unnamed: 0,communityID,added,changed,deleted,unchanged,subscribers_1,subscribers_2,rules_1,rules_2,timestamp_1,timestamp_2,founding_date,age_in_months,log_subscribers,age_and_subs,rules_and_subs,rules_and_age
0,007_link,0.0,0.0,0.0,1.0,7,7,1,1,1.627687e+09,1.644941e+09,1.579930e+09,14.908671,2.197225,32.757698,2.197225,14.908671
1,007nightfire,0.0,0.0,0.0,5.0,68,91,5,5,1.625925e+09,1.643361e+09,1.609863e+09,3.526107,4.248495,14.980651,21.242476,17.630537
2,00games,0.0,0.0,0.0,4.0,2,3,4,4,1.630524e+09,1.646246e+09,1.580752e+09,14.596170,1.386294,20.234588,5.545177,58.384679
3,00saesthetics,0.0,0.0,0.0,6.0,2836,2995,6,6,1.624697e+09,1.642362e+09,1.562924e+09,21.375357,7.950855,169.952359,47.705129,128.252141
4,00sbabies,0.0,0.0,0.0,6.0,300,298,6,6,1.625180e+09,1.642880e+09,1.595696e+09,8.913409,5.710427,50.899370,34.262562,53.480452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130846,zyramains,0.0,0.0,0.0,9.0,10085,11382,9,9,1.624172e+09,1.642224e+09,1.419736e+09,75.824785,9.219003,699.028897,82.971025,682.423061
130847,zyxcomments,0.0,0.0,0.0,1.0,8,8,1,1,1.627579e+09,1.644794e+09,1.562897e+09,21.385681,2.302585,49.242349,2.302585,21.385681
130848,zyzz,1.0,0.0,1.0,1.0,7245,11991,2,2,1.624310e+09,1.642229e+09,1.311994e+09,116.795459,8.888343,1038.118088,17.776686,233.590919
130849,zztails,0.0,0.0,0.0,2.0,137,142,2,2,1.625469e+09,1.643165e+09,1.546838e+09,27.492575,4.934474,135.661394,9.868948,54.985150


In [3]:
len(regression_data)

130851

In [4]:
len(regression_data[regression_data.added > 0])/len(regression_data)

0.049414983454463476

In [5]:
regression_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
added_rules = regression_data[regression_data.added > 0]

# Regression

## Rule Additions

In [7]:
X = regression_data[['age_in_months', 'log_subscribers', 'subscribers_1', 
                     'age_and_subs', 'rules_1', 'rules_and_subs', 'rules_and_age']].values.astype(np.ndarray)
y = np.array(regression_data.added)
# Split the data into training and testing sets with a 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# logistic regression
reg = LinearRegression().fit(X_train, y_train)
predictions = reg.predict(X_test)
print(f'mse: {metrics.mean_squared_error(y_test, predictions)}')
print(f'rmse: {metrics.mean_squared_error(y_test, predictions, squared=False)}')
print(f'r2: {metrics.r2_score(y_test, predictions)}')

mse: 0.5786090526948314
rmse: 0.7606635607775828
r2: 0.05405085950898725


## Rule Deletions

In [8]:
X = regression_data[['age_in_months', 'log_subscribers', 'subscribers_1', 'age_and_subs', 'rules_1']].values.astype(np.ndarray)
y = np.array(regression_data.deleted)
# Split the data into training and testing sets with a 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# logistic regression
reg = LinearRegression().fit(X_train, y_train)
predictions = reg.predict(X_test)
print(f'mse: {metrics.mean_squared_error(y_test, predictions)}')
print(f'rmse: {metrics.mean_squared_error(y_test, predictions, squared=False)}')
print(f'r2: {metrics.r2_score(y_test, predictions)}')

mse: 0.2888126955312714
rmse: 0.5374129655407203
r2: 0.04129618861297513


# Regression Comparisons For All Subs

In [9]:
m01 = ols('added ~ subscribers_1 + log_subscribers + age_in_months + age_and_subs + rules_1 + rules_and_subs + rules_and_age', data=regression_data).fit()
m02 = ols('added ~ log_subscribers + age_in_months + rules_1', data=regression_data).fit()

anovaResults = anova_lm(m01, m02)
print(anovaResults)

   df_resid           ssr  df_diff     ss_diff           F  Pr(>F)
0  130843.0  72508.958755      0.0         NaN         NaN     NaN
1  130847.0  73138.903762     -4.0 -629.945007  281.746137     NaN


In [10]:
print(m01.summary())

                            OLS Regression Results                            
Dep. Variable:                  added   R-squared:                       0.058
Model:                            OLS   Adj. R-squared:                  0.058
Method:                 Least Squares   F-statistic:                     1150.
Date:                Tue, 16 May 2023   Prob (F-statistic):               0.00
Time:                        18:21:53   Log-Likelihood:            -1.4705e+05
No. Observations:              130851   AIC:                         2.941e+05
Df Residuals:                  130843   BIC:                         2.942e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0159      0.008     