In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from statsmodels.formula.api import logit
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt

# Setting Data Up

In [2]:
# Load the per-community data and derive the two covariates used by the models below.
regression_data = pd.read_csv("../output_data/seth/sub_level_data.csv").assign(
    # The +2 offset keeps the log argument strictly positive when subscribers_1 is 0.
    # NOTE(review): why 2 rather than 1 is not shown here -- confirm.
    log_subscribers=lambda frame: np.log(frame.subscribers_1 + 2),
    # age_in_months is converted to years by dividing by 12.
    age_in_years=lambda frame: frame.age_in_months / 12,
)

regression_data

Unnamed: 0,communityID,added,changed,deleted,unchanged,subscribers_1,subscribers_2,rules_1,rules_2,timestamp_1,timestamp_2,founding_date,age_in_months,log_subscribers,age_in_years
0,007_link,0.0,0.0,0.0,1.0,7,7,1,1,1.627687e+09,1.644941e+09,1.579930e+09,14.908671,2.197225,1.242389
1,007nightfire,0.0,0.0,0.0,5.0,68,91,5,5,1.625925e+09,1.643361e+09,1.609863e+09,3.526107,4.248495,0.293842
2,00games,0.0,0.0,0.0,4.0,2,3,4,4,1.630524e+09,1.646246e+09,1.580752e+09,14.596170,1.386294,1.216347
3,00saesthetics,0.0,0.0,0.0,6.0,2836,2995,6,6,1.624697e+09,1.642362e+09,1.562924e+09,21.375357,7.950855,1.781280
4,00sbabies,0.0,0.0,0.0,6.0,300,298,6,6,1.625180e+09,1.642880e+09,1.595696e+09,8.913409,5.710427,0.742784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130846,zyramains,0.0,0.0,0.0,9.0,10085,11382,9,9,1.624172e+09,1.642224e+09,1.419736e+09,75.824785,9.219003,6.318732
130847,zyxcomments,0.0,0.0,0.0,1.0,8,8,1,1,1.627579e+09,1.644794e+09,1.562897e+09,21.385681,2.302585,1.782140
130848,zyzz,1.0,0.0,1.0,1.0,7245,11991,2,2,1.624310e+09,1.642229e+09,1.311994e+09,116.795459,8.888343,9.732955
130849,zztails,0.0,0.0,0.0,2.0,137,142,2,2,1.625469e+09,1.643165e+09,1.546838e+09,27.492575,4.934474,2.291048


In [3]:
# Creating dataset
def correspondence_indicator(data, target, others):
    """Flag rows where the `target` edit type occurred exactly once and alone.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the edit-count columns.
    target : str
        Name of the count column for the edit type of interest.
    others : list of str
        Names of the count columns for the remaining edit types.

    Returns
    -------
    pd.Series
        Float series aligned with ``data``:
        1.0 where ``target`` equals 1 and every column in ``others`` equals 0,
        0.0 for every other row where ``target`` is at least 1,
        NaN where ``target`` is below 1 or missing.
    """
    occurred = data[target] >= 1
    isolated_single = data[target] == 1
    for column in others:
        # A missing count never compares equal to 0, so such rows are coded 0.0
        # (as long as the target edit occurred), same as an explicit non-zero count.
        isolated_single = isolated_single & (data[column] == 0)
    # Rows where the target edit did not occur stay NaN so they can be filtered out later.
    return isolated_single.astype(float).where(occurred)


regression_data['addition_correspondence'] = correspondence_indicator(
    regression_data, 'added', ['deleted', 'changed'])

regression_data['deletion_correspondence'] = correspondence_indicator(
    regression_data, 'deleted', ['added', 'changed'])

regression_data['change_correspondence'] = correspondence_indicator(
    regression_data, 'changed', ['added', 'deleted'])


In [4]:
# Turn +/-inf into NaN. Reassign instead of using inplace=True so the cell
# does not mutate the frame object behind the reader's back.
regression_data = regression_data.replace([np.inf, -np.inf], np.nan)

In [5]:
# One subset per outcome: keep only the rows where that outcome is defined (not NaN).
added_data = regression_data.dropna(subset=['addition_correspondence'])
deleted_data = regression_data.dropna(subset=['deletion_correspondence'])
changed_data = regression_data.dropna(subset=['change_correspondence'])

In [9]:
# Display the subset of rows whose addition_correspondence is not NaN.
added_data

Unnamed: 0,communityID,added,changed,deleted,unchanged,subscribers_1,subscribers_2,rules_1,rules_2,timestamp_1,timestamp_2,founding_date,age_in_months,log_subscribers,age_in_years,addition_correspondence,deletion_correspondence,change_correspondence
14,09axet,4.0,0.0,6.0,1.0,3,3,7,5,1.629840e+09,1.645974e+09,1.606428e+09,4.832483,1.609438,0.402707,0.0,0.0,
28,0xpolygon,2.0,0.0,0.0,7.0,17427,36565,7,9,1.625983e+09,1.639596e+09,1.591701e+09,10.432636,9.765891,0.869386,0.0,,
80,10smusic,1.0,0.0,0.0,3.0,479,520,3,4,1.625024e+09,1.642713e+09,1.395641e+09,84.987359,6.175867,7.082280,1.0,,
170,18nsfw,3.0,0.0,2.0,4.0,298130,460645,6,7,1.623293e+09,1.639552e+09,1.513908e+09,40.014615,12.605292,3.334551,0.0,0.0,
177,1900smusic,1.0,0.0,0.0,3.0,547,605,3,4,1.624990e+09,1.642671e+09,1.396133e+09,84.800176,6.308098,7.066681,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130759,zoophiles,3.0,0.0,0.0,1.0,3,40,1,4,1.631929e+09,1.645843e+09,1.602081e+09,6.485242,1.609438,0.540437,0.0,,
130765,zootopiaporn,1.0,0.0,1.0,7.0,47626,60721,8,8,1.623371e+09,1.639580e+09,1.456233e+09,61.946129,10.771176,5.162177,0.0,0.0,
130824,zutaranation,3.0,0.0,1.0,7.0,623,847,8,10,1.624990e+09,1.642597e+09,1.591449e+09,10.528511,6.437752,0.877376,0.0,0.0,
130827,zvedatori,1.0,0.0,0.0,2.0,949,1040,2,3,1.624876e+09,1.642541e+09,1.541621e+09,29.476287,6.857514,2.456357,1.0,,


# Regression

In [5]:
# Logit of change_correspondence on log_subscribers, age_in_years, rules_1 and
# their pairwise interactions (in the formula language `a*b` means a + b + a:b).
# The full frame is passed in; the printed summary reports far fewer observations
# than the frame has rows, so rows with a missing outcome are evidently excluded.
exp = """change_correspondence ~ log_subscribers + age_in_years + rules_1 + log_subscribers*age_in_years
        + log_subscribers*rules_1 + age_in_years*rules_1"""
m01 = logit(formula=exp, data=regression_data).fit()
print(m01.summary())

Optimization terminated successfully.
         Current function value: 0.593797
         Iterations 5
                             Logit Regression Results                            
Dep. Variable:     change_correspondence   No. Observations:                  568
Model:                             Logit   Df Residuals:                      561
Method:                              MLE   Df Model:                            6
Date:                   Sun, 28 May 2023   Pseudo R-squ.:                 0.01201
Time:                           12:33:18   Log-Likelihood:                -337.28
converged:                          True   LL-Null:                       -341.38
Covariance Type:               nonrobust   LLR p-value:                    0.2240
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept                       -0.0484      0.7

In [6]:
# Logit of addition_correspondence on log_subscribers, age_in_years, rules_1 and
# their pairwise interactions (in the formula language `a*b` means a + b + a:b).
# The full frame is passed in; the printed summary reports far fewer observations
# than the frame has rows, so rows with a missing outcome are evidently excluded.
exp = """addition_correspondence ~ log_subscribers + age_in_years + rules_1 + log_subscribers*age_in_years
        + log_subscribers*rules_1 + age_in_years*rules_1"""
m01 = logit(formula=exp, data=regression_data).fit()
print(m01.summary())

Optimization terminated successfully.
         Current function value: 0.600332
         Iterations 5
                              Logit Regression Results                             
Dep. Variable:     addition_correspondence   No. Observations:                 6466
Model:                               Logit   Df Residuals:                     6459
Method:                                MLE   Df Model:                            6
Date:                     Sun, 28 May 2023   Pseudo R-squ.:                 0.02103
Time:                             12:33:20   Log-Likelihood:                -3881.7
converged:                            True   LL-Null:                       -3965.1
Covariance Type:                 nonrobust   LLR p-value:                 2.149e-33
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept                       

In [7]:
# Logit of deletion_correspondence on log_subscribers, age_in_years, rules_1 and
# their pairwise interactions (in the formula language `a*b` means a + b + a:b).
# The full frame is passed in; the printed summary reports far fewer observations
# than the frame has rows, so rows with a missing outcome are evidently excluded.
exp = """deletion_correspondence ~ log_subscribers + age_in_years + rules_1 + log_subscribers*age_in_years
        + log_subscribers*rules_1 + age_in_years*rules_1"""
m01 = logit(formula=exp, data=regression_data).fit()
print(m01.summary())

Optimization terminated successfully.
         Current function value: 0.228822
         Iterations 7
                              Logit Regression Results                             
Dep. Variable:     deletion_correspondence   No. Observations:                 3878
Model:                               Logit   Df Residuals:                     3871
Method:                                MLE   Df Model:                            6
Date:                     Sun, 28 May 2023   Pseudo R-squ.:                0.008250
Time:                             12:33:21   Log-Likelihood:                -887.37
converged:                            True   LL-Null:                       -894.75
Covariance Type:                 nonrobust   LLR p-value:                   0.02217
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept                       