In [67]:
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency
import statsmodels.api as sm

In [45]:
data = pd.read_csv('cookie_cats.txt', sep=',')

In [47]:
df = pd.DataFrame(data)

In [49]:
print(data.head())

   userid  version  sum_gamerounds  retention_1  retention_7
0     116  gate_30               3        False        False
1     337  gate_30              38         True        False
2     377  gate_40             165         True        False
3     483  gate_40               1        False        False
4     488  gate_40             179         True         True


In [51]:
print(data.isnull().sum())

userid            0
version           0
sum_gamerounds    0
retention_1       0
retention_7       0
dtype: int64


In [53]:
group_30 = data[data['version'] == 'gate_30']['sum_gamerounds']
group_40 = data[data['version'] == 'gate_40']['sum_gamerounds']

In [55]:
print(f'Mean for gate_30: {group_30.mean()}')
print(f'Mean for gate_40: {group_40.mean()}')

Mean for gate_30: 52.45626398210291
Mean for gate_40: 51.29877552814966


In [57]:
u_stat, p_value = stats.mannwhitneyu(group_30, group_40)

print(f'u_stat: {u_stat}')
print(f'p-value: {p_value}')

u_stat: 1024331250.5
p-value: 0.05020880772044255


In [59]:
returned_1 = df[df['retention_1'] == True].groupby('version').size()
returned_7 = df[df['retention_7'] == True].groupby('version').size()

print(f'returned_1 true: {returned_1}')
print(f'returned_7 true: {returned_7}')

returned_1 true: version
gate_30    20034
gate_40    20119
dtype: int64
returned_7 true: version
gate_30    8502
gate_40    8279
dtype: int64


In [61]:
returned_1_0 = df[df['retention_1'] == False].groupby('version').size()
returned_7_0 = df[df['retention_7'] == False].groupby('version').size()

print(f'returned_1_0 true: {returned_1_0}')
print(f'returned_7_0 true: {returned_7_0}')

returned_1_0 true: version
gate_30    24666
gate_40    25370
dtype: int64
returned_7_0 true: version
gate_30    36198
gate_40    37210
dtype: int64


In [63]:
grouped_counts = df.groupby('version').size()

print(f'grouped_counts: {grouped_counts}')

grouped_counts: version
gate_30    44700
gate_40    45489
dtype: int64


In [69]:
z_stat, p_value = sm.stats.proportions_ztest([20034, 20119], [44700, 45489])

print(f'Z-st: {z_stat}')
print(f'p-value: {p_value}')

Z-st: 1.7840862247974725
p-value: 0.07440965529691913


In [71]:
df['version'] = df['version'].map({'gate_30': 0, 'gate_40': 1})

In [73]:
X = df[['version', 'sum_gamerounds']]  
y = df['retention_1']  

In [75]:
X = sm.add_constant(X)

In [77]:
logit_model = sm.Logit(y, X)
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: inf
         Iterations 8


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


In [79]:
print(result.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                           Logit Regression Results                           
Dep. Variable:            retention_1   No. Observations:                90189
Model:                          Logit   Df Residuals:                    90186
Method:                           MLE   Df Model:                            2
Date:                Wed, 23 Oct 2024   Pseudo R-squ.:                    -inf
Time:                        15:18:58   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                       -61972.
Covariance Type:            nonrobust   LLR p-value:                     1.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -1.1568      0.013    -90.286      0.000      -1.182      -1.132
version           -0.0303      0.015     -1.967      0.049      -0.061      -0.000
sum_gamerounds     0.0273      0.000

In [83]:
lin_model = sm.OLS(y, X)
lin_result = lin_model.fit()

print(lin_result.summary())

                            OLS Regression Results                            
Dep. Variable:            retention_1   R-squared:                       0.039
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     1834.
Date:                Wed, 23 Oct 2024   Prob (F-statistic):               0.00
Time:                        15:19:42   Log-Likelihood:                -63116.
No. Observations:               90189   AIC:                         1.262e+05
Df Residuals:                   90186   BIC:                         1.263e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.4218      0.002    179.

In [85]:
X = df[['version', 'sum_gamerounds']]  
y = df['retention_7']  

In [87]:
X = sm.add_constant(X)

In [89]:
logit_model = sm.Logit(y, X)
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.332134
         Iterations 8


In [91]:
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:            retention_7   No. Observations:                90189
Model:                          Logit   Df Residuals:                    90186
Method:                           MLE   Df Model:                            2
Date:                Wed, 23 Oct 2024   Pseudo R-squ.:                  0.3087
Time:                        16:39:45   Log-Likelihood:                -29955.
converged:                       True   LL-Null:                       -43333.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -2.6932      0.018   -147.963      0.000      -2.729      -2.658
version           -0.0833      0.021     -3.889      0.000      -0.125      -0.041
sum_gamerounds     0.0209      0.000

In [93]:
lin_model = sm.OLS(y, X)
lin_result = lin_model.fit()

print(lin_result.summary())

                            OLS Regression Results                            
Dep. Variable:            retention_7   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                     3820.
Date:                Wed, 23 Oct 2024   Prob (F-statistic):               0.00
Time:                        16:39:58   Log-Likelihood:                -39188.
No. Observations:               90189   AIC:                         7.838e+04
Df Residuals:                   90186   BIC:                         7.841e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.1610      0.002     89.

In [95]:
max_value = df['sum_gamerounds'].max()

In [97]:
df = df[df['sum_gamerounds'] != max_value]

In [99]:
df.reset_index(drop=True, inplace=True)

In [101]:
print(f'Mean for gate_30: {group_30.mean()}')
print(f'Mean for gate_40: {group_40.mean()}')

Mean for gate_30: 52.45626398210291
Mean for gate_40: 51.29877552814966


In [105]:
Q1 = df['sum_gamerounds'].quantile(0.25)
Q3 = df['sum_gamerounds'].quantile(0.75)  
IQR = Q3 - Q1  

In [107]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [109]:
df_clean = df[(df['sum_gamerounds'] >= lower_bound) & (df['sum_gamerounds'] <= upper_bound)]
print(f"Num of deleted rows: {len(df) - len(df_clean)}")

Num of deleted rows: 10176
