In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# Load the A/B-test export (one row per user: userid, version, sum_gamerounds,
# retention_1, retention_7) from the working directory.
data = pd.read_csv('cookie_cats.csv')

# Data validation
- check outliers
- missing data
- data entry errors

In [None]:
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,gate_30,3,False,False
1,337,gate_30,38,True,False
2,377,gate_40,165,True,False
3,483,gate_40,1,False,False
4,488,gate_40,179,True,True


In [None]:
# Tally null entries per column; all-zero counts mean no imputation is needed.
missing_values = data.isna().sum()
print("Missing Values:", missing_values, sep="\n")

Missing Values:
userid            0
version           0
sum_gamerounds    0
retention_1       0
retention_7       0
dtype: int64


In [None]:
# Flag outliers with the 1.5 * IQR fence around the quartiles of each numeric column.
# `outliers` holds the rows flagged for the last column processed; the cell that
# drops outliers reads it, so the name must stay as is.
numeric_columns = ['sum_gamerounds']
for column in numeric_columns:
    q1, q3 = data[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    beyond_fence = data[column].lt(lower_bound) | data[column].gt(upper_bound)
    outliers = data.loc[beyond_fence]
    if outliers.empty:
        continue
    print("Outliers in column", column)
    print(outliers)

Outliers in column sum_gamerounds
        userid  version  sum_gamerounds  retention_1  retention_7
2          377  gate_40             165         True        False
4          488  gate_40             179         True         True
5          540  gate_40             187         True         True
9         1587  gate_40             153         True        False
14        2218  gate_30             305         True         True
...        ...      ...             ...          ...          ...
90121  9991145  gate_30             328         True         True
90125  9991408  gate_40             186         True         True
90134  9991949  gate_30             191         True         True
90150  9995412  gate_40             253         True         True
90160  9996269  gate_30             143        False        False

[10177 rows x 5 columns]


In [None]:
# Remove the rows flagged by the IQR outlier check.
# errors="ignore" skips index labels that are already gone, so re-running this
# cell is a no-op instead of raising KeyError.
# NOTE(review): this discards 10,177 of 90,189 users (about 11%) based on an
# engagement metric before retention is compared; confirm that trimming heavy
# players is intended, since it can bias the retention estimates.
data = data.drop(outliers.index, errors="ignore")

In [None]:
# Summarise each numeric column after outlier removal to sanity-check its range.
for column, values in data[numeric_columns].items():
    statistics = values.describe()
    print("Statistics for column", column)
    print(statistics)

Statistics for column sum_gamerounds
count    80012.000000
mean        23.979178
std         27.481366
min          0.000000
25%          4.000000
50%         13.000000
75%         34.000000
max        120.000000
Name: sum_gamerounds, dtype: float64


In [None]:
# Show any fully duplicated rows (every column identical to an earlier row).
duplicates = data.duplicated()
print("Duplicate Rows:", data.loc[duplicates], sep="\n")

# List the dtype of each column so unexpected types stand out.
data_types = data.dtypes
print("Data Types:", data_types, sep="\n")

Duplicate Rows:
Empty DataFrame
Columns: [userid, version, sum_gamerounds, retention_1, retention_7]
Index: []
Data Types:
userid             int64
version           object
sum_gamerounds     int64
retention_1         bool
retention_7         bool
dtype: object


# Descriptive statistics

## Using the SciPy library

In [None]:
# Partition users by experiment arm: A = gate at level 30, B = gate at level 40.
group_A = data.loc[data['version'] == 'gate_30']
group_B = data.loc[data['version'] == 'gate_40']

from scipy import stats

# Summary statistics of the boolean retention flags, per arm and per horizon.
# The mean of each flag is the retention rate for that arm.
stats_A_1 = stats.describe(group_A['retention_1'])
stats_B_1 = stats.describe(group_B['retention_1'])
stats_A_7 = stats.describe(group_A['retention_7'])
stats_B_7 = stats.describe(group_B['retention_7'])

# Print the four summaries, separated (but not followed) by a blank line.
descriptive_report = [
    ("Descriptive Statistics for Group A - retention_1:", stats_A_1),
    ("Descriptive Statistics for Group B - retention_1:", stats_B_1),
    ("Descriptive Statistics for Group A - retention_7:", stats_A_7),
    ("Descriptive Statistics for Group B - retention_7:", stats_B_7),
]
for report_position, (report_heading, report_summary) in enumerate(descriptive_report):
    if report_position > 0:
        print()
    print(report_heading)
    print(report_summary)


Descriptive Statistics for Group A - retention_1:
DescribeResult(nobs=39638, minmax=(False, True), mean=0.389096321711489, variance=0.23770637107358544, skewness=0.45494721775378544, kurtosis=-1.7930230290580893)

Descriptive Statistics for Group B - retention_1:
DescribeResult(nobs=40374, minmax=(False, True), mean=0.38336553227324516, variance=0.2364022562473611, skewness=0.4797737441315999, kurtosis=-1.7698171544419459)

Descriptive Statistics for Group A - retention_7:
DescribeResult(nobs=39638, minmax=(False, True), mean=0.11811897673949241, variance=0.10416951209490367, skewness=2.3664276798219186, kurtosis=3.5999799638273506)

Descriptive Statistics for Group B - retention_7:
DescribeResult(nobs=40374, minmax=(False, True), mean=0.10979838509932134, variance=0.09774512072064767, skewness=2.4961857115363766, kurtosis=4.230943106478369)


# Normality and homogeneity checks to choose the statistical test

In [None]:
# Shapiro-Wilk normality check for each metric, run separately per group.
# NOTE(review): each group has roughly 40k rows, and SciPy documents that the
# Shapiro-Wilk p-value may be inaccurate for N > 5000. retention_1 and
# retention_7 are boolean flags, so they cannot be normally distributed anyway.
# Treat these p-values as indicative only.

# Check normality - retention_1
_, p_value_A1 = stats.shapiro(group_A['retention_1'])
_, p_value_B1 = stats.shapiro(group_B['retention_1'])

print("Normality test for retention_1:")
print("Group A - p-value: {:.10f}".format(p_value_A1))
print("Group B - p-value: {:.10f}".format(p_value_B1))
print()

# Check normality - retention_7
_, p_value_A7 = stats.shapiro(group_A['retention_7'])
_, p_value_B7 = stats.shapiro(group_B['retention_7'])

print("Normality test for retention_7:")
print("Group A - p-value: {:.10f}".format(p_value_A7))
print("Group B - p-value: {:.10f}".format(p_value_B7))
print()

# Check normality - sum gamerounds
_, p_value_A_gamerounds = stats.shapiro(group_A['sum_gamerounds'])
_, p_value_B_gamerounds = stats.shapiro(group_B['sum_gamerounds'])

print("Normality test for sum_gamerounds:")
print("Group A - p-value: {:.10f}".format(p_value_A_gamerounds))
print("Group B - p-value: {:.10f}".format(p_value_B_gamerounds))
print()

# Check homogeneity of variance between group A and group B, one metric at a time.
# Levene's null hypothesis is that ALL samples passed in share one variance, so
# pooling the retention_1 and retention_7 samples in a single call would also
# compare the two metrics with each other and reject for that reason alone.
_, p_value_retention_1 = stats.levene(group_A['retention_1'], group_B['retention_1'])
_, p_value_retention_7 = stats.levene(group_A['retention_7'], group_B['retention_7'])

print("Homogeneity test (Levene's test) for retention variables:")
print("retention_1 - p-value: {:.10f}".format(p_value_retention_1))
print("retention_7 - p-value: {:.10f}".format(p_value_retention_7))
print()

Normality test for retention_1:
Group A - p-value: 0.0000000000
Group B - p-value: 0.0000000000

Normality test for retention_7:
Group A - p-value: 0.0000000000
Group B - p-value: 0.0000000000

Normality test for sum_gamerounds:
Group A - p-value: 0.0000000000
Group B - p-value: 0.0000000000

Homogeneity test (Levene's test) for retention variables:
p-value: 0.0000000000





--> **The p-values are far below 0.05 in both tests (for both variables), so the data are neither normally distributed nor homogeneous in variance.**

# Choose Mann Whitney U Test

In [None]:
from scipy.stats import mannwhitneyu

# Compare group A against group B on each metric with the Mann-Whitney U test
# (SciPy defaults).
# NOTE(review): retention_1 and retention_7 are boolean; a two-proportion z-test
# or chi-square test is the more conventional choice for them. Consider
# cross-checking.

# Day-1 retention
retention_1_group_A, retention_1_group_B = group_A['retention_1'], group_B['retention_1']
statistic_1, p_value_1 = mannwhitneyu(retention_1_group_A, retention_1_group_B)

# Day-7 retention
retention_7_group_A, retention_7_group_B = group_A['retention_7'], group_B['retention_7']
statistic_7, p_value_7 = mannwhitneyu(retention_7_group_A, retention_7_group_B)

# Total game rounds played
sum_gamerounds_group_A, sum_gamerounds_group_B = group_A['sum_gamerounds'], group_B['sum_gamerounds']
statistic_gamerounds, p_value_gamerounds = mannwhitneyu(sum_gamerounds_group_A, sum_gamerounds_group_B)

# Print the three results, with a rule between (not after) consecutive reports.
mann_whitney_report = [
    ("Mann-Whitney U Test Results for Retention 1:", statistic_1, p_value_1),
    ("Mann-Whitney U Test Results for Retention 7:", statistic_7, p_value_7),
    ("Mann-Whitney U Test Results for Sum Gamerounds:", statistic_gamerounds, p_value_gamerounds),
]
for report_position, (report_heading, u_stat, p_val) in enumerate(mann_whitney_report):
    if report_position > 0:
        print("-----------------------------------------")
    print(report_heading)
    print("Statistic:", u_stat)
    print("p-value:", p_val)


Mann-Whitney U Test Results for Retention 1:
Statistic: 804757925.0
p-value: 0.09598457875484477
-----------------------------------------
Mann-Whitney U Test Results for Retention 7:
Statistic: 806830213.0
p-value: 0.00021242240013123805
-----------------------------------------
Mann-Whitney U Test Results for Sum Gamerounds:
Statistic: 806916458.0
p-value: 0.03885510150342266


# Results
- The results showed that the p-value for the sum of game rounds was 0.039, for retention after 1 day 0.096, and for retention after 7 days 0.0002.
- With the given data and a significance level of 0.05, the null hypothesis was clearly rejected for retention after 7 days and was also rejected, by a narrower margin, for the sum of game rounds.
- The null hypothesis could not be rejected for retention after 1 day, which means the data do not show a statistically significant effect of the change on it.