<a href="https://colab.research.google.com/github/lokesh89414/A-B-Testing-Analysis/blob/main/A_B_Testing_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

data = pd.read_csv('ab_test_data.csv')
print(data.head())
print(data.isnull().sum())
data.dropna(inplace=True)

X = data.drop(columns=['userid'])
y = data['retention_7']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features = ['sum_gamerounds']
categorical_features = ['version', 'retention_1']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


X_test_with_version = data.drop(columns=['userid', 'retention_7'])
y_test_actual = data['retention_7']


X_test_preprocessed = preprocessor.transform(X_test_with_version)
y_test_pred = model.predict(X_test_preprocessed)

X_test_with_version['predicted_retention'] = y_test_pred

retention_rate_version = X_test_with_version.groupby('version')['predicted_retention'].mean()

print(retention_rate_version)

better_version = retention_rate_version.idxmax()
print(f"The better version is: {better_version}")

joblib.dump(model, 'ab_test_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')

   userid  version  sum_gamerounds  retention_1  retention_7
0     116  gate_30               3        False        False
1     337  gate_30              38         True        False
2     377  gate_40             165         True        False
3     483  gate_40               1        False        False
4     488  gate_40             179         True         True
userid            0
version           0
sum_gamerounds    0
retention_1       0
retention_7       0
dtype: int64
Accuracy: 0.8679454484976161
              precision    recall  f1-score   support

       False       0.89      0.95      0.92     14661
        True       0.71      0.49      0.58      3377

    accuracy                           0.87     18038
   macro avg       0.80      0.72      0.75     18038
weighted avg       0.86      0.87      0.86     18038

version
gate_30    0.130694
gate_40    0.126910
Name: predicted_retention, dtype: float64
The better version is: gate_30


['preprocessor.pkl']