In [1]:
import pandas as pd

### import extra package
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#### modelling package
from statsmodels.graphics.mosaicplot import mosaic
import statsmodels
from statsmodels.formula.api import logit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#### importing evaluation package 
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, auc,\
precision_recall_curve, roc_curve, average_precision_score
from sklearn import metrics

### Data Preparation 

In [6]:
### load data
# Each table is read, de-duplicated where the original did so (users.csv is
# loaded as-is), and has its date column parsed to datetime in one chain.
user_df = pd.read_csv("users.csv").assign(
    install_date=lambda users: pd.to_datetime(users["install_date"])
)
sessions_df = (
    pd.read_csv("sessions.csv")
    .drop_duplicates()
    .assign(date=lambda events: pd.to_datetime(events["date"]))
)
iaps_df = (
    pd.read_csv("iaps.csv")
    .drop_duplicates()
    .assign(date=lambda events: pd.to_datetime(events["date"]))
)
spending_df = (
    pd.read_csv("spendevents.csv")
    .drop_duplicates()
    .assign(date=lambda events: pd.to_datetime(events["date"]))
)

In [7]:
### extract paying users
# Inner join: one row per purchase, carrying the purchasing user's install info.
users_iaps = pd.merge(user_df, iaps_df, on="user_id")

# Whole days elapsed between install and each purchase.
users_iaps["days_to_first_purchase"] = (
    users_iaps["date"] - users_iaps["install_date"]
).dt.days

# Rank every user's purchases chronologically using the full timestamp.
# Ranking on the whole-day difference leaves same-day purchases tied, and
# method="first" then breaks the tie by row order in the file, so the row kept
# as "first purchase" (and its iap_date) was not guaranteed to be the earliest.
users_iaps["rank"] = (
    pd.to_datetime(users_iaps["ts"])
    .groupby(users_iaps["user_id"])
    .rank(method="first", ascending=True)
    .astype(int)
)

# Keep only each user's earliest purchase.
paying_users_df = users_iaps[users_iaps["rank"] == 1]
first_iap_day_df = paying_users_df[
    ["user_id", "ts", "days_to_first_purchase", "install_date"]
].rename(columns={"ts": "iap_date"})  # original author's note: 1526 paying users
first_iap_day_df.head()

Unnamed: 0,user_id,iap_date,days_to_first_purchase,install_date
0,2,2019-03-07 11:50:37,1,2019-03-06
1,21,2019-03-09 10:18:32,2,2019-03-07
4,36,2019-03-02 23:34:17,0,2019-03-02
10,51,2019-03-02 21:19:39,0,2019-03-02
20,85,2019-04-17 18:26:44,41,2019-03-07


### extract non-paying users 
- Remove inactive players (those who were not active within 7 days).
- All users who have not converted after 60 days and are still active will be labeled 0. (Note: the filter cell below currently uses a 45-day cutoff — reconcile the two.)


In [23]:
# Users with no row in the purchases table (original author's note: 21050 non-paying users).
non_payer_df = user_df.loc[~user_df["user_id"].isin(iaps_df["user_id"].unique())]

# Attach each non-payer's sessions (inner join on user_id), then record how many
# whole days after install each session took place.
session_non_payer_df = non_payer_df.merge(sessions_df, on="user_id")
session_non_payer_df["days_from_installed"] = (
    session_non_payer_df["date"] - session_non_payer_df["install_date"]
).dt.days

In [43]:
# Number of distinct non-paying users with at least one session more than
# 45 days after install -- the label-0 candidates for training. The count is
# shown as the cell output rather than hard-coded in a comment.
session_non_payer_df.loc[
    session_non_payer_df["days_from_installed"].gt(45), "user_id"
].nunique()

2463

### remove non-active users 

In [19]:
# Preview the first rows of the non-paying users frame; this cell only displays
# data and does not filter anything.
non_payer_df.head()

Unnamed: 0,user_id,install_date,lang,country,hw_ver,os_ver
0,0,2019-03-01,en,US,"iPhone4,1",9.1
1,1,2019-03-01,en,IN,"iPod5,1",8.1.2
3,3,2019-03-03,nb,NO,"iPhone8,1",9.2.1
4,4,2019-03-03,en,GB,"iPhone5,4",9.2.1
5,5,2019-03-07,en,US,"iPhone5,3",9.2.1


In [20]:
# Latest session timestamp in the data.
# NOTE(review): the recorded output is a quoted string, so "ts" appears to be
# unparsed text and max() compares lexicographically -- that matches
# chronological order only for zero-padded ISO-style timestamps; confirm.
sessions_df["ts"].max()

'2019-05-06 23:35:42'

### Extract label 0 class
- Users who have not converted 60 days after their install date. (Note: the filter in the code currently uses a 45-day cutoff — reconcile the two.)

### Extract label 1 class
- Users who made their first in-app purchase within 45 days of installing the app.

In [42]:
# Label 1: payers whose first purchase came no more than 45 days after install.
label1_users = first_iap_day_df.loc[
    first_iap_day_df["days_to_first_purchase"].le(45)
]
label1_users["user_id"].nunique()

1493