In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import  RandomForestClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

data = pd.read_csv('./../data/raw/conversion_data_train.csv')
print('Data :', data.shape)

Data : (284580, 6)


### Train test split

In [4]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [55]:
X_train = train_set.drop('converted', axis=1)
y_train = train_set['converted'].copy()

X_test = test_set.drop('converted', axis=1)
y_test = test_set.loc[:, 'converted'].copy()

#train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
print(test_set.shape)
print(X_test.shape)
print(y_test.shape)
print(type(y_test))

(56916, 6)
(56916, 5)
(56916,)
<class 'pandas.core.series.Series'>


### Create transformation pipeline

In [8]:
import preprocessing

In [9]:
num_features = X_train.select_dtypes(include=np.number).columns.to_list()
cat_features = X_train.select_dtypes(include=object).columns.to_list()
preprocessor = preprocessing.preprocess_pipeline(num_features, cat_features)

best_ranforest = RandomForestClassifier(max_depth=10, n_estimators=400, max_features=5)

In [11]:
pipeline = Pipeline(steps=[('preprocessing', preprocessor),
                           ('classifier', best_ranforest)])

#### SearchCV for best parameters

In [20]:
random_forest = Pipeline(steps=[('preprocessing', preprocessor),
                      ('classifier', RandomForestClassifier())])
param_grid = { 
    'classifier__n_estimators': [10, 100, 300],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [6,10,12],
    #'classifier__criterion' :['gini', 'entropy']
}

from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(random_forest, param_grid, n_jobs= 4, cv=3)
                  
CV.fit(X_train, y_train.ravel())

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('categoricals',
                                                                         Pipeline(steps=[('imputer_cat',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['country',
                                                                          'source']),
                                                                        ('numericals',
                                                                         Pipeline(steps=[('imputer_num',
  

In [23]:

print(f'Best params: {CV.best_params_}')  
print(f'Best score: {CV.best_score_}')
print(f'Best estimator: {CV.best_estimator_}')

#CV.score(X_test, y_test.values.ravel())

Best params: {'classifier__max_depth': 10, 'classifier__max_features': 'auto', 'classifier__n_estimators': 300}
Best score: 0.9859881228477053
Best estimator: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['country', 'source']),
                                                 ('numericals',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                         

### Best classifier

In [24]:
best_classifier = CV.best_estimator_

In [57]:
best_classifier.fit(X_train, y_train.ravel())

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['country', 'source']),
                                                 ('numericals',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                   

Train: F1 score

In [58]:
f1_score(y_train, best_classifier.predict(X_train))

0.7804359383306751

Validation: F1 score

In [59]:
f1_score(y_test, best_classifier.predict(X_test))

0.7542448614834674

> Best model has F1 score for the train set 0.78 and 0.75 for the test set

In [60]:
y_pred = best_classifier.predict(X_test)

In [61]:
test_set2 = X_test.copy()
test_set2['y_true'] = y_test
test_set2['y_pred'] = y_pred

In [62]:
test_set2[test_set2['y_true']!=test_set2['y_pred']]

Unnamed: 0,country,age,new_user,source,total_pages_visited,y_true,y_pred
110983,UK,19,0,Ads,12,0,1
273519,UK,25,0,Ads,8,1,0
141605,US,36,1,Direct,15,1,0
144795,US,19,1,Direct,9,1,0
207311,UK,30,1,Direct,13,1,0
...,...,...,...,...,...,...,...
230597,US,20,0,Seo,7,1,0
100522,US,21,0,Direct,13,0,1
238654,US,29,0,Ads,10,1,0
215024,US,25,1,Seo,10,1,0


### Feature importances

In [63]:
best_classifier.named_steps['classifier'].feature_importances_

array([0.02539296, 0.00243353, 0.00336521, 0.00363165, 0.00165936,
       0.00170853, 0.00172472, 0.0470345 , 0.04277524, 0.8702743 ])

In [64]:
cat_encoder_attribs = np.asarray(best_classifier.named_steps['preprocessing'].named_transformers_['categoricals'].named_steps['onehot'].categories_, dtype=object)

In [65]:
cat_attribs = np.concatenate([cat_list for cat_list in cat_encoder_attribs])

In [66]:
all_features = np.concatenate([cat_attribs, num_features])
all_features

array(['China', 'Germany', 'UK', 'US', 'Ads', 'Direct', 'Seo', 'age',
       'new_user', 'total_pages_visited'], dtype=object)

In [67]:
sorted(zip(pipeline.named_steps['classifier'].feature_importances_, all_features), reverse=True)

[(0.8570312549081754, 'total_pages_visited'),
 (0.047634008843314375, 'age'),
 (0.043586663192660954, 'new_user'),
 (0.03448674536506501, 'China'),
 (0.003439760258732291, 'US'),
 (0.002976700290983001, 'Direct'),
 (0.0029130269013013265, 'UK'),
 (0.002904866609878696, 'Seo'),
 (0.0028754834923487922, 'Ads'),
 (0.002151490137540174, 'Germany')]

In [68]:
from sklearn.model_selection import cross_val_score

In [69]:
recall_score(y_test, best_classifier.predict(X_test))

0.6813778256189451

## Training with whole data set

In [71]:
data

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0
...,...,...,...,...,...,...
284575,US,36,1,Ads,1,0
284576,US,31,1,Seo,2,0
284577,US,41,1,Seo,5,0
284578,US,31,1,Direct,4,0


In [72]:
X_train = data.drop('converted', axis=1)
y_train = data['converted'].copy()
    
best_classifier.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['country', 'source']),
                                                 ('numericals',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                   

F1 score

In [73]:
f1_score(y_train, best_classifier.predict(X_train))

0.7792285937782056

> There were no gain in f1_socre when training with whole data set

### Save model

In [75]:
import joblib

In [76]:
joblib.dump(best_classifier, '../models/my_classifier.pkl')

['../models/my_classifier.pkl']