In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
from IPython.display import display

In [2]:
data = pd.read_csv('./../data/raw/conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


## Pre-processing pipeline

### Re-split data with all features

All features <br>
Test_set ratio : 20% 

In [4]:
target_label = ['converted']
X = data.drop(target_label, axis=1)
y = data[target_label].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Create transformation pipeline

In [9]:
num_features = X_train.select_dtypes(include=np.number).columns
cat_features = X_train.select_dtypes(include=object).columns
# Transformer for extra features from Date
# transformer for numerical features
num_transformer = Pipeline([
        ('imputer_num', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ])
# transformer for categorical features
cat_transformer = Pipeline([
        ('imputer_cat', SimpleImputer(strategy = 'most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features)
    ],
    remainder = 'drop'
)

In [10]:
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(5),
    #LinearSVC(),
    #NuSVC(probability=True),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    XGBClassifier(use_label_encoder=False),
    #GradientBoostingClassifier()
    ]


### Train models to pick 1-3 best performances

In [13]:
def predict_compare(model, X, y_true):
    y_pred = model.predict(X)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1

In [14]:
headers = ['obj_clf', 'classifier', 'train_precision', 'train_recall', 'train_f1','test_precision', 'test_recall', 'test_f1']
values = list()
res_pipelines = list()
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessing', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    
    p, r, f1 = predict_compare(pipe, X_train, y_train)
    p_test, r_test, f1_test = predict_compare(pipe, X_test, y_test)
    values.append([classifier, str(classifier), p, r, f1, p_test, r_test, f1_test])
    #print("model score: %.3f" % pipe.score(X_test, y_test))   
    
#print(values)
clf_perf = pd.DataFrame(values, columns=headers)
clf_perf

  return f(**kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(**kwargs)




Unnamed: 0,obj_clf,classifier,train_precision,train_recall,train_f1,test_precision,test_recall,test_f1
0,LogisticRegression(),LogisticRegression(),0.857994,0.6882,0.763774,0.847113,0.694833,0.763454
1,KNeighborsClassifier(),KNeighborsClassifier(),0.857844,0.716198,0.780648,0.814933,0.687298,0.745693
2,"(DecisionTreeClassifier(max_features='auto', r...",RandomForestClassifier(),0.893635,0.728626,0.802738,0.811429,0.687836,0.744538
3,"(DecisionTreeClassifier(max_depth=1, random_st...",AdaBoostClassifier(),0.849975,0.684786,0.75849,0.847167,0.692142,0.761848
4,"XGBClassifier(base_score=0.5, booster='gbtree'...","XGBClassifier(base_score=0.5, booster='gbtree'...",0.862616,0.698033,0.771646,0.838014,0.690527,0.757156


### Feature importances from RandomForest classifier

In [15]:
clf = clf_perf['obj_clf'][2]

In [18]:
clf.feature_importances_

array([0.02307355, 0.00293123, 0.00431924, 0.00451285, 0.0027011 ,
       0.00269106, 0.00281991, 0.11525862, 0.04020932, 0.80148313])

In [25]:
def feature_importances(classifier):
    #cat_encoder_attribs = np.asarray(pipe.named_steps['preprocessing'].named_transformers_['categoricals'].named_steps['onehot'].categories_, dtype=object)
    #cat_attribs = np.concatenate([cat_list for cat_list in cat_encoder_attribs])
    #all_features = np.concatenate([cat_attribs, num_features])
    #sorted(zip(pipe.steps[1][1].feature_importances_, all_features), reverse=True)
    all_features = ['China', 'Germany', 'UK', 'US', 'Ads', 'Direct', 'Seo', 'age','new_user', 'total_pages_visited']
    return sorted(zip(classifier.feature_importances_, all_features), reverse=True)
    

In [24]:
feature_importances(clf)

[(0.8014831272102537, 'total_pages_visited'),
 (0.11525861827652759, 'age'),
 (0.04020932279178694, 'new_user'),
 (0.023073545293313305, 'China'),
 (0.00451284519097652, 'US'),
 (0.004319243701478928, 'UK'),
 (0.0029312324661483913, 'Germany'),
 (0.0028199141888572148, 'Seo'),
 (0.0027010954770602413, 'Ads'),
 (0.0026910554035971318, 'Direct')]

### SearchCV to find best parameters for RandomForestClassifier

In [27]:
ran_forest = Pipeline(steps=[('preprocessing', preprocessor),
                      ('classifier', RandomForestClassifier())])
param_grid = { 
    'classifier__n_estimators': [400],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [10],
    #'classifier__criterion' :['gini', 'entropy']
}

from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(ran_forest, param_grid, n_jobs= 4, cv=3)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)    
print(CV.best_score_)
CV.score(X_test, y_test)


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


{'classifier__max_depth': 10, 'classifier__max_features': 'log2', 'classifier__n_estimators': 400}
0.9859837304097266


0.9855928034296155

In [28]:
print(CV.score(X_test, y_test))

0.9855928034296155


In [29]:
print(f1_score(y_test, CV.predict(X_test)))

0.7552238805970151


In [35]:
y_train = y_train.ravel()
y_test = y_test.ravel()

In [37]:
ada_clf = Pipeline(steps=[('preprocessing', preprocessor),
                      ('classifier', AdaBoostClassifier())])

param_grid = { 
    'classifier__n_estimators': [350, 500],
    'classifier__learning_rate' :[1.0, 1.2]
}
CV = GridSearchCV(ada_clf, param_grid, n_jobs= 4, cv=3)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)    
print(f'best score: {CV.best_score_}')


{'classifier__learning_rate': 1.0, 'classifier__n_estimators': 350}
best score: 0.9861813901187716
f1 train: {f1_score(y_train, CV.predict(X_train))}
f1 test: {f1_score(y_test, CV.predict(X_test))}


In [38]:
print(f'f1 train: {f1_score(y_train, CV.predict(X_train))}')
print(f'f1 test: {f1_score(y_test, CV.predict(X_test))}')

f1 train: 0.7629411320184387
f1 test: 0.768141592920354


In [39]:
print(f'recall test: {recall_score(y_test, CV.predict(X_test))}')

recall test: 0.7007534983853606


In [40]:
print(f'precision test: {precision_score(y_test, CV.predict(X_test))}')

precision test: 0.8498694516971279


### Feature selection

In [41]:
data

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0
...,...,...,...,...,...,...
284575,US,36,1,Ads,1,0
284576,US,31,1,Seo,2,0
284577,US,41,1,Seo,5,0
284578,US,31,1,Direct,4,0


In [52]:
bins = [10, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 100]
group_names = ['0-19','20-24','25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-60','60-64', '65-80']
age_categories = pd.cut(X_train['age'], bins, labels=group_names)
X_train['age_categories'] = pd.cut(X_train['age'], bins, labels=group_names)
age_categories
pd.value_counts(X_train['age_categories'])

25-29    51267
30-34    45696
20-24    44027
35-39    32237
0-19     25696
40-44    17834
45-49     7541
50-54     2563
55-60      647
60-64      135
65-80       20
Name: age_categories, dtype: int64

In [49]:
CV.best_estimator_

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['country', 'source'], dtype='object')),
                                                 ('numericals',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
            

In [50]:
Pipeline(steps=[('preprocessing', preprocessor),
                      ('classifier', AdaBoostClassifier(n_estimators=350))])

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['country', 'source'], dtype='object')),
                                                 ('numericals',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
            

In [None]:

preprocessor = ColumnTransformer([
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features)
    ],
    remainder = 'drop'
)