In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

data = pd.read_csv('./../data/raw/conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


In [7]:
target_label = ['converted']
X = data.drop(target_label, axis=1)
y = data.loc[:, target_label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Feature engineering

Unnamed: 0,country,age,new_user,source,total_pages_visited
275577,US,20,0,Ads,6
51684,US,31,1,Seo,1
24711,US,32,0,Seo,2
48680,UK,27,0,Seo,6
55568,China,23,0,Ads,2
...,...,...,...,...,...
119879,US,44,1,Direct,3
259178,China,31,1,Ads,1
131932,UK,34,1,Ads,6
146867,China,43,0,Seo,3


### Create transformation pipeline

In [8]:
import preprocessing

In [9]:
num_features = X.select_dtypes(include=np.number).columns.to_list()
cat_features = X.select_dtypes(include=object).columns.to_list()
preprocessor = preprocessing.preprocess_pipeline(num_features, cat_features)

In [10]:
num_features

['age', 'new_user', 'total_pages_visited']

In [50]:
best_ranforest = RandomForestClassifier(max_depth=10, n_estimators=400, max_features=5)

In [51]:
pipeline = Pipeline(steps=[('preprocessing', preprocessor),
                           ('classifier', best_ranforest)])

In [52]:
X_train

Unnamed: 0,country,age,new_user,source,total_pages_visited
275577,US,20,0,Ads,6
51684,US,31,1,Seo,1
24711,US,32,0,Seo,2
48680,UK,27,0,Seo,6
55568,China,23,0,Ads,2
...,...,...,...,...,...
119879,US,44,1,Direct,3
259178,China,31,1,Ads,1
131932,UK,34,1,Ads,6
146867,China,43,0,Seo,3


In [95]:
best_ranforest.

1

In [53]:
pipeline.fit(X_train, y_train.values.ravel())

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['country', 'source']),
                                                 ('numericals',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                   

In [87]:
f1_score(y_train, pipeline.predict(X_train))

0.7808167285400408

In [56]:
f1_score(y_test, pipeline.predict(X_test))

0.7561627561627562

In [57]:
y_pred = pipeline.predict(X_test)

In [76]:
y_test['pred_converted'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [78]:
pd.concat([X_test[y_test['converted'] !=y_test['pred_converted']], y_test[y_test['converted'] !=y_test['pred_converted']] ], axis=1)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted,pred_converted
50423,UK,21,0,Ads,11,1,0
110983,UK,19,0,Ads,12,0,1
273519,UK,25,0,Ads,8,1,0
141605,US,36,1,Direct,15,1,0
144795,US,19,1,Direct,9,1,0
...,...,...,...,...,...,...,...
230597,US,20,0,Seo,7,1,0
100522,US,21,0,Direct,13,0,1
238654,US,29,0,Ads,10,1,0
215024,US,25,1,Seo,10,1,0


In [80]:
pipeline.named_steps['classifier'].feature_importances_

array([0.03479743, 0.0021287 , 0.00278626, 0.00364205, 0.00284125,
       0.0029898 , 0.00295471, 0.0479313 , 0.0441341 , 0.85579439])

In [81]:
cat_encoder_attribs = np.asarray(pipeline.named_steps['preprocessing'].named_transformers_['categoricals'].named_steps['onehot'].categories_, dtype=object)

In [82]:
cat_attribs = np.concatenate([cat_list for cat_list in cat_encoder_attribs])

In [83]:
all_features = np.concatenate([cat_attribs, num_features])
all_features

array(['China', 'Germany', 'UK', 'US', 'Ads', 'Direct', 'Seo', 'age',
       'new_user', 'total_pages_visited'], dtype=object)

In [85]:
sorted(zip(pipeline.named_steps['classifier'].feature_importances_, all_features), reverse=True)

[(0.8557943850933213, 'total_pages_visited'),
 (0.04793130130455542, 'age'),
 (0.04413410480637191, 'new_user'),
 (0.034797429550209175, 'China'),
 (0.003642054400293081, 'US'),
 (0.002989799811188505, 'Direct'),
 (0.0029547128387700126, 'Seo'),
 (0.002841251098920018, 'Ads'),
 (0.0027862603429871244, 'UK'),
 (0.0021287007533834766, 'Germany')]

In [46]:
from sklearn.model_selection import cross_val_score

In [86]:
pipeline.named_steps['classifier']

RandomForestClassifier(max_depth=10, max_features=5, n_estimators=400)