# Challenge : predict conversions 🏆🏆

### EDA
 - EDA

### PREPROCESSING
 - Preprocessing

### MODELS
 - Decision Tree
 - Adaboost Logistic Regression
 - Adaboost Decision Tree
 - XGBoost
 - SGDClassifier

### MAKE PREDICTIONS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import display

In [None]:
# Load the labelled training data for the conversion challenge
df = pd.read_csv(filepath_or_buffer='conversion_data_train.csv')

In [None]:
# Quick overview of the dataset: size, first rows, summary statistics
# and the share of missing values per column.
print("Number of rows : {}".format(len(df)))
print()

# First rows, to get a feel for the columns
print("Display of dataset: ")
display(df.head())
print()

# describe(include='all') covers both numeric and non-numeric columns
print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

# Missing values expressed as a percentage of the row count
print("Percentage of missing values: ")
display(100 * df.isna().sum() / len(df))

### EDA

In [None]:
# Histogram of user ages; bars are coloured by the age column itself,
# drawn from a single-colour palette.
fig = px.histogram(
    df,
    x='age',
    color=df['age'],
    title='Ages distribution',
    color_discrete_sequence=['lightcyan'],
)

# Centre the title, tighten the margins, hide axis titles, use the dark theme
fig.update_layout(
    title_x=0.5,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 50, 'pad': 4},
    xaxis_title='',
    yaxis_title='',
    template='plotly_dark',
)

# Fully transparent plot and paper backgrounds
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

- The majority of users are between 17 and 63 years old.

In [None]:
# Keep only the rows of users strictly younger than 63 years old
df = df.loc[df['age'] < 63]

In [None]:
# Count the rows for each value of `converted`: the dict keys are the class
# labels and the dict values are the corresponding row counts.
converted = dict(df.converted.groupby(df.converted).count())

# Pie chart of those counts (slice size = count, slice name/colour = label).
# NOTE(review): the colour map is keyed by the strings '0' and '1'; if
# `converted` holds integers these keys may never match — confirm.
fig = px.pie(converted.items(), values= converted.values(), names= converted.keys(), color= converted.keys(),
            title= "Proportion of subscribed newsletters",
             color_discrete_map={'0':'lightcyan',
                                 '1':'royalblue',
                                })
# Put the slice labels outside the pie and enlarge their font
fig.update_traces(textposition = 'outside', textfont_size = 15)             
# Centre the title, tighten the margins and use the dark theme
fig.update_layout(title_x = 0.5, 
                    margin=dict(l=50,r=50,b=50,t=50,pad=4), 
                    template = 'plotly_dark'
                    )   
fig.show()

- Only 3.23 % of users have subscribed to the newsletter.

In [None]:
# Conversion outcome on the x axis, split by the `new_user` flag.
# Bars are percent-normalised, drawn side by side and labelled with their value.
fig = px.histogram(
    df,
    x="converted",
    color='new_user',
    barmode='group',
    histnorm='percent',
    text_auto=True,
    width=800,
    height=600,
    title='Proportion of new users and the conversion rate',
    color_discrete_map={'1': 'lightcyan', '0': 'royalblue'},
)

# Centre the title, tighten the margins, hide axis titles, use the dark theme
fig.update_layout(
    title_x=0.5,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 50, 'pad': 4},
    xaxis_title='',
    yaxis_title='',
    template='plotly_dark',
)

# Fully transparent plot and paper backgrounds
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

- Only 1.40 % of __new users__ subscribed to the newsletter.

- The newsletter has been subscribed to mostly by __regular users__ (7.19%).

In [None]:
# Country on the x axis, split by conversion outcome.
# Bars are percent-normalised, drawn side by side and labelled with their value.
fig = px.histogram(
    df,
    x="country",
    color='converted',
    barmode='group',
    histnorm='percent',
    text_auto=True,
    width=800,
    height=600,
    title='Proportion of countries and the conversion rate',
    color_discrete_map={'1': 'lightcyan', '0': 'royalblue'},
)

# Centre the title, tighten the margins, hide axis titles, use the dark theme
fig.update_layout(
    title_x=0.5,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 50, 'pad': 4},
    xaxis_title='',
    yaxis_title='',
    template='plotly_dark',
)

# Fully transparent plot and paper backgrounds
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

- __US__ is the country with the highest ratio of subscribed newsletters.

- The second highest country is the __UK__. However, its _subscription rate is less than half_ that of the US.

- __China__ has the lowest ratio of subscriptions.

In [None]:
# Distribution of pages visited, split by conversion outcome, with a
# marginal box plot above the histogram.
fig = px.histogram(
    df,
    x="total_pages_visited",
    color="converted",
    marginal="box",  # "violin" and "rug" are the other options
    hover_data=df.columns,
    title='Proportion of visited pages and the conversion rate',
    color_discrete_map={'1': 'lightcyan', '0': 'royalblue'},
)

# Centre the title, tighten the margins, hide axis titles, use the dark theme
fig.update_layout(
    title_x=0.5,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 50, 'pad': 4},
    xaxis_title='',
    yaxis_title='',
    template='plotly_dark',
)

# Fully transparent plot and paper backgrounds
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

- Users __who do not subscribe__ visit between 1 and 16 pages. Most of them visit up to 6 pages before deciding not to subscribe.

- Users __who subscribe__ consult up to 27 pages, and most of them visit at least 7
 pages before subscribing.

In [None]:
# Traffic source on the x axis, split by conversion outcome
fig = px.histogram(
    df,
    x="source",
    color="converted",
    title='Proportion of sources and the conversion rate',
    color_discrete_map={'0': 'lightcyan', '1': 'royalblue'},
)

# Centre the title, tighten the margins, hide axis titles, use the dark theme
fig.update_layout(
    title_x=0.5,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 50, 'pad': 4},
    xaxis_title='',
    yaxis_title='',
    template='plotly_dark',
)

# Fully transparent plot and paper backgrounds
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

- Most users reach the website via __"Search Engine Optimization"__ (SEO).

- The ratio between sources and the conversion rate is the same in all three cases.

### PREPROCESSING

In [None]:
# Split the frame into the label column (Y) and everything else (X)
target_name = 'converted'

print("Separating labels from features...")
Y = df[target_name]
X = df.drop(columns=target_name)  # every other column is kept as a feature

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the split is stratified on Y and
# seeded so that it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)

In [None]:
from sklearn.pipeline import Pipeline

# Positional indices (not names) of the numeric columns in X_train/X_test
# presumably `age` and `total_pages_visited` — confirm against the column order
numeric_features = [1, 4]

# Numeric columns: fill missing values with the column median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Preprocessing pipeline for the categorical features
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    LabelEncoder,
)

# Positional indices (not names) of the categorical columns in X_train/X_test
categorical_features = [0, 2, 3]

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode, dropping the first level of each feature so the dummy
# columns are not perfectly collinear.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

In [None]:
# Route each group of columns to its own preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Train set: learn the preprocessing statistics and apply them in one step
X_train = preprocessor.fit_transform(X_train)
print('...Done.')

# Test set: only apply what was learned on the train set (no re-fitting)
X_test = preprocessor.transform(X_test)
print('...Done.')

# MODELS

- ### Decision Tree

In [None]:
# Bagging of decision trees, tuned with a cross-validated grid search
print("Grid search...")
decision_tree = DecisionTreeClassifier() # base learner wrapped by the ensemble
model_dt = BaggingClassifier(decision_tree)

# scikit-learn 1.2 renamed the wrapped-model parameter from `base_estimator`
# to `estimator` and 1.4 removed the old name. Pick whichever name the
# installed version exposes so the nested parameter keys below stay valid.
inner = 'estimator' if 'estimator' in model_dt.get_params(deep=False) else 'base_estimator'

# Grid of values to be tested
# Only the best hyperparameters from a first, wider grid search are kept to keep the run fast
params = {
    inner + '__max_depth': [8],
    inner + '__min_samples_leaf': [5],
    inner + '__min_samples_split': [4],
    'n_estimators': [80]
}
gridsearch = GridSearchCV(model_dt, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train) # fit on the training split only
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
# No `scoring` is passed, so the score is the classifier's default (accuracy)
print("Best validation accuracy : ", gridsearch.best_score_)
# Predict on both splits and report the f1-score of each
grid_train_pred = gridsearch.predict(X_train)
grid_test_pred = gridsearch.predict(X_test)
print("f1-score on train set : ", f1_score(Y_train, grid_train_pred))
print("f1-score on test set : ", f1_score(Y_test, grid_test_pred))

In [None]:
# Predictions of the bagging model on both splits
grid_train_pred = gridsearch.predict(X_train)
grid_test_pred = gridsearch.predict(X_test)

# f1-score of each split
score_test = f1_score(Y_test, grid_test_pred)
score_train = f1_score(Y_train, grid_train_pred)

# Start the score table: one row per (model, split) pair, test row first
data = [
    {'model': 'bagging_df', 'f1_score': score, 'set': split}
    for split, score in (('test', score_test), ('train', score_train))
]
df_score = pd.DataFrame(data)
df_score

- ### Adaboost Logistic Regression

In [None]:
# AdaBoost over logistic regression, tuned with a cross-validated grid search
print("Grid search...")
logistic_regression = LogisticRegression(max_iter = 1000) # max_iter raised because of a convergence warning
model2 = AdaBoostClassifier(logistic_regression)

# scikit-learn 1.2 renamed the wrapped-model parameter from `base_estimator`
# to `estimator` and 1.4 removed the old name. Pick whichever name the
# installed version exposes so the nested parameter keys below stay valid.
inner = 'estimator' if 'estimator' in model2.get_params(deep=False) else 'base_estimator'

# Grid of values to be tested
params = {
    inner + '__C': [5.0], # nested prefix because C is a parameter of LogisticRegression
    'n_estimators': [60] # n_estimators is a hyperparameter of the ensemble method
}
gridsearch3 = GridSearchCV(model2, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch3.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch3.best_params_)
# No `scoring` is passed, so the score is the classifier's default (accuracy)
print("Best validation accuracy : ", gridsearch3.best_score_)
print()
# Predict on both splits and report the f1-score of each
grid_train_pred3 = gridsearch3.predict(X_train)
grid_test_pred3 = gridsearch3.predict(X_test)
print("f1-score on train set : ", f1_score(Y_train, grid_train_pred3))
print("f1-score on test set : ", f1_score(Y_test, grid_test_pred3))

In [None]:
# Add the AdaBoost logistic regression scores to the score table.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# table on both old and new pandas versions.
adaboost_lr_scores = pd.DataFrame([
    {'model': 'adaboost_lr', 'f1_score': f1_score(Y_test, grid_test_pred3), 'set': 'test'},
    {'model': 'adaboost_lr', 'f1_score': f1_score(Y_train, grid_train_pred3), 'set': 'train'},
])
df_score = pd.concat([df_score, adaboost_lr_scores], ignore_index=True)
df_score

- ### Adaboost Decision Tree

In [None]:
# AdaBoost over decision trees, tuned with a cross-validated grid search
print("Grid search...")
decision_tree = DecisionTreeClassifier()
model3 = AdaBoostClassifier(decision_tree)

# scikit-learn 1.2 renamed the wrapped-model parameter from `base_estimator`
# to `estimator` and 1.4 removed the old name. Pick whichever name the
# installed version exposes so the nested parameter keys below stay valid.
inner = 'estimator' if 'estimator' in model3.get_params(deep=False) else 'base_estimator'

# Grid of values to be tested
params = {
    inner + '__max_depth': [8],
    inner + '__min_samples_leaf': [6],
    inner + '__min_samples_split': [4],
    'n_estimators': [60]
}
print(params)
gridsearch4 = GridSearchCV(model3, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch4.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch4.best_params_)
# No `scoring` is passed, so the score is the classifier's default (accuracy)
print("Best validation accuracy : ", gridsearch4.best_score_)

# Predict on both splits and report the f1-score of each
grid_train_pred4 = gridsearch4.predict(X_train)
grid_test_pred4 = gridsearch4.predict(X_test)
print("f1-score on train set : ", f1_score(Y_train, grid_train_pred4))
print("f1-score on test set : ", f1_score(Y_test, grid_test_pred4))

In [None]:
# Add the AdaBoost decision tree scores to the score table.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# table on both old and new pandas versions.
adaboost_dt_scores = pd.DataFrame([
    {'model': 'adaboost_dt', 'f1_score': f1_score(Y_test, grid_test_pred4), 'set': 'test'},
    {'model': 'adaboost_dt', 'f1_score': f1_score(Y_train, grid_train_pred4), 'set': 'train'},
])
df_score = pd.concat([df_score, adaboost_dt_scores], ignore_index=True)
df_score

- ### XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost), tuned with a cross-validated grid search
print("Grid search...")
xgboost = XGBClassifier()

# Candidate values for each hyperparameter:
#   max_depth        - same role as in scikit-learn
#   min_child_weight - roughly comparable to min_samples_leaf / min_samples_split
#   n_estimators     - same role as in scikit-learn
params = dict(
    max_depth=[8],
    min_child_weight=[6],
    n_estimators=[60],
)
print(params)

# 3-fold cross-validation over the grid
gridsearch6 = GridSearchCV(xgboost, param_grid=params, cv=3)
gridsearch6.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch6.best_params_)
print("Best validation accuracy : ", gridsearch6.best_score_)

# Predict on both splits and report the f1-score of each
grid_train_pred6 = gridsearch6.predict(X_train)
grid_test_pred6 = gridsearch6.predict(X_test)
print("f1-score on train set : ", f1_score(Y_train, grid_train_pred6))
print("f1-score on test set : ", f1_score(Y_test, grid_test_pred6))

In [None]:
# Add the XGBoost scores to the score table.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# table on both old and new pandas versions.
xgboost_scores = pd.DataFrame([
    {'model': 'xgboost', 'f1_score': f1_score(Y_test, grid_test_pred6), 'set': 'test'},
    {'model': 'xgboost', 'f1_score': f1_score(Y_train, grid_train_pred6), 'set': 'train'},
])
df_score = pd.concat([df_score, xgboost_scores], ignore_index=True)
df_score

- ### SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier


# Linear classifier trained with stochastic gradient descent, tuned by grid search
print("Grid search...")
classifier = SGDClassifier()

# Grid of values to be tested.
# `verbose` is intentionally NOT searched: it only controls how much is logged
# during fitting, so putting it in the grid multiplied the number of fits
# without changing any model, and the former value 0.5 is not a valid
# (integer) verbosity level.
params = {
    'alpha' : [0.000000001], # Constant that multiplies the regularization term
    'learning_rate' : ['optimal'], # t0 is chosen by a heuristic proposed by Leon Bottou
    'average' : [True], # computes the averaged SGD weights across all updates and stores the result in the coef_ attribute
    'penalty' : ['l2'], # It's the standard regularizer for linear SVM models
    # Epsilon in the epsilon-insensitive loss functions
    # NOTE(review): per the scikit-learn docs this applies only to the huber /
    # epsilon-insensitive losses, so it likely has no effect with the default loss — confirm
    'epsilon' : [1.5],
    'max_iter' : [100000], # The maximum number of passes over the training data (epochs)
    'early_stopping' : [True], # It terminates training when validation score is not improving
    'random_state': [2, 4, 6, 8, 10] # seeds tried; the best-scoring one is retained
}

gridsearch7 = GridSearchCV(classifier, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch7.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch7.best_params_)
# No `scoring` is passed, so the score is the classifier's default (accuracy)
print("Best validation accuracy : ", gridsearch7.best_score_)

# Predict on both splits and report the f1-score of each
grid_train_pred7 = gridsearch7.predict(X_train)
grid_test_pred7 = gridsearch7.predict(X_test)
print("f1-score on train set : ", f1_score(Y_train, grid_train_pred7))
print("f1-score on test set : ", f1_score(Y_test, grid_test_pred7))

In [None]:
# Add the SGDClassifier scores to the score table.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# table on both old and new pandas versions.
sgd_scores = pd.DataFrame([
    {'model': 'SGDClassifier', 'f1_score': f1_score(Y_test, grid_test_pred7), 'set': 'test'},
    {'model': 'SGDClassifier', 'f1_score': f1_score(Y_train, grid_train_pred7), 'set': 'train'},
])
df_score = pd.concat([df_score, sgd_scores], ignore_index=True)
df_score

In [None]:
# Keep only the test-set rows of the score table and rank them from the
# highest f1-score to the lowest.
df_score = df_score.loc[df_score['set'] == 'test'].sort_values(
    by=['f1_score', 'set'],
    ascending=False,
)
df_score

- The __SGD Classifier__ is the model with the highest score on the test set. We'll pick it to make our predictions.

# Make predictions

In [None]:
# Stack the preprocessed train and test splits back together so the selected
# classifier can be trained on every labelled row.
X = np.concatenate((X_train, X_test), axis=0)
Y = np.concatenate((np.ravel(Y_train), np.ravel(Y_test)))

# Re-run the SGD grid search on the full labelled set
pred = gridsearch7.fit(X, Y)

In [None]:
# Load the unlabelled data to predict on
data_without_labels = pd.read_csv('conversion_data_test.csv')

# Warning: this list must match the features (and their order) used to train
# the selected classifier.
features_list = ['country', 'age', 'new_user', 'source', 'total_pages_visited']

# Select those columns and hand scikit-learn a plain numpy array
X_without_labels = data_without_labels[features_list].values

In [None]:
# Apply the preprocessing fitted on the training data (no re-fitting)
print("Encoding categorical features and standardizing numerical features...")
# transform X_without_labels features
X_without_labels = preprocessor.transform(X_without_labels)

# Predict with `gridsearch7`, the SGDClassifier search that was selected and
# refit on all labelled data. The bagging model `gridsearch` was used here
# before, which discarded both the model selection and the final refit.
data = {
    'converted': gridsearch7.predict(X_without_labels)
}
# create a new dataset with predictions
Y_predictions = pd.DataFrame(columns=['converted'],data=data)
# save predictions in a new csv file (without the index column)
Y_predictions.to_csv('conversion_data_test_final_predictions_Nico.csv', index=False)