<h1 align="center">MSIN0114: Business Analytics Consulting Project</h1>
<h2 align="center">Profitability of Client X projects: run 4</h2>

## Notebook Setup

In [1]:
# Essentials
import pandas as pd
from pandas import Series, DataFrame
from pandas.api.types import CategoricalDtype
pd.options.display.max_columns = None
import sqlite3
import pyodbc
import numpy as np; np.random.seed(1)

# Image creation and display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
from matplotlib import pyplot
import plotly.express as px
import plotly.graph_objects as go
from matplotlib.ticker import FuncFormatter
from yellowbrick.model_selection import FeatureImportances

# Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 

from sklearn.decomposition import PCA

# Metrics of accuracy
from numpy import mean
from numpy import std
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from pycm import *
import imbalanced_ensemble as imbens
from imbalanced_ensemble.ensemble.base import sort_dict_by_key
from collections import Counter

# Fine-tuning and enseble learning
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV

# Other
import itertools as it
import io
import os
os.sys.path
import sys
import glob
import concurrent.futures
from __future__ import print_function
import binascii
import struct
from PIL import Image
import scipy
import scipy.misc
import scipy.cluster
import datetime, time
import functools, operator
from datetime import datetime
from numpy.random import seed
from numpy.random import randn
from numpy import percentile

In [2]:
df = pd.read_csv('csv-files/resampled_compact_data.csv')

## Data splitting and training

In [3]:
Y = df[['Profit_Class']]
X = df.drop(columns = ['Rec_Class', 'Profit_Class'])
X1 = pd.DataFrame(preprocessing.normalize(X))
X1.columns = X.columns
X = X1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=1, stratify = Y)


In [4]:
# Logistic regression
log = LogisticRegression(random_state = 1, max_iter = 30000)
log.fit(X_train, Y_train.values.ravel())
log_y_pred=log.predict(X_test)

# Ridge regression
rdg = RidgeClassifier(alpha=1.0, random_state = 1, max_iter = 30000)
rdg.fit(X_train, Y_train.values.ravel())
rdg_y_pred=rdg.predict(X_test)

# k-Neighbours
np.random.seed(1)
knn_100 = KNeighborsClassifier(n_neighbors=100)
knn_100.fit(X_train, Y_train.values.ravel())
knn_100_y_pred = knn_100.predict(X_test)

# Decision tree classifier
dtc = DecisionTreeClassifier(random_state = 1)
dtc = dtc.fit(X_train, Y_train.values.ravel())
dtc_y_pred = dtc.predict(X_test)

# Tuned random forest classifier
rfc_tuned = RandomForestClassifier(max_depth=60, min_samples_leaf=4, min_samples_split=10,
                       n_estimators=1788, random_state=1)
rfc_tuned.fit(X_train, Y_train.values.ravel())

# XGBoost classifier
xgbc = XGBClassifier(n_estimators=100, learning_rate=0.05, booster='gbtree', random_state = 1, eval_metric='mlogloss', objective='binary:logistic', use_label_encoder=False)
xgbc.fit(X_train, Y_train.values.ravel())
xgbc_y_pred=xgbc.predict(X_test)

# Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, Y_train.values.ravel())
gnb_y_pred = gnb.predict(X_test)

# Linear discriminant analysis
lda = LinearDiscriminantAnalysis(n_components = 1)
lda.fit(X_train, Y_train.values.ravel())
lda_y_pred = lda.predict(X_test)

# Quadratic discriminant analysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, Y_train.values.ravel())
qda_y_pred = qda.predict(X_test)

# Tuned support vector machine
svm_tuned = SVC(kernel='rbf', C = 2, gamma = 1, random_state = 1, probability=True)
svm_tuned.fit(X_train, Y_train.values.ravel())
svm_y_pred = svm_tuned.predict(X_test)

# Soft voting classifier
soft_voting = VotingClassifier(
    estimators=[('xgbc', xgbc), ('rfc_t', rfc_tuned)],
    voting='soft')
soft_voting.fit(X_train, Y_train.values.ravel())
sv_y_pred = soft_voting.predict(X_test)

## Stacking

#### 8.2.7  <a class="anchor" id="8_2_7"></a> Top 3 models

See results for **SV** in pr_run_1, **base XGBC** in pr_run_2.

In [6]:
# Get a stacking ensemble of models
def get_stacking():
	# Define the base models
	level3 = list()
	level3.append(('rfc_t', rfc_tuned))
	level3.append(('xgbc', xgbc))
	level3.append(('sv', soft_voting))

	# Define the stacking ensemble
	model = StackingClassifier(estimators=level3, final_estimator=rfc_tuned, cv=5)
	return model

# Define the base models separately
level3 = list()
level3.append(('xgbc', xgbc))
level3.append(('sv', soft_voting))
level3.append(('stacking', get_stacking()))

In [7]:
# Define the model
stack3_rfc_t = StackingClassifier(estimators=level3, final_estimator=rfc_tuned, cv=5)

# Fit the model on all available data
stack3_rfc_t = stack3_rfc_t.fit(X, Y.values.ravel())

# Predict the response for test set
stack3_rfc_t_y_pred = stack3_rfc_t.predict(X_test)

In [8]:
# Accuracy measures
print('Accuracy score with 3 models learnt on tuned RFC: ' + str(round(metrics.accuracy_score(Y_test, np.round(stack3_rfc_t_y_pred)), 3)*100)+'%')
print('Recall score  with 3 models learnt on tuned RFC: ' + str(round(metrics.recall_score(Y_test, np.round(stack3_rfc_t_y_pred), average='weighted'), 3)*100)+'%')
print('Precision score  with 3 models learnt on tuned RFC: ' + str(round(metrics.precision_score(Y_test, np.round(stack3_rfc_t_y_pred), average='weighted', zero_division=1), 3)*100)+'%')
print('F1 score with 3 models learnt on stuned RFC: ' + str(round(metrics.f1_score(Y_test, np.round(stack3_rfc_t_y_pred), average='weighted'), 3)*100)+'%')

Accuracy score with 3 models learnt on tuned RFC: 81.39999999999999%
Recall score  with 3 models learnt on tuned RFC: 81.39999999999999%
Precision score  with 3 models learnt on tuned RFC: 81.69999999999999%
F1 score with 3 models learnt on stuned RFC: 81.39999999999999%


#### 8.2.8  <a class="anchor" id="8_2_8"></a> Top 2 models

See results for **SV with tuned RFC** in pr_run_1, **SV with XGBC** in pr_run_2, **base XGBC** in pr_run_3, **SV with tuned SVM** in pr_run_5.

In [9]:
def get_stacking():
	# Define the base models
	level2 = list()
	level2.append(('rfc_t', rfc_tuned))
	level2.append(('sv', soft_voting))

	# Define the stacking ensemble
	model = StackingClassifier(estimators=level2, final_estimator=rfc_tuned, cv=5)
	return model

# Define the base models separately
level2 = list()
level2.append(('rfc_t', rfc_tuned))
level2.append(('sv', soft_voting))
level2.append(('stacking', get_stacking()))

In [10]:
# Define the model
stack2_rfc_t = StackingClassifier(estimators=level2, final_estimator=rfc_tuned, cv=5)

# Fit the model on all available data
stack2_rfc_t = stack2_rfc_t.fit(X, Y.values.ravel())

# Predict the response for test set
stack2_rfc_t_y_pred = stack2_rfc_t.predict(X_test)

In [12]:
# Accuracy measures
print('Accuracy score with 2 models learnt on tuned RFC: ' + str(round(metrics.accuracy_score(Y_test, np.round(stack2_rfc_t_y_pred)), 3)*100)+'%')
print('Recall score  with 2 models learnt on tuned RFC: ' + str(round(metrics.recall_score(Y_test, np.round(stack2_rfc_t_y_pred), average='weighted'), 3)*100)+'%')
print('Precision score  with 2 models learnt on tuned RFC: ' + str(round(metrics.precision_score(Y_test, np.round(stack2_rfc_t_y_pred), average='weighted', zero_division=1), 3)*100)+'%')
print('F1 score with 2 models learnt on tuned RFC: ' + str(round(metrics.f1_score(Y_test, np.round(stack2_rfc_t_y_pred), average='weighted'), 3)*100)+'%')

Accuracy score with 2 models learnt on tuned RFC: 79.4%
Recall score  with 2 models learnt on tuned RFC: 79.4%
Precision score  with 2 models learnt on tuned RFC: 79.60000000000001%
F1 score with 2 models learnt on tuned RFC: 79.4%
