## 1. Import the relevant libraries.

In [None]:
import pickle

import pandas as pd
import missingno as msno
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report,log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

In [None]:
!pip install xgboost

## Read and preprocess data

In [None]:
# Read the labelled dataset from disk and print its column summary.
training_csv = "FinalTraining.csv"
train_df = pd.read_csv(training_csv)
train_df.info()

In [None]:
# Zero-filling of missing values is switched off: the statement below is
# commented out, so train_df is passed on exactly as it was loaded.
#train_df.fillna(0, inplace=True)

## Divide the dataset into train and validation DataFrames.

In [None]:
# Hold out 20% of the rows for validation. The seed is fixed so the split, and
# every metric computed from it, is identical after Restart & Run All.
train, val = train_test_split(train_df, test_size=0.20, random_state=12)

# Separate the feature columns from the 'Classes' target for each partition.
X_train = train.drop(columns=['Classes'])
y_train = train['Classes'].values

x_val = val.drop(columns=['Classes'])
y_val = val['Classes'].values

In [None]:
# (rows, columns) of the training feature frame.
X_train.shape

In [None]:
# (rows, columns) of the validation feature frame.
x_val.shape

In [None]:
# Preview the first rows of the training features rather than rendering the whole frame.
X_train.head()

In [None]:
# Preview the first rows of the validation features rather than rendering the whole frame.
x_val.head()

## Train a Random Forest model with the following hyperparameters:

In [None]:
# Random Forest hyperparameters. The seed makes the fitted forest (and its
# metrics) repeatable; n_jobs=-1 only spreads the work across CPU cores.
rf_params = {
    'n_estimators': 1000,
    'max_features': 'sqrt',
    'min_samples_leaf': 10,
    'random_state': 11,
    'n_jobs': -1,
}

rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Hard class predictions on both partitions.
rf_preds_train = rf.predict(X_train)
rf_preds_val = rf.predict(x_val)

## Train a HistGradientBoosting model with the following hyperparameters:

In [None]:
# The model keeps its library-default hyperparameters, exactly as before.
# The dict that used to be defined in this cell was never passed to the
# estimator and listed options ('n_estimators', max_features='sqrt') that
# HistGradientBoostingClassifier does not accept, so it has been removed.
# Only a seed is supplied, to make the fit repeatable.
hist_params = {
    'random_state': 11,
}

hist = HistGradientBoostingClassifier(**hist_params)
hist.fit(X_train, y_train)

# Predict on the same DataFrames used for fitting: handing `.values` to a model
# fitted on a DataFrame makes scikit-learn warn about missing feature names.
hist_preds_train = hist.predict(X_train)
hist_preds_val = hist.predict(x_val)

## Train a gradient boosting model with the following hyperparameters:

In [None]:
# Gradient Boosting hyperparameters; the seed makes the fit repeatable.
gbr_params = {
    'n_estimators': 1000,
    'max_features': 'sqrt',
    'min_samples_leaf': 10,
    'random_state': 11,
}

gbr = GradientBoostingClassifier(**gbr_params)
gbr.fit(X_train, y_train)

# Predict on the same DataFrames used for fitting: handing `.values` to a model
# fitted on a DataFrame makes scikit-learn warn about missing feature names.
gbr_preds_train = gbr.predict(X_train)
gbr_preds_val = gbr.predict(x_val)

## Train an XGB model with the following hyperparameters:

In [None]:
# XGBoost hyperparameters; the seed makes the fit repeatable.
xgb_params = {
    'n_estimators': 1000,
    'random_state': 11,
}

xgb = XGBClassifier(**xgb_params)

# Fit on the DataFrame (not `.values`) so the model sees the same input type
# here as in the evaluation, ensemble and scoring cells, which all pass DataFrames.
xgb.fit(X_train, y_train)
xgb_preds_train = xgb.predict(X_train)
xgb_preds_val = xgb.predict(x_val)



## Train an AdaBoost model with the following hyperparameters:

In [None]:
# AdaBoost hyperparameters; the seed makes the fit repeatable.
abr_params = {
    'n_estimators': 1000,
    'random_state': 11,
}

abr = AdaBoostClassifier(**abr_params)
abr.fit(X_train, y_train)

# Hard class predictions on both partitions.
abr_preds_train = abr.predict(X_train)
abr_preds_val = abr.predict(x_val)

In [None]:
# Soft-voting ensemble: averages the class probabilities of its member models.
# VotingClassifier fits fresh clones of the listed estimators, so the
# individually trained rf / xgb / gbr / abr objects are left untouched.
# NOTE(review): `hist` is not a member of the ensemble — confirm this is intended.
# (log_loss and VotingClassifier are imported in the import cell at the top.)
final_model = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('gbr', gbr), ('abr', abr)],
    voting='soft',
)

# Fit every member on the training partition.
final_model.fit(X_train, y_train)

# Class-probability predictions for the validation partition.
pred_final = final_model.predict_proba(x_val)

# Log loss on the validation partition. The label order is passed explicitly so
# the probability columns stay aligned even if a class is absent from y_val.
print(log_loss(y_val, pred_final, labels=final_model.classes_))

# MODEL EVALUATION

## Confusion Matrix

In [None]:
#Random Forest Classifier
# Predict each partition once and reuse the result for every metric below.
rf_preds_train = rf.predict(X_train)
rf_preds_val = rf.predict(x_val)
rf_predicted = rf_preds_val  # validation predictions under the name used elsewhere

rf_conf_matrix = confusion_matrix(y_val, rf_predicted)
rf_acc_score = accuracy_score(y_val, rf_predicted)
print("confusion matrix")
print(rf_conf_matrix)
print("\n")
print("Accuracy of Random Forest:",rf_acc_score*100,'\n')
print(classification_report(y_val, rf_predicted))

# Train-vs-validation accuracy, to expose over-fitting.
print('Random Forest:\n> Accuracy on training data = {:.4f}\n> Accuracy on validation data = {:.4f}'.format(
    accuracy_score(y_true=y_train, y_pred=rf_preds_train),
    accuracy_score(y_true=y_val, y_pred=rf_preds_val)
))

In [None]:
#XGB Classifier
# Predict each partition once and reuse the result for every metric below.
xgb_preds_train = xgb.predict(X_train)
xgb_preds_val = xgb.predict(x_val)
xgb_predicted = xgb_preds_val  # validation predictions under the name used elsewhere

xgb_conf_matrix = confusion_matrix(y_val, xgb_predicted)
xgb_acc_score = accuracy_score(y_val, xgb_predicted)
print("confusion matrix")
print(xgb_conf_matrix)
print("\n")
print("Accuracy of XGB:",xgb_acc_score*100,'\n')
print(classification_report(y_val, xgb_predicted))

# Train-vs-validation accuracy, to expose over-fitting.
print('XGB:\n> Accuracy on training data = {:.4f}\n> Accuracy on validation data = {:.4f}'.format(
    accuracy_score(y_true=y_train, y_pred=xgb_preds_train),
    accuracy_score(y_true=y_val, y_pred=xgb_preds_val)
))


In [None]:
#Gradient Boosting Classifier
# The model is evaluated as trained above. It is deliberately NOT re-fitted
# here: a second fit would make the confusion matrix and the accuracy summary
# describe two different models (and repeat a 1000-estimator training run).
gbr_preds_train = gbr.predict(X_train)
gbr_preds_val = gbr.predict(x_val)
gbr_predicted = gbr_preds_val  # validation predictions under the name used elsewhere

gbr_conf_matrix = confusion_matrix(y_val, gbr_predicted)
gbr_acc_score = accuracy_score(y_val, gbr_predicted)
print("confusion matrix")
print(gbr_conf_matrix)
print("\n")
print("Accuracy of Gradient Boosting:",gbr_acc_score*100,'\n')
print(classification_report(y_val, gbr_predicted))

# Train-vs-validation accuracy, to expose over-fitting.
print('Gradient Boosting Classifier:\n> Accuracy on training data = {:.4f}\n> Accuracy on validation data = {:.4f}'.format(
    accuracy_score(y_true=y_train, y_pred=gbr_preds_train),
    accuracy_score(y_true=y_val, y_pred=gbr_preds_val)
))

In [None]:
#AdaBoost Classifier
# The model is evaluated as trained above. It is deliberately NOT re-fitted
# here: a second fit would make the confusion matrix and the accuracy summary
# describe two different models (and repeat a 1000-estimator training run).
abr_preds_train = abr.predict(X_train)
abr_preds_val = abr.predict(x_val)
abr_predicted = abr_preds_val  # validation predictions under the name used elsewhere

abr_conf_matrix = confusion_matrix(y_val, abr_predicted)
abr_acc_score = accuracy_score(y_val, abr_predicted)
print("confusion matrix")
print(abr_conf_matrix)
print("\n")
print("Accuracy of AdaBoost:",abr_acc_score*100,'\n')
print(classification_report(y_val,abr_predicted))

# Train-vs-validation accuracy, to expose over-fitting.
print('AdaBoost:\n> Accuracy on training data = {:.4f}\n> Accuracy on validation data = {:.4f}'.format(
    accuracy_score(y_true=y_train, y_pred=abr_preds_train),
    accuracy_score(y_true=y_val, y_pred=abr_preds_val)
))

In [None]:
#HistGradientBoost Classifier
# The model is evaluated as trained above. It is deliberately NOT re-fitted
# here: a second fit would make the confusion matrix and the accuracy summary
# describe two different models.
hist_preds_train = hist.predict(X_train)
hist_preds_val = hist.predict(x_val)
hist_predicted = hist_preds_val  # validation predictions under the name used elsewhere

hist_conf_matrix = confusion_matrix(y_val, hist_predicted)
hist_acc_score = accuracy_score(y_val, hist_predicted)
print("confusion matrix")
print(hist_conf_matrix)
print("\n")
print("Accuracy of HistGradientBoost:",hist_acc_score*100,'\n')
print(classification_report(y_val,hist_predicted))

# Train-vs-validation accuracy, to expose over-fitting.
print('Hist:\n> Accuracy on training data = {:.4f}\n> Accuracy on validation data = {:.4f}'.format(
    accuracy_score(y_true=y_train, y_pred=hist_preds_train),
    accuracy_score(y_true=y_val, y_pred=hist_preds_val)
))

## ROC Curve

In [None]:
# Class treated as "positive" when drawing the ROC curves.
POS_LABEL = 2


def _positive_class_scores(model, features, pos_label=POS_LABEL):
    """Return the model's predicted probability of `pos_label` for each row.

    ROC curves need a continuous score. Feeding hard class predictions to
    roc_curve yields a single operating point joined to the corners, so the
    probability column belonging to `pos_label` is used instead.

    Raises ValueError if `pos_label` is not one of `model.classes_`.
    """
    column = list(model.classes_).index(pos_label)
    return model.predict_proba(features)[:, column]


rf_fpr, rf_tpr, rf_threshold = roc_curve(
    y_val, _positive_class_scores(rf, x_val), pos_label=POS_LABEL)
xgb_fpr, xgb_tpr, xgb_threshold = roc_curve(
    y_val, _positive_class_scores(xgb, x_val), pos_label=POS_LABEL)
abr_fpr, abr_tpr, abr_threshold = roc_curve(
    y_val, _positive_class_scores(abr, x_val), pos_label=POS_LABEL)
gbr_fpr, gbr_tpr, gbr_threshold = roc_curve(
    y_val, _positive_class_scores(gbr, x_val), pos_label=POS_LABEL)
hist_fpr, hist_tpr, hist_threshold = roc_curve(
    y_val, _positive_class_scores(hist, x_val), pos_label=POS_LABEL)

sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Receiver Operating Characteristic Curve')
ax.plot(rf_fpr, rf_tpr, label='Random Forest Classifier')
ax.plot(xgb_fpr, xgb_tpr, label='XGB Classifier')
ax.plot(gbr_fpr, gbr_tpr, label='Gradient Boosting Classifier')
ax.plot(abr_fpr, abr_tpr, label='AdaBoost Classifier')
ax.plot(hist_fpr, hist_tpr, label='HistGradientBoosting Classifier')
# Reference lines: the chance diagonal, then the left and top edges of the unit square.
ax.plot([0, 1], [0, 1], ls='--')
ax.plot([0, 0], [1, 0], c='.5')
ax.plot([0, 1], [1, 1], c='.5')
ax.set_ylabel('True positive rate')
ax.set_xlabel('False positive rate')
ax.legend()
plt.show()

# Testing final model on validation dataset

In [None]:
# Load the data used for the final scoring cells below and print its column summary.
# NOTE(review): this reads "FinalTraining.csv" — the same file the train/validation
# split above was drawn from — so scores computed on it include rows the models
# were fitted on. Point this at the held-out file if a separate one exists.
val_df = pd.read_csv("FinalTraining.csv")
val_df.info()

In [None]:
# Split the reloaded frame into the target array and the remaining feature columns.
target_column = 'Classes'
answer_data = val_df[target_column].values
test_data = val_df.drop(columns=[target_column])

In [None]:
# Preview the first rows of the scoring features rather than rendering the whole frame.
test_data.head()

In [None]:
# Target values ('Classes' column) matching the rows of test_data.
answer_data

In [None]:
# Persist the fitted XGB model to disk, then reload it to verify the round trip.
# (pickle is imported in the import cell at the top of the notebook.)
pkl_filename = "FinalXGB.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(xgb, file)

# Load from file.
# NOTE: unpickling executes code stored in the file — only load pickle files
# that were produced by a trusted source such as the dump just above.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

# Calculate the accuracy score and predict target values
score = pickle_model.score(test_data, answer_data)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(test_data)


In [None]:
# Score of the Random Forest model on test_data against answer_data.
rf.score(test_data, answer_data)

In [None]:
# Score of the XGB model on test_data against answer_data.
xgb.score(test_data, answer_data)

In [None]:
# Score of the AdaBoost model on test_data against answer_data.
abr.score(test_data, answer_data)

In [None]:
# Score of the Gradient Boosting model on test_data against answer_data.
gbr.score(test_data, answer_data)

In [None]:
# Score of the HistGradientBoosting model on test_data against answer_data.
hist.score(test_data, answer_data)