### Data Preparation

#### Importing clean data

In [1]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from matplotlib import pyplot
from numpy import where
from sklearn import metrics
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Inject a CSS rule into the notebook page that sets the `.container` element to 100% width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
def classification_report_csv(report,file):
    """Save a classification report as a CSV file and print the saved table.

    Parameters
    ----------
    report : dict
        Data accepted by ``pd.DataFrame`` (the notebook passes the result of
        ``classification_report(..., output_dict=True)``). The frame is
        transposed before it is written.
    file : str
        Base name of the output file, without extension. The file is written
        to ``../../Output_Matrics/Modelling/<file>.csv`` relative to the
        current working directory; an existing file of that name is replaced.

    Returns
    -------
    None
    """
    import os  # local import so the helper does not depend on the notebook's import cell

    out_dir = '../../Output_Matrics/Modelling/'
    # DataFrame.to_csv does not create missing parent folders, so make sure
    # the output directory exists before writing.
    os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(report).transpose()
    filename = out_dir+file+'.csv'
    df.to_csv(filename, index=True)
    # Print what was actually written by reading the file back.
    print(pd.read_csv(filename, index_col=0))

In [None]:
def chart_png(plt,file):
    """Save a figure as a 300-dpi PNG in the modelling charts folder.

    Parameters
    ----------
    plt : object
        Object exposing ``savefig(filename, dpi=...)``; the notebook passes
        the ``matplotlib.pyplot`` module.
    file : str
        Base name of the output file, without extension. The image is written
        to ``../../Output_Charts/Modelling/<file>.png`` relative to the
        current working directory.

    Returns
    -------
    None
    """
    import os  # local import so the helper does not depend on the notebook's import cell

    out_dir = '../../Output_Charts/Modelling/'
    # Create the target folder first so saving does not fail on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    filename = out_dir+file+'.png'
    plt.savefig(filename, dpi=300)

In [None]:
# Read the cleaned market data from CSV into a DataFrame.
market_data = pd.read_csv("../../Input_Data/Modelling/clean_market_data.csv")

# List the available columns, then show how many rows fall into each value
# of the Response column.
print("Market_data - Columns")
print(market_data.columns)

print("Market_data - Row counts")
response_counts = market_data["Response"].value_counts()
print(response_counts)

#### Splitting the data to train and test

In [None]:
# Target: the Response column.
market_data_Y = market_data["Response"]
# Features: every column except ID, Country and the target itself.
market_data_X = market_data.drop(columns=["ID", "Country", "Response"])

print("Market Data X")
display(market_data_X)

print("Market Data Y")
print(market_data_Y)

In [None]:
# Hold out 20% of the rows as the test set; the split is seeded with random_state=0.
X_train, X_test, y_train, y_test = train_test_split(
    market_data_X,
    market_data_Y,
    test_size=0.2,
    random_state=0,
)

# Shapes of the four resulting pieces.
print('X train shape: {}'.format(X_train.shape))
print('X test shape: {}'.format(X_test.shape))
print('Y train shape: {}'.format(y_train.shape))
print('Y test shape: {}'.format(y_test.shape))

# Count each class once per split and report both classes from that result.
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()
print("No of customers accepted the offer in the last campaign i.e 1's {} in training data".format(train_class_counts[1]))
print("No of customers did not accept the offer in the last campaign i.e 0's {} in training data".format(train_class_counts[0]))
print("No of customers accepted the offer in the last campaign i.e 1's {} in test data".format(test_class_counts[1]))
print("No of customers did not accept the offer in the last campaign i.e 0's {} in test data".format(test_class_counts[0]))

#### Applying decision tree algorithm 

In [None]:
# Baseline: entropy-based decision tree (depth capped at 24) trained on the
# original training split and evaluated on the test split.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=24,
    random_state=42,
)
y_predict = clf.fit(X_train, y_train).predict(X_test)

# Store the per-class metrics as a CSV report.
result_metrics = classification_report(y_true=y_test, y_pred=y_predict, output_dict=True)
classification_report_csv(result_metrics, "DecisionTree")

In [None]:
# Score of the current `clf` on the test split.
test_score = clf.score(X_test, y_test)
print("Score:{}".format(test_score))

# Confusion matrix with rows = actual class (0, 1) and columns = predicted class.
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(
    cm,
    index=[0, 1],
    columns=["Predict_0", "Predict_1"],
)

# Draw the matrix as an annotated heatmap, save it, then show it.
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
chart_png(plt, "Confusion_Matrix_Heatmap")
plt.show()

#### In the training data there are a total of 1789 records, of which 1516 are 0's and 273 are 1's
#### In the testing data we have a total of 448 records, of which 387 are 0's and 61 are 1's
#### Using the decision tree model, 355 records in the test data are correctly predicted as 0's and 31 are correctly predicted as 1's, giving an accuracy score of 86%
#### However, this model is unlikely to perform well on unseen data because the data used for modelling is imbalanced: customers who accepted the offer make up only about 13% of the data, so precision and recall for that class are very low

In [None]:
# Resample the training split with a seeded RandomOverSampler.
ros = RandomOverSampler(random_state=30)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Class counts before and after resampling.
resampled_counts = y_resampled.value_counts()
print("Counter after Random Over Sampler", Counter(y_resampled).items())
print("Y Train:\n", y_train.value_counts())
print("Y Resampled:\n", resampled_counts)

# Shapes of the resampled features and target.
print("Shape after resampling X_train:{}".format(X_resampled.shape))
print("Shape after resampling y_train:{}".format(y_resampled.shape))

print("No of customers accepted the offer in the last campaign i.e 1's {} in resampled train data".format(resampled_counts[1]))
print("No of customers did not accept the offer in the last campaign i.e 0's {} in resampled train data".format(resampled_counts[0]))

In [None]:
# Same tree configuration as the baseline, now trained on the randomly
# over-sampled training data and evaluated on the untouched test split.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=24,
    random_state=42,
)
y_predict = clf.fit(X_resampled, y_resampled).predict(X_test)

result_metrics = classification_report(y_true=y_test, y_pred=y_predict, output_dict=True)
classification_report_csv(result_metrics, "Random_Over_Sampler")

#### Precision and recall for class 1 are still very low, just around 50%
#### There is still an issue even after oversampling with RandomOverSampler
#### Next, try SMOTE (Synthetic Minority Oversampling Technique) to generate synthetic samples for the minority class

In [None]:
# Class distribution of the training split before resampling.
counter = Counter(y_train)
print("Counter before Smote:-\t",counter)

# SMOTE resampling of the training split.
# Seeded so that re-running the notebook gives the same resampled rows
# (and therefore the same downstream metrics).
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y)
print("Counter after Smote:-\t",counter)
print("Counter items :-\t", counter.items())
print("Counter keys :-\t\t", counter.keys())

In [None]:
# Decision tree trained on the SMOTE-resampled data (X, y), evaluated on the test split.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=24,
    random_state=42,
)
y_predict = clf.fit(X, y).predict(X_test)

result_metrics = classification_report(y_true=y_test, y_pred=y_predict, output_dict=True)
print("Result Matrix of Decision Tree after using SMOTE")
classification_report_csv(result_metrics, "SMOTE")

In [None]:
# Class distribution of the training split before resampling.
counter = Counter(y_train)
print("Counter before Smote and Random Under Sampler:-\n",counter)

# Two-step resampling: SMOTE on the minority class, then random
# under-sampling of every class except the minority.
# Both samplers are seeded so the resampled dataset is reproducible.
# NOTE(review): if SMOTE already equalises the class counts, the
# under-sampling step presumably removes nothing — confirm from the counter output.
over = SMOTE(sampling_strategy = "minority", random_state=42)
under = RandomUnderSampler(sampling_strategy = "not minority", random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X, y = pipeline.fit_resample(X_train, y_train)

# summarize the new class distribution
counter = Counter(y)
print("Counter after Smote and Random Under Sampler:-\n",counter)
print("X Shape",X.shape)

In [None]:
# Decision tree trained on the output of the SMOTE + RandomUnderSampler
# pipeline (X, y), evaluated on the test split.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=24,
    random_state=42,
)
y_predict = clf.fit(X, y).predict(X_test)

result_metrics = classification_report(y_true=y_test, y_pred=y_predict, output_dict=True)
print("Result Matrix of Decision Tree after using SMOTE and Random Under Sampler")
classification_report_csv(result_metrics, "SMOTE and Random Under Sampler")

In [None]:
# Decision tree (depth capped at 12) evaluated with shuffled K-fold
# cross-validation on the resampled data (X, y) for several fold counts.
# NOTE(review): X, y come from the resampling step, so validation folds may
# contain synthetic rows derived from training-fold rows; validation scores
# are presumably optimistic — confirm.
clf_CVV = DecisionTreeClassifier(criterion='entropy', max_depth = 12,random_state=42)
kvalues =[5,10,15,20,25,30,35,40,45,50]

for k in kvalues:
    # Split into k shuffled folds; seeded so a re-run reproduces the same folds.
    kff = KFold(n_splits=k, random_state=42, shuffle=True)
    print("KFold begins for K value:", k)

    for fold, (train_index, valid_index) in enumerate(kff.split(X,y), start=1):
        # Fold-local names: the train/test split's X_train / y_train must not
        # be overwritten by a fold subset.
        X_fold_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_fold_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        clf_CVV.fit(X_fold_train, y_fold_train)
        y_pred_valid =clf_CVV.predict(X_valid)
        y_pred_train=clf_CVV.predict(X_fold_train)

        result_metrics_valid = classification_report(y_valid, y_pred_valid, output_dict=True)
        result_metrics_train = classification_report(y_fold_train, y_pred_train, output_dict=True)

        # Include the fold number in the file name; otherwise every fold of the
        # same k writes to the same CSV and only the last fold's report survives.
        fold_tag = str(k)+" - Fold "+str(fold)

        print("==> Result Matrix after using KFolds on KFold Training Data with K value", k)
        classification_report_csv(result_metrics_train,"KFold Training Data with K - "+fold_tag)
        print("\n")
        print("==> Result Matrix after using KFolds on KFold Validation Data with K value", k)
        classification_report_csv(result_metrics_valid,"KFold Validation Data with K - "+fold_tag)
        print("\n")

In [None]:
# Evaluate clf_CVV, as left by its most recent fit, on the held-out test split
# and store the report.
y_pred_test = clf_CVV.predict(X_test)
result_metrics_test = classification_report(
    y_true=y_test,
    y_pred=y_pred_test,
    output_dict=True,
)

print("Result Matrix after using KFolds on Test")
classification_report_csv(result_metrics_test, "KFold Testing Data")


In [None]:
# Five training-set fractions, evenly spaced from 20% to 80% of the data
# available for training in each CV split.
train_fractions = np.linspace(0.2,0.8,5)

# Learning curve for the depth-12 tree with 10-fold cross-validation.
# `shuffle=True` shuffles the training rows before the subsets are taken, so
# it is seeded to keep the curve reproducible across runs.
# `train_sizes` receives the sizes returned by learning_curve; the requested
# fractions stay in their own variable.
train_sizes, train_accuracy, test_accuracy = learning_curve(
    clf_CVV, X, y,
    train_sizes = train_fractions,
    scoring = 'accuracy',
    cv=10,
    shuffle=True,
    random_state=42,
)
print("Training Sizes \n", train_sizes)
print("\n")
print("Training Accuracy \n", train_accuracy)
print("\n")
print("Testing Accuracy \n",test_accuracy)

In [None]:
# Average the scores across axis 1 so that one value remains per training size.
train_scores_mean = train_accuracy.mean(axis=1)
test_scores_mean = test_accuracy.mean(axis=1)

print("Mean of Training Scores", train_scores_mean)
print("Mean of Testing Scores", test_scores_mean)

In [None]:
# Plot mean training and cross-validation scores against the training-set size.
fig, ax = plt.subplots(1, 1, figsize=(5, 3))

ax.set(
    title='A learning curve ',
    xlabel="Training examples",
    ylabel="Score",
)
ax.grid()

# Red: score on the training subsets; green: score on the validation folds.
ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
ax.legend(loc="best")

chart_png(plt, "Cross Validation - Learning Curve")
plt.show()

In [None]:
# Predict the test split with clf_CVV and store the report under a separate file name.
# NOTE(review): no cell between the KFold loop and this one calls clf_CVV.fit
# directly; learning_curve presumably works on clones of the estimator (see
# scikit-learn docs), in which case this repeats the "KFold Testing Data"
# report unchanged — confirm.
y_pred_test =clf_CVV.predict(X_test)
result_metrics_test = classification_report(y_test, y_pred_test, output_dict=True)
print("Result Matrix after using Cross Validation on Test")
classification_report_csv(result_metrics_test,"Cross Validation on Testing Data")

#### K Nearest Neighbours

In [None]:
# Distance-weighted KNN with 8 neighbours, fitted on the resampled data (X, y).
# NOTE(review): the features are used as-is here; no scaling step is applied
# in this cell — confirm whether that is intended for a distance-based model.
KNN = KNeighborsClassifier(n_neighbors=8, weights='distance').fit(X, y)
predicted_labels = KNN.predict(X_test)

# Last expression of the cell: the test-split score is shown as the cell output.
KNN.score(X_test, y_test)

In [None]:
# 10 shuffled folds, seeded so every neighbour count is evaluated on the same
# reproducible folds. Created once because it does not depend on the loop variable.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate distance-weighted KNN for 1..9 neighbours with 10-fold cross-validation
# on the resampled data (X, y).
for neighbors in range(1,10):
    print("===========================================================================")
    print("KFold for K Neighbor value:", neighbors)

    KNN_Tunning = KNeighborsClassifier(n_neighbors= neighbors , weights = 'distance' )
    for fold, (train_index, valid_index) in enumerate(kfold.split(X,y), start=1):
        # Fold-local names: the train/test split's X_train / y_train must not
        # be overwritten by a fold subset.
        X_fold_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_fold_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        KNN_Tunning.fit(X_fold_train, y_fold_train)

        y_pred_valid =KNN_Tunning.predict(X_valid)
        y_pred_train =KNN_Tunning.predict(X_fold_train)

        result_metrics_train = classification_report(y_fold_train, y_pred_train, output_dict=True)
        result_metrics_valid = classification_report(y_valid, y_pred_valid, output_dict=True)

        # Include the fold number in the file name; otherwise every fold of the
        # same neighbour count overwrites the previous fold's CSV.
        fold_tag = str(neighbors)+" - Fold "+str(fold)

        print("==> Result Matrix after using KFolds on KNeighborsClassifier Training Data with neighbors value", neighbors)
        classification_report_csv(result_metrics_train,"KFold - KNN - Training Data with K - "+ fold_tag)
        print("\n")
        print("==> Result Matrix after using KFolds on KNeighborsClassifier Validation Data with neighbors value", neighbors)
        classification_report_csv(result_metrics_valid,"KFold - KNN - Validation Data with K - "+fold_tag)
        print("\n")


In [None]:
# Test-split score of KNN_Tunning as left by its most recent fit.
# This used to be a bare expression in the middle of the cell, so the value
# was computed and silently discarded; print it instead.
print("Score:{}".format(KNN_Tunning.score(X_test, y_test)))

# Test-split score of a distance-weighted KNN fitted on (X, y) for every
# neighbour count from 1 to 49.
# NOTE(review): picking n_neighbors from this curve would tune the model on
# the test split — confirm this plot is only used for illustration.
neighbor_range = range(1,50)
scores =[]
for k in neighbor_range:
    KNN = KNeighborsClassifier(n_neighbors = k, weights = 'distance' )
    KNN.fit(X, y)
    scores.append(KNN.score(X_test, y_test))

# Label the figure so the saved PNG can be read on its own.
plt.plot(neighbor_range,scores)
plt.xlabel("n_neighbors")
plt.ylabel("Test score")
plt.title("KNN - Tuning Score")
chart_png(plt,"KNN - Tuning Score")
plt.show()

In [None]:
# Grid search over the neighbour count (3..24) for a distance-weighted KNN,
# scored with 5-fold cross-validation on the resampled data (X, y).
param_grid = {"n_neighbors":np.arange(3,25),"weights":['distance']}
# Seed the shuffled folds so the selected parameters do not change between runs.
kff = KFold(n_splits=5, random_state=42, shuffle=True)

KNN_grid_CV=KNeighborsClassifier()
grid_search_cv= GridSearchCV(KNN_grid_CV,param_grid,cv=kff)
grid_search_cv.fit(X,y)

print("Best Params")
print(grid_search_cv.best_params_,grid_search_cv.best_score_)

In [None]:
# Build the final KNN from the parameters selected by the grid search rather
# than a hard-coded neighbour count, so this cell always matches the search result.
KNN_best=KNeighborsClassifier(**grid_search_cv.best_params_)
KNN_best.fit(X,y)

# Predict the held-out test split and store the report.
# (The former bare `KNN_best.score(...)` call was dropped: its value was
# discarded, and the score is printed in the following cell.)
y_pred=KNN_best.predict(X_test)
result_metrics_test = classification_report(y_test, y_pred, output_dict=True)
print("Result Matrix after using KNeighborsClassifier on Test Data")
classification_report_csv(result_metrics_test,"KNN - Testing Data")

In [None]:
# Score of the final KNN model on the test split.
knn_test_score = KNN_best.score(X_test, y_test)
print("Score:{}".format(knn_test_score))

# Confusion matrix with rows = actual class (0, 1) and columns = predicted class.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
df_cm = pd.DataFrame(
    cm,
    index=[0, 1],
    columns=["Predict : 0", "Predict : 1"],
)

# Draw the matrix as an annotated heatmap, save it, then show it.
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
chart_png(plt, "KNN - HeatMap")
plt.show()

#### Random forest classifier

In [None]:
# Random forest (80 trees, 12 features considered per split) evaluated with
# shuffled K-fold cross-validation on the resampled data (X, y).
rfcl = RandomForestClassifier(n_estimators = 80, random_state=1,max_features=12)

kvalues =[3,5,7,10]
for k in kvalues:
    # Seeded so a re-run reproduces the same folds.
    kfold = KFold(n_splits=k, random_state=42, shuffle=True)
    print("===========================================================================")
    print("KFold for Random Forest value:", k)

    for fold, (train_index, valid_index) in enumerate(kfold.split(X,y), start=1):
        # Fold-local names: the train/test split's X_train / y_train must not
        # be overwritten by a fold subset.
        X_fold_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_fold_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        rfcl.fit(X_fold_train, y_fold_train)
        y_pred_valid =  rfcl.predict(X_valid)
        y_pred_train =  rfcl.predict(X_fold_train)

        result_metrics_train = classification_report(y_fold_train, y_pred_train, output_dict=True)
        result_metrics_valid = classification_report(y_valid, y_pred_valid, output_dict=True)

        # Include the fold number in the file name; otherwise every fold of the
        # same k overwrites the previous fold's CSV.
        fold_tag = str(k)+" - Fold "+str(fold)

        print("==> Result Matrix after using KFolds on RandomForestClassifier Training Data with k value", k)
        classification_report_csv(result_metrics_train,"KFold - RF - Training Data with K - "+ fold_tag)
        print("\n")
        print("==> Result Matrix after using KFolds on RandomForestClassifier Validation Data with k value", k)
        classification_report_csv(result_metrics_valid,"KFold - RF - Validation Data with K - "+fold_tag)
        print("\n")