In [None]:
# link to the colab notebook
# https://colab.research.google.com/drive/1ULW_nGPK74Y8t4Sp4PksIrJqhf-2yj-t#scrollTo=v3hw3KYKfQzt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import time
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn import tree
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB

In [None]:
!wget https://www.cmi.ac.in/~madhavan/courses/dmml2021apr/assignment1/bank-data.zip
!unzip bank-data.zip

In [None]:
# Load the semicolon-separated dataset.
# The file is looked up below the working directory first so the notebook is
# not tied to one machine; the original local path is kept as a fallback.
# NOTE(review): assumes the unzipped archive contains a file with this exact
# name somewhere under the working directory — confirm against the archive.
from pathlib import Path

DATA_FILENAME = "bank-additional-full.csv"
LOCAL_DATA_PATH = "C:/Users/HP/OneDrive/Desktop/Python Codes/Python DataSets/bank-additional-full.csv"

_found = sorted(Path(".").rglob(DATA_FILENAME))  # sorted -> deterministic pick
data_path = _found[0] if _found else Path(LOCAL_DATA_PATH)
data = pd.read_csv(data_path, sep=";")

In [None]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
is_object = data.dtypes == "O"
categorical_columns = list(data.columns[is_object])
numeric_columns = list(data.columns[~is_object])

In [None]:
# One annotated count plot per categorical column, split by the target `y`.
Y = data['y']
total = float(len(Y))  # number of rows; denominator of every percentage below

for label in categorical_columns:
    plt.figure(figsize=(20, 10))
    ax = sns.countplot(data=data, x=label, hue="y")

    # Write each bar's share of all rows just above the bar.
    for p in ax.patches:
        share = 100 * p.get_height() / total
        ax.annotate(f'{share:.1f}%', (p.get_x() + 0.1, p.get_height() + 5))

    # Eleven evenly spaced y ticks from 0 to `total`, labelled as percentages.
    ax.yaxis.set_ticks(np.linspace(0, total, 11))
    tick_shares = 100 * ax.yaxis.get_majorticklocs() / total
    ax.set_yticklabels([f'{s:.1f}%' for s in tick_shares])
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")

    plt.show()

In [None]:
# since the data is imbalanced (88.7% no, and 11.3% yes), and we don't want to lose any potential customers,
# we focus on attributes that affect the positive response more than the negative response

# month affects negative responses more than positive responses (positive response is almost same for all months), so it can be dropped
# both negative and positive response is same for almost all day_of_week
# default seems important but then there is not much variation in default, almost all entries are 'no' and we will replace 'unknown' with 'no' later.
# education also does not affect positive responses much, but we can keep it
# job seems reasonable to keep

In [None]:
# conclusion: categorical columns to remove
#default
#month
#day_of_week

In [None]:
%matplotlib inline
sns.boxplot(data=data,x='y',y='age')
# the both labels 'yes' and 'no' have almost the same centre (median), thus age doesnt affect y much

In [None]:
%matplotlib inline
sns.boxplot(data=data,x='y',y='campaign')

In [None]:
import statistics

# Summary of the `campaign` column as a tuple: (max, min, sample variance, median).
campaign = data['campaign']
max(campaign), min(campaign), statistics.variance(campaign), statistics.median(campaign)

In [None]:
# Same plot restricted to rows with campaign < 6.
few_campaigns = data.loc[data['campaign'] < 6]
sns.boxplot(x='y', y='campaign', data=few_campaigns)
# seems like campaign affects y somehow

In [None]:
# `pdays` per class, leaving out the rows where pdays equals 999.
pdays_not_999 = data.loc[data['pdays'] != 999]
sns.boxplot(x='y', y='pdays', data=pdays_not_999)
# pdays also affects target variable

In [None]:
  plt.figure(figsize=(20,10))
  Y = data['y']
  total = len(Y)*1.
  ax=sns.countplot(x='pdays', data=data[data['pdays']==999], hue="y")
  for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x()+0.1, p.get_height()+5))

  ax.yaxis.set_ticks(np.linspace(0, total, 11))
  ax.set_yticklabels(map('{:.1f}%'.format, 100*ax.yaxis.get_majorticklocs()/total))
  ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
  plt.show()

# 8.9% of clients who were not contacted before took the term deposit

In [None]:
# `previous` per class, restricted to previous < 3 because outliers were
# compressing the boxplot and making it unreadable.
previous_lt_3 = data.loc[data['previous'] < 3]
sns.boxplot(x='y', y='previous', data=previous_lt_3)

In [None]:
# Distribution of `emp.var.rate` for each value of the target `y`.
sns.boxplot(x='y', y='emp.var.rate', data=data)
# important

In [None]:
# Distribution of `cons.conf.idx` for each value of the target `y`.
# `emp.var.rate` is already plotted in the preceding cell, so repeating it here
# added nothing, while this column was not plotted anywhere.
# NOTE(review): assumes the dataset has a `cons.conf.idx` column (standard
# bank-additional schema) — confirm.
sns.boxplot(data=data,x='y',y='cons.conf.idx')

In [None]:
# Distribution of `cons.price.idx` for each value of the target `y`.
sns.boxplot(x='y', y='cons.price.idx', data=data)

In [None]:
# Distribution of `euribor3m` for each value of the target `y`.
sns.boxplot(x='y', y='euribor3m', data=data)

In [None]:
# Distribution of `nr.employed` for each value of the target `y`.
sns.boxplot(x='y', y='nr.employed', data=data)

In [None]:
# `duration` per class, restricted to duration < 1400.
duration_lt_1400 = data.loc[data['duration'] < 1400]
sns.boxplot(x='y', y='duration', data=duration_lt_1400)
# as written in metadata and visible from the boxplot, duration highly affects the class and is not always obtained for the test data
# thus removing it from test as well as train data

In [None]:
#conclusion: remove 
#age
#default
#month
#day_of_week
#duration

In [None]:
# Correlation heat map of the numeric columns.
# `data` still contains string-valued columns at this point; DataFrame.corr()
# raises on those in pandas >= 2.0 (older releases dropped them silently), so
# the numeric columns are selected explicitly.
plt.figure(figsize=(10,10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Remove the `duration` column altogether: as given in the txt file, it may or
# may not be available, so it is not kept for the test data either.
data.drop(columns=['duration'], inplace=True)

In [None]:
# cleaning the training data

In [None]:
# remove features based on the plots above
data.drop(['age','default','month','day_of_week'], axis=1,inplace=True)


# divide training data into yes and no class labels for further process
# (.copy() so the edits below act on independent frames rather than on slices
# of `data`, which would trigger SettingWithCopy behaviour)
data_no = data[data['y']=='no'].copy()
data_yes = data[data['y']=='yes'].copy()

# object-typed columns: the only ones that can hold the 'unknown' marker
cat_col = [col for col in data_yes.columns if data_yes[col].dtype=="O"]


# 1) delete duplicates with class label = 'no'
data_no = data_no.drop_duplicates()

# 2) delete missing data rows with class label = 'no'
# A single vectorised mask replaces the row-by-row Python loop.
has_unknown = data_no[cat_col].eq('unknown').any(axis=1)
missing_rows = data_no.index[has_unknown].tolist()   # missing_rows = list of indices of rows with missing data
data_no = data_no.loc[~has_unknown]

# 3) for rows with missing data and class label = 'yes', replace unknown with mode of column
# (the mode is taken over the full `data` frame, both classes)
for col in cat_col:
    data_yes[col] = data_yes[col].replace('unknown', data[col].mode()[0])

# not deleting duplicate rows with class label = 'yes' to stabilise the data with 'yes' as results.

In [None]:
# Stack the cleaned 'no' rows on top of the 'yes' rows; original index labels are kept.
data = pd.concat((data_no, data_yes), axis=0)

In [None]:
# Encode every object-typed column of `data` as integer codes, printing the
# code -> original value mapping of each column.
object_columns = [col for col in data.columns if data[col].dtype == "O"]
for x in object_columns:
    enc = LabelEncoder()
    data[x] = enc.fit_transform(data[x])
    print(x, dict(enumerate(enc.classes_)))

In [None]:
# divide into test and train
# The target is selected by name rather than by position so the split does not
# silently break if `y` is ever not the last column. `y` stays a one-column
# DataFrame, which is what the positional slice iloc[:, -1:] produced.
x = data.drop(columns='y')
y = data[['y']]

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, shuffle=True, random_state=1)

In [None]:
  # Class balance of the training target, with each bar annotated as a
  # percentage of the training rows.
  plt.figure(figsize=(20, 10))
  Y = y_train
  total = float(len(Y))
  ax = sns.countplot(data=y_train, x='y', hue="y")

  # Write each bar's share of the training rows just above the bar.
  for p in ax.patches:
    share = 100 * p.get_height() / total
    ax.annotate(f'{share:.1f}%', (p.get_x() + 0.1, p.get_height() + 5))

  # Eleven evenly spaced y ticks from 0 to `total`, labelled as percentages.
  ax.yaxis.set_ticks(np.linspace(0, total, 11))
  tick_shares = 100 * ax.yaxis.get_majorticklocs() / total
  ax.set_yticklabels([f'{s:.1f}%' for s in tick_shares])
  ax.set_xticklabels(ax.get_xticklabels())

  plt.show()

In [None]:
# Random Forest

In [None]:
# Disabled hyper-parameter search for the random forest (n_estimators=500).
# It is kept as a bare string literal, so nothing inside it is executed; the
# cell only evaluates to the string itself.
'''param_grid= {'n_estimators':[500], 'min_samples_split': [5,10,15,20], 'max_depth':[4,5,6,10,15,20]}

grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 0,scoring='recall')
grid.fit(x_train, y_train) 
 
# print best parameter after tuning 
print(grid.best_params_)'''

In [None]:
# {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 500} = output of above cell

In [None]:
# Disabled hyper-parameter search for the random forest (n_estimators=200).
# It is kept as a bare string literal, so nothing inside it is executed; the
# cell only evaluates to the string itself.
'''param_grid= {'n_estimators':[200], 'min_samples_split': [5,10,15,20], 'max_depth':[4,5,6,10,15,20]}

grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 0,scoring='recall')
grid.fit(x_train, y_train) 
 
# print best parameter after tuning 
print(grid.best_params_)'''

In [None]:
#{'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200} = output of above cell

In [None]:
# Random forest: fit on the training split, then report train and test metrics.
#
# `min_impurity_split` was removed from scikit-learn in 1.0 and passing it
# there raises TypeError. It is forwarded only when the installed estimator
# still exposes it, so the cell runs on both old and new releases.
rf_params = dict(n_estimators=500, criterion='gini', max_depth=20, bootstrap=True,
                 min_samples_split=5, oob_score=True, random_state=1)
if 'min_impurity_split' in RandomForestClassifier().get_params():
    rf_params['min_impurity_split'] = 0.1

start= time.time()

rf= RandomForestClassifier(**rf_params)
# y_train is a one-column DataFrame; ravel it to the 1-D array fit() expects
model1= rf.fit(x_train, y_train.values.ravel())
pred_train= model1.predict(x_train)

end = time.time()
print(f"Runtime of the program is {end - start}")

print()

print('TRAIN')
print(classification_report(y_train,pred_train))
print('recall= ',recall_score(y_train,pred_train))
print()
predictions= model1.predict(x_test)
print('TEST')
print(classification_report(y_test,predictions))
print('recall= ',recall_score(y_test,predictions))
print('f1_score= ',f1_score(y_test,predictions))

In [None]:
# Out-of-bag score of the fitted forest; the attribute is available because
# the classifier was constructed with oob_score=True.
model1.oob_score_  #Sort of validation score
#So model generalises well

In [None]:
#Feature Importance
# Raw importance array of the fitted forest (one value per training feature).
model1.feature_importances_

In [None]:
# Names of the training features, followed by how many there are.
feature_names = list(x_train.columns)
len(feature_names)

In [None]:
# Spread of each feature's importance across the individual trees of the forest.
per_tree_importances = np.array([est.feature_importances_ for est in model1.estimators_])
std = per_tree_importances.std(axis=0)

forest_importances = pd.Series(model1.feature_importances_, index=feature_names)

# Bar chart of the forest-level importances with the per-tree std as error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax, yerr=std)
ax.set_title("Feature importances")


In [None]:
# Feature importances as a one-column table indexed by feature name,
# most important feature first.
feature_importances = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=x.columns)
    .sort_values('importance', ascending=False)
)

feature_importances


In [None]:
# Decision Tree 

In [None]:
# Disabled hyper-parameter search for the decision tree.
# It is kept as a bare string literal, so nothing inside it is executed; the
# cell only evaluates to the string itself.
'''param_grid= { 'min_samples_split': [5,10,15,20], 'max_depth':[2,3,4,5,6,7,8],'min_impurity_split':[0.1,0.2,0.3,0.4],'random_state':[1]}

grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, refit = True, verbose = 0,scoring='recall')
grid.fit(x_train, y_train) 
 
# print best parameter after tuning 
print(grid.best_params_)'''

In [None]:
# {'max_depth': 8, 'min_impurity_split': 0.1, 'min_samples_split': 10, 'random_state': 1} = output of above cell

In [None]:
# Decision tree: fit on the training split, then report train and test metrics.
#
# `min_impurity_split` was removed from scikit-learn in 1.0 and passing it
# there raises TypeError. It is forwarded only when the installed estimator
# still exposes it, so the cell runs on both old and new releases.
dtc_params = dict(max_depth=8, min_samples_split=10, random_state=1)
if 'min_impurity_split' in tree.DecisionTreeClassifier().get_params():
    dtc_params['min_impurity_split'] = 0.1

start= time.time()

dtc = tree.DecisionTreeClassifier(**dtc_params)
model2 = dtc.fit(x_train, y_train)
predictions= model2.predict(x_test)
pred_train=model2.predict(x_train)

end = time.time()
print(f"Runtime of the program is {end - start}")

print()

print('TRAIN')
print(classification_report(y_train,pred_train))
print('recall= ',recall_score(y_train,pred_train))
print()
print('TEST')
print(classification_report(y_test,predictions))
print('recall= ',recall_score(y_test,predictions))
print('f1_score= ',f1_score(y_test,predictions))
# support is number of occurence of each label in y_test

In [None]:
# Disabled tree visualisation, kept as a bare string literal so it is not
# executed. NOTE(review): the snippet uses `graphviz.Source`, but no
# `import graphviz` is visible in this notebook — add one before enabling it.
'''
plt.figure(figsize=(10,10))

from sklearn.tree import export_graphviz
dot_data = export_graphviz(dtc, out_file=None, filled=True, rounded=True,
                                feature_names=list(x_train.columns),  
                                class_names=['no','yes'])
graph = graphviz.Source(dot_data)  
graph
'''

#Can be run easily in Colab

In [None]:
# Naive Bayes

In [None]:
# Correlation heat map of the training features.
plt.figure(figsize=(10, 10))
train_corr = x_train.corr()
sns.heatmap(train_corr, annot=True)

In [None]:
# GaussianNB before removing correlated features
# The timed section covers fitting plus prediction on both splits.
start = time.time()

clf = GaussianNB()
model3 = clf.fit(x_train, y_train)
pred_train = model3.predict(x_train)
predictions = model3.predict(x_test)

end = time.time()

print(f"Runtime of the program is {end - start}")
print()

# Training-split metrics.
print('TRAIN', classification_report(y_train, pred_train), sep='\n')
print('recall= ', recall_score(y_train, pred_train))
print()

# Test-split metrics; "support" in the report is the number of occurrences of
# each label in y_test.
print('TEST', classification_report(y_test, predictions), sep='\n')
print('recall= ', recall_score(y_test, predictions))
print('f1_score= ', f1_score(y_test, predictions))

In [None]:
# we read somewhere that NB can be applied in mixed data in two ways:
# 1) convert all continuous columns to categorical
# 2) mixed model containing both GaussianNB() and CategoricalNB()
# we try to attempt the 1) solution.

In [None]:
# encoding numerical variables in x_train and x_test to apply categorical NB
#
# The encoder is fitted on the training column only and the test column is
# mapped onto the SAME levels, so a given integer code means the same value in
# both splits (two independently fitted encoders would not guarantee that).
# A test value never seen in training is assigned the code of the nearest
# training level, which also keeps every test code inside the range the
# classifier was fitted on.
def _codes_from_levels(levels, values):
    """Return, for each entry of `values`, the index of the nearest entry of `levels`.

    `levels` must be a sorted 1-D array of unique numbers (as LabelEncoder's
    `classes_` is); ties between two neighbouring levels go to the upper one.
    """
    values = np.asarray(values)
    upper = np.searchsorted(levels, values).clip(0, len(levels) - 1)
    lower = (upper - 1).clip(0, None)
    nearer_lower = np.abs(values - levels[lower]) < np.abs(levels[upper] - values)
    return np.where(nearer_lower, lower, upper)


# Encoded copies are used so that x_train / x_test keep their original values.
x_train_cat = x_train.copy()
x_test_cat = x_test.copy()
for col in [c for c in x_train.columns if x_train[c].dtype != "O"]:
    enc = LabelEncoder()
    x_train_cat[col] = enc.fit_transform(x_train[col])
    x_test_cat[col] = _codes_from_levels(enc.classes_, x_test[col])

# CategoricalNB before removing correlated features
start = time.time()

clf = CategoricalNB()
model3= clf.fit(x_train_cat,y_train)
predictions= model3.predict(x_test_cat)
pred_train=model3.predict(x_train_cat)

end = time.time()

print(f"Runtime of the program is {end - start}")


print()

print('TRAIN')
print(classification_report(y_train,pred_train))
print('recall= ',recall_score(y_train,pred_train))
print()
print('TEST')
print(classification_report(y_test,predictions))
print('recall= ',recall_score(y_test,predictions))
print('f1_score= ',f1_score(y_test,predictions))
# support is number of occurence of each label in y_test

In [None]:
# Drop the correlated features from both splits.
# The labels are given as a plain list of column names; passing a DataFrame
# slice as the labels only worked because iterating a frame yields its column
# names.
correlated_features = ['cons.price.idx', 'emp.var.rate', 'nr.employed']
x_train.drop(columns=correlated_features, inplace=True)
x_test.drop(columns=correlated_features, inplace=True)

In [None]:
# GaussianNB after removing correlated features
# The timed section covers fitting plus prediction on both splits.
start = time.time()

clf = GaussianNB()
model3 = clf.fit(x_train, y_train)
pred_train = model3.predict(x_train)
predictions = model3.predict(x_test)

end = time.time()
print(f"Runtime of the program is {end - start}")
print()

# Training-split metrics.
print('TRAIN', classification_report(y_train, pred_train), sep='\n')
print('recall= ', recall_score(y_train, pred_train))
print()

# Test-split metrics; "support" in the report is the number of occurrences of
# each label in y_test.
print('TEST', classification_report(y_test, predictions), sep='\n')
print('recall= ', recall_score(y_test, predictions))
print('f1_score= ', f1_score(y_test, predictions))

In [None]:
# encoding numerical variables in x_train and x_test to apply categorical NB
#
# The encoder is fitted on the training column only and the test column is
# mapped onto the SAME sorted levels, so a given integer code means the same
# value in both splits (two independently fitted encoders would not guarantee
# that). A test value never seen in training gets the code of the nearest
# training level (ties go to the upper one).
for col in [c for c in x_train.columns if x_train[c].dtype != "O"]:
    enc = LabelEncoder()
    train_codes = enc.fit_transform(x_train[col])
    levels = enc.classes_  # sorted unique training values

    test_values = x_test[col].to_numpy()
    upper = np.searchsorted(levels, test_values).clip(0, len(levels) - 1)
    lower = (upper - 1).clip(0, None)
    nearer_lower = np.abs(test_values - levels[lower]) < np.abs(levels[upper] - test_values)

    x_train[col] = train_codes
    x_test[col] = np.where(nearer_lower, lower, upper)

In [None]:
# CategoricalNB after removing correlated features
# The timed section covers fitting plus prediction on both splits.
start = time.time()

clf = CategoricalNB()
model3 = clf.fit(x_train, y_train)
pred_train = model3.predict(x_train)
predictions = model3.predict(x_test)

end = time.time()

print(f"Runtime of the program is {end - start}")
print()

# Training-split metrics.
print('TRAIN', classification_report(y_train, pred_train), sep='\n')
print('recall= ', recall_score(y_train, pred_train))
print()

# Test-split metrics; "support" in the report is the number of occurrences of
# each label in y_test.
print('TEST', classification_report(y_test, predictions), sep='\n')
print('recall= ', recall_score(y_test, predictions))
print('f1_score= ', f1_score(y_test, predictions))