In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors, linear_model
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.svm import SVC
# %matplotlib inline

In [5]:
bank_addidtional_full = "bank-additional-full.csv"

data = pd.read_csv('data/bank_addidtional_full', sep = ';')
df = pd.DataFrame(data)
df.head(10)

FileNotFoundError: File b'data/bank_addidtional_full' does not exist

Checking for missing values. The data is complete and there is no missing values

In [None]:
df.isnull().sum()

Dropping Catogerical columns. The 'duration' columns was dropped as well since the output varaiable y is highly dependent on the duration. According to the UCI website, this input 'duration' should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

In [None]:
df.drop(['job','duration', 'day_of_week','marital','education','default', 'housing','loan', 'contact', 'month', 'poutcome'], axis=1, inplace=True)
#df.drop(['job', 'marital','education','default', 'housing','loan', 'contact', 'month', 'poutcome'], axis=1, inplace=True)

In [None]:
df['y'] = df['y'].map({'yes': 1, 'no': 0})

In [None]:
df.head(10)

In [None]:
sns.pairplot(df,hue='y',palette='coolwarm')

# KNN to classify supervised data

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(df.drop('y',axis=1))

In [None]:
scaled_features = scaler.transform(df.drop('y',axis=1))

In [None]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()

Splitting the data into train and test data. The test data size chosen to be 30%.

In [None]:
X = df_feat.loc[:,]
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30,random_state=42)

Instantiating the KNN model

In [None]:
knn = KNeighborsClassifier(n_neighbors=1)

Training the model

In [None]:
knn.fit(X_train,y_train)

Predict the target

In [None]:
pred = knn.predict(X_test)

Evaluating the performance of the model

In [None]:
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print (accuracy_score(y_test, pred))

Improving the model by choosing a better k value. A plot has to be constructed (elbow method plot) for the error rate Vs. K Value (range from 1 to 40)

In [None]:
error_rate = []


for i in range(1,40):
    
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

In [None]:
plt.figure(figsize=(10,6))
plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
# FIRST A QUICK COMPARISON TO OUR ORIGINAL K=1
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('WITH K=1')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

# NOW WITH K=22
knn = KNeighborsClassifier(n_neighbors=22)

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('\n')
print('WITH K=22')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
print (accuracy_score(y_test, pred))

Based on the classification report above, we see that with K=1 the precision of predicting target '1' or 'yes'was 31%. The elbow plot above was used to determine the value of k were the error rate is minimum. Then we fit our model to the train data with the new chosen k value (23) and predict the target using the test data. We see a significant improvement with the new k value (23) as comapred to k =1. The precison of predicting target of 1 doubled from 34% to 64%. The weighted avg for precision and recall improved from 85% and 85% to 88% and 90% respectively.   

In [None]:
for i in range(X_test.shape[0]):   
    if pred[i] == 1:
        plt0, = plt.plot(X_test.iloc[i, 0], X_test.iloc[i, 1], 'bs', label='YES') 
    else:
        plt1, = plt.plot(X_test.iloc[i, 0], X_test.iloc[i, 1], 'g^', label='NO') 
      
plt.legend(handles = [plt0, plt1])
plt.xlabel('Age')
plt.ylabel('Campaign')

In [None]:
#emp.var.rate Vs campaign
for i in range(X_test.shape[0]):   
    if pred[i] == 1:
        plt0, = plt.plot(X_test.iloc[i, 4], X_test.iloc[i, 1], 'bs', label='YES') 
    else:
        plt1, = plt.plot(X_test.iloc[i, 4], X_test.iloc[i, 1], 'g^', label='NO') 
        
plt.legend(handles = [plt0, plt1])
plt.xlabel('employment variation rate ')
plt.ylabel('Campaign')

In [None]:
# previous vs campaign
for i in range(X_test.shape[0]):   
    if pred[i] == 1:
        plt0, = plt.plot(X_test.iloc[i, 3], X_test.iloc[i, 1], 'bs', label='YES') 
    else:
        plt1, = plt.plot(X_test.iloc[i, 3], X_test.iloc[i, 1], 'g^', label='NO') 
        
plt.legend(handles = [plt0, plt1])
plt.xlabel('previous')
plt.ylabel('Campaign')

From the plots above, we see that there is no obvious sepration between any of the two features plotted and the predicted outcome. This could possibly mean that the prediction can be done with more features provided and it would be very hard to predict the target given two features only. However, from the campaign vs previous graph, we see that there is a better separation between the two classes "YES" and "NO" which mean that these two features can possibly be used to predict the target. 


# SVM to classify supervised data

In [None]:
bank_addidtional = "bank-additional.csv"

data_SVM = pd.read_csv(bank_addidtional, sep = ';')
dfA = pd.DataFrame(data_SVM)
dfA.head(10)

In [None]:
dfA.drop(['job','duration', 'day_of_week','marital','education','default', 'housing','loan', 'contact', 'month', 'poutcome'], axis=1, inplace=True)
dfA['y'] = dfA['y'].map({'yes': 1, 'no': 0})


In [None]:
dfA.head(10)

In [None]:
scaler.fit(dfA.drop('y',axis=1))
scaled_features = scaler.transform(dfA.drop('y',axis=1))

In [None]:
df_feat = pd.DataFrame(scaled_features,columns=dfA.columns[:-1])
df_feat.head()

In [None]:
X = df_feat
y = dfA['y']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=42)
# X_test.head()

In [None]:
X_test.head()
X_train

## Predictions and Evaluations

Now, predicting using the trained model.

In [None]:
model = SVC(kernel='linear')
model.fit(X_train,y_train)

In [None]:
predictions = model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

Comparing the classification report between KNN and SVM, we see that the SVM had a better accuracy predicting overall. The accuracy of predciting the Yes (1's) increased by 6% from 63% to 69%. An overall accuracy of 89% is still means that there has could more improvements made.

In [None]:
for i in range(X_test.shape[0]):   
    if predictions[i] == 1:
        plt0, = plt.plot(X_test.iloc[i, 0], X_test.iloc[i, 1], 'bs', label='YES') 
    else:
        plt1, = plt.plot(X_test.iloc[i, 0], X_test.iloc[i, 1], 'g^', label='NO') 
        
plt.legend(handles = [plt0, plt1])
plt.xlabel('Age')
plt.ylabel('Campaign')

In [None]:
#emp.var.rate Vs campaign
for i in range(X_test.shape[0]):   
    if predictions[i] == 1:
        plt0, = plt.plot(X_test.iloc[i, 4], X_test.iloc[i, 1], 'bs', label='YES') 
    else:
        plt1, = plt.plot(X_test.iloc[i, 4], X_test.iloc[i, 1], 'g^', label='NO') 
        
plt.legend(handles = [plt0, plt1])
plt.xlabel('employment variation rate ')
plt.ylabel('Campaign')

In [None]:
# employemnet variation rate vs campaign
for i in range(X_test.shape[0]):   
    if predictions[i] == 1:
        plt0, = plt.plot(X_test.iloc[i, 3], X_test.iloc[i, 1], 'bs', label='YES') 
    else:
        plt1, = plt.plot(X_test.iloc[i, 3], X_test.iloc[i, 1], 'g^', label='NO') 
        
plt.legend(handles = [plt0, plt1])
plt.xlabel('previous')
plt.ylabel('Campaign')

From the plots above, we that there is no obvious sepration between any of the two features plotted and the predicted outcome. This could possibly mean that the prediction can be done with more features provided and it would be very hard to predict the target given two features only. However, from the campaign vs previous graph, we see that there is a separation between the two classes "YES" and "NO" which mean that these two features can be used to predict the outcome. 

# SVM with RBF kernal

In [None]:
model = SVC(kernel='rbf')
model.fit(X_train,y_train)

In [None]:
predictions = model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

Comparing the accuracy of the SVM with linear kernel to the SVM with RBF kernel, we see that the accuracy of predicting the 1's (YES) imporved by 7% (from 69% to 76%) and the overal accuracy increased by 1% (from 88% to 89%).