In [1]:
#Import the necessary data manipulation and visualization libraries

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
#Import the machine learning algorithms from sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

In [3]:
#Import some utility and metric classes

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold, cross_val_score



In [4]:
#Defining the header since the data does not have any defined and importing the dataset

headers = ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10","A11","A12","A13","A14","A15","A16"]

df = pd.read_csv("data/data.csv", header=None, names=headers, na_values='?')

In [5]:
#Explore the data.

df.shape

(690, 16)

In [6]:
#Explore the field types.

df.dtypes

A1      object
A2     float64
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14    float64
A15      int64
A16     object
dtype: object

In [7]:
#View the format of the data.

df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [8]:
#Checking how many null values are in each column.

df.isnull().sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [9]:
#Unique function is used to identify the unique values in the 'A1' field.

df.A1.unique()

array(['b', 'a', nan], dtype=object)

In [10]:
#View the number of records for each attribute

df.groupby(['A1'])\
  .size()\
  .sort_values(ascending=False)  

A1
b    468
a    210
dtype: int64

In [11]:
df_data = df.fillna({"A1": "b"})

In [12]:
df_data.A1.unique()

array(['b', 'a'], dtype=object)

In [13]:
df_data.mean()

A2       31.568171
A3        4.758725
A8        2.223406
A11       2.400000
A14     184.014771
A15    1017.385507
dtype: float64

In [14]:
df2 = df_data.fillna(df_data.mean())
df2.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [15]:
categorical = df2.dtypes[df2.dtypes == "object"].index
categorical

Index(['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'A16'], dtype='object')

In [29]:
for c in categorical:
    df2[c] = df2[c].astype('category')
    df2[c] = df2[c].cat.codes
df2.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,30.83,0.0,2,1,13,8,1.25,1,1,1,0,0,202.0,0,0
1,0,58.67,4.46,2,1,11,4,3.04,1,1,6,0,0,43.0,560,0
2,0,24.5,0.5,2,1,11,4,1.5,1,0,0,0,0,280.0,824,0
3,1,27.83,1.54,2,1,13,8,3.75,1,1,5,1,0,100.0,3,0
4,1,20.17,5.625,2,1,13,8,1.71,1,0,0,0,2,120.0,0,0


In [30]:
df2.dtypes

A1        int8
A2     float64
A3     float64
A4        int8
A5        int8
A6        int8
A7        int8
A8     float64
A9        int8
A10       int8
A11      int64
A12       int8
A13       int8
A14    float64
A15      int64
A16       int8
dtype: object

In [31]:
train, test = train_test_split(df2, test_size=0.2, train_size=0.8)
test.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
83,0,31.568171,3.5,2,1,4,8,3.0,1,0,0,1,0,300.0,0,1
634,0,23.75,0.71,2,1,13,8,0.25,0,1,1,1,0,240.0,4,1
241,1,48.25,25.085,2,1,13,8,1.75,1,1,3,0,0,120.0,14,0
361,1,23.08,2.5,2,1,6,3,0.085,0,0,0,1,0,100.0,4208,1
534,1,31.83,2.5,2,1,1,8,7.5,1,0,0,1,0,523.0,0,1


In [32]:
X_train = train.drop("A16", axis=1)
Y_train = train["A16"]
X_test = test.drop("A16", axis=1)
Y_test = test["A16"]
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((552, 15), (552,), (138, 15), (138,))

In [33]:
scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)

pca = PCA(0.95)

pca.fit(X_train)

pca.n_components_

X_train = pca.transform(X_train)

X_train = pd.DataFrame(X_train)

In [34]:
scaler = StandardScaler()

scaler.fit(X_test)

X_test = scaler.transform(X_test)

pca = PCA(0.95)

pca.fit(X_test)

pca.n_components_

X_test = pca.transform(X_test)

X_test = pd.DataFrame(X_test)

In [35]:
#Viewing the data the standardisation of the data can be seen

X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.154066,-1.099182,0.09649,-0.536719,0.691792,-1.463057,0.764202,0.274432,0.728335,-1.253688,-0.395577,-0.62777,-0.077198
1,-0.512909,-1.185816,-0.829938,-1.64436,0.240117,0.374665,0.968118,-0.372726,0.619781,0.967067,0.685088,-1.007659,-0.951712
2,2.33719,0.82399,-1.105012,0.208666,-2.705801,0.749991,0.383773,-0.33575,0.290809,-1.57315,1.843614,1.322389,0.018513
3,-1.15215,-0.583063,0.140183,0.903135,0.738279,-0.295624,0.171082,-0.675908,0.864817,1.370806,-0.337759,1.039671,-0.291874
4,-0.252606,-0.87557,2.002492,-0.36768,0.110207,-1.550919,-0.755588,0.259206,1.36499,-1.143088,-0.355515,-0.397487,1.103045


In [36]:
# Applying Logistics Regression to the Standardised Data

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_log

75.359999999999999

In [37]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_svc

72.459999999999994

In [38]:
# KNN Classification method applied

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_knn

73.189999999999998

In [39]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_gaussian

73.909999999999997

In [40]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_perceptron



67.390000000000001

In [41]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_linear_svc

76.810000000000002

In [42]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_sgd



74.640000000000001

In [43]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_decision_tree

68.120000000000005

In [44]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_random_forest

75.359999999999999

In [45]:
#Comparison of the various classification methods.

models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
7,Linear SVC,76.81
2,Logistic Regression,75.36
3,Random Forest,75.36
6,Stochastic Gradient Decent,74.64
4,Naive Bayes,73.91
1,KNN,73.19
0,Support Vector Machines,72.46
8,Decision Tree,68.12
5,Perceptron,67.39


In [46]:
#Applying Random Forest Classifier to the data.

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X_train, Y_train = make_classification(n_samples=138, n_features=13, 
                                     n_informative=2, n_redundant=2,
                                     random_state=None, shuffle=True)

clf = RandomForestClassifier(max_depth=None, random_state=None)

clf.fit(X_train, Y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', 
           max_depth=None ,max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

print(clf.feature_importances_)

print(clf.predict(X_test))

[ 0.02329632  0.06055277  0.02317051  0.03781209  0.02846782  0.03275956
  0.01255523  0.04407471  0.1067085   0.03065067  0.4380849   0.06353377
  0.09833315]
[0 1 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1
 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0 0 1 0
 0 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 0 1 1 1 1 0 0 0 0
 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 1 1 0 1 1 1 0]


In [47]:
Y_pred = clf.predict(X_test)
acc_log = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_log

57.25

In [48]:
#Performing Recall on the Reclassified data

from sklearn.metrics import recall_score

y_true = Y_train
y_pred = clf.predict(X_test)
recall_score(y_true, y_pred, average=None)

array([ 0.46376812,  0.4057971 ])

In [49]:
acc_log = round(recall_score(y_true,y_pred) * 100, 2)
acc_log

40.579999999999998

In [50]:
#Performing Randomised Search on the data to determine 
#the best parameters for the RandomForest Classifier

print(__doc__)

import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# build a classifier
clf = RandomForestClassifier(n_estimators=10)


# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"n_estimators": sp_randint(1, 20),
              "max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X_train, Y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

Automatically created module for IPython interactive environment
RandomizedSearchCV took 1.38 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.913 (std: 0.031)
Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 7, 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 6}

Model with rank: 1
Mean validation score: 0.913 (std: 0.031)
Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 8, 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 15}

Model with rank: 1
Mean validation score: 0.913 (std: 0.018)
Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 9, 'n_estimators': 16}

Model with rank: 1
Mean validation score: 0.913 (std: 0.035)
Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 7, 'min_samples_split': 5, 'n_estimators': 4}



In [51]:
#The best parameters from the Randomised Search applied to Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X_train, Y_train = make_classification(n_samples=138 ,n_features=13, 
                                     n_informative=2, n_redundant=2,
                                     random_state=None, shuffle=True)

clf = RandomForestClassifier(max_depth=None, random_state=None)

clf.fit(X_train, Y_train)
RandomForestClassifier(bootstrap=False, max_depth=3 ,max_features=8, 
           min_samples_leaf=5, min_samples_split=3,n_estimators=15)

print(clf.feature_importances_)

print(clf.predict(X_test))

[ 0.02793621  0.03234475  0.0518954   0.04032561  0.04824939  0.04783691
  0.07329604  0.0452726   0.01597464  0.36136261  0.17729632  0.05189241
  0.02631713]
[0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 1 1 0 1 1 0 1 1 1 0 0 1 0 1 0 0 0 1
 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1
 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0]


In [52]:
Y_pred = clf.predict(X_test)
acc_log = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_log

50.0

In [53]:
#Recall performed on the Classifier

from sklearn.metrics import recall_score

y_true = Y_train
y_pred = clf.predict(X_test)
recall_score(y_true, y_pred, average=None)

array([ 0.60294118,  0.34285714])

In [54]:
acc_log = round(recall_score(y_true,y_pred) * 100, 2)
acc_log

34.289999999999999

In [55]:
#Applying the KKN Classifier to the data

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train) 
Y_pred = (knn.predict(X_test))
acc_knn = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_knn

57.25

In [56]:
#Recall performed on the Classifier

from sklearn.metrics import recall_score

y_true = Y_train
y_pred = knn.predict(X_test)
recall_score(y_true, y_pred, average=None)

array([ 0.42647059,  0.51428571])

In [57]:
acc_log = round(recall_score(y_true,y_pred) * 100, 2)
acc_log

51.43

In [58]:
# An 10 fold cross validation is performed to determine the ideal value for the n-neighbor.

from sklearn.model_selection import cross_val_score

#creating odd list for k for knn
myList = list(range(1,50))

#subsetting just the odd ones
neighbors = filter(lambda x: x % 2 != 0, myList)

#empty list that will hold cv scores
cv_scores = {}

#perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, Y_train, cv=10, scoring='accuracy')
    cv_scores[k] = scores.mean()

In [59]:
#The ideal n-neighbor determined

bess_k = max(cv_scores, key=lambda k: cv_scores[k])
bess_k

47

In [60]:
#The KNN Classifier performed with the ideal n-neighbor

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, Y_train) 
Y_pred = (knn.predict(X_test))
acc_knn = round(accuracy_score(Y_test,Y_pred) * 100, 2)
acc_knn

60.869999999999997

In [61]:
#Reacll performed on the classifier

from sklearn.metrics import recall_score

y_true = Y_train
y_pred = knn.predict(X_test)
recall_score(y_true, y_pred, average=None)

array([ 0.44117647,  0.54285714])

In [62]:
acc_log = round(recall_score(y_true,y_pred) * 100, 2)
acc_log

54.289999999999999