In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint
from skater.model import InMemoryModel
from skater.core.explanations import Interpretation

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.cross_validation import KFold, cross_val_score



In [5]:
columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
#adding column names to make the data more readable

In [23]:
df = pd.read_csv('data//data.csv', sep=',', names=columns, header=None, na_values='?') #settingthe NA values as '?'
#df = pd.read_csv('data.csv', sep=',', header=None, na_values='?')
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [24]:
df = df.sample(frac=1) #shuffling the dataset
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
86,b,,0.375,u,g,d,v,0.875,t,f,0,t,s,928.0,0,-
15,b,36.67,4.415,y,p,k,v,0.25,t,t,10,t,g,320.0,0,+
25,a,15.83,0.585,u,g,c,h,1.5,t,t,2,f,g,100.0,0,+
316,b,21.17,0.25,y,p,c,h,0.25,f,f,0,f,g,280.0,204,-
227,a,22.5,8.46,y,p,x,v,2.46,f,f,0,f,g,164.0,0,+


In [25]:
df.dtypes
df.info() #trying to determine which columns have missing data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 86 to 369
Data columns (total 16 columns):
A1     678 non-null object
A2     678 non-null float64
A3     690 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     681 non-null object
A7     681 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    677 non-null float64
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 91.6+ KB


In [26]:
numerics = df.dtypes[df.dtypes != "object"].index
#fill the missing numerics with the mean value in all columns
for num in numerics:
     df[num] = df[num].fillna(df[num].mean())

#fill the missing categorical value with the mode value
categorical = df.dtypes[df.dtypes == "object"].index
for cat in categorical:
     df[cat] = df[cat].fillna(df[cat].mode()[0])    

In [11]:
#encode all categorical data using label encoding
categorical
for ct in categorical:
    encoder = LabelEncoder()
    encoder.fit(df[ct])
    df[ct] = encoder.transform(df[ct])

In [12]:
df_train, df_test = train_test_split(df, test_size = 0.4) #split the dataset into training and test sets
df_train_copy = df_train
df_test_copy = df_test

In [13]:
X_train = df_train_copy.drop("A16", axis=1)
Y_train = df_train_copy["A16"]
X_test  = df_test_copy.drop("A16", axis=1)
Y_test = df_test_copy["A16"]
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((414, 15), (414,), (276, 15), (276,))

In [14]:
X_train_copy = X_train
X_test_copy = X_test
Y_train_copy = Y_train
Y_test_copy = Y_test

In [15]:
scaler = StandardScaler()
scaler.fit(X_train_copy)
X_train_copy = scaler.transform(X_train_copy)
pca = PCA(0.95)
pca.fit(X_train_copy)
pca.n_components_
X_train_copy = pca.transform(X_train_copy)
X_train_copy = pd.DataFrame(X_train_copy)
X_train_copy.shape

(414, 13)

In [16]:
scaler2 = StandardScaler()
scaler2.fit(X_test_copy)
X_test_copy = scaler2.transform(X_test_copy)
pca = PCA(0.95)
pca.fit(X_test_copy)
pca.n_components_
X_test_copy = pca.transform(X_test_copy)
X_test_copy = pd.DataFrame(X_test_copy)
X_test_copy.shape

(276, 13)

In [17]:
#Random Forest Classifier: Part 1 (using non-scaled data)
rf = RandomForestClassifier(n_estimators=10, criterion='gini', 
                                       max_depth=None, min_samples_split=2, 
                                       min_samples_leaf=1, 
                                       min_weight_fraction_leaf=0.0, 
                                       max_features='auto', 
                                       max_leaf_nodes=None, 
                                       min_impurity_decrease=0.0, 
                                       min_impurity_split=None, 
                                       bootstrap=True, oob_score=False, 
                                       n_jobs=1, random_state=None, 
                                       verbose=0, warm_start=False, 
                                       class_weight=None)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)
rf.score(X_train, Y_train)
print('Recall Score:', recall_score(Y_test,Y_pred)*100)

Recall Score: 82.46753246753246


In [18]:
#RandomizedSearchCV: Part 2
param_dist = {"n_estimators":randint(1,20),
              "max_depth": randint(1, 10),
              "max_features": randint(1, 10),
              "min_samples_split": randint(2,10),
              "min_samples_leaf": randint(1, 10),
              "bootstrap": ["True", "False"]}


tree = RandomForestClassifier()
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)
tree_cv.fit(X_train,Y_train)

print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_  * 100))

Tuned Decision Tree Parameters: {'bootstrap': 'True', 'max_depth': 8, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 14}
Best score is 86.47342995169082


The above tuning shows that the printed parameter and corresponding variables will yield a higher recall score.

In [19]:
#Question 3
#Part 1: using the scaled attributes
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_copy, Y_train_copy)
Y_pred = knn.predict(X_test_copy)
# acc_knn = round(recall_score(Y_test,Y_pred2) * 100, 2)
# acc_knn
print('Recall Score:', recall_score(Y_test_copy,Y_pred)*100)

Recall Score: 74.02597402597402


In [20]:
#Part 2
# creating odd list of K for KNN
neighbors = filter(lambda x: x % 2 != 0, list(range(1,20)))
# empty list that will hold cv scores
cv_scores = {}
neighbors


# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_copy, Y_train_copy, cv=10)
    cv_scores[k] = scores.mean()
    print(k, cv_scores[k])

1 0.770473286875726
3 0.8431939605110337
5 0.8624157955865274
7 0.8527758420441348
9 0.860095818815331
11 0.8671747967479675
13 0.8598606271777003
15 0.8455168408826946
17 0.8381968641114984
19 0.8477787456445993


In [21]:
best_val_k = max(cv_scores, key=lambda k: cv_scores[k])
best_val_k

11

In [22]:
knn = KNeighborsClassifier(n_neighbors = best_val_k)
knn.fit(X_train_copy, Y_train_copy)
Y_pred = knn.predict(X_test_copy)
# acc_knn = round(recall_score(Y_test,Y_pred2) * 100, 2)
# acc_knn
print('Recall Score:', recall_score(Y_test_copy,Y_pred)*100)

Recall Score: 81.81818181818183
