In [1]:
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from statistics import mode

In [2]:
import warnings
warnings.filterwarnings("ignore")

# COV_TYPE

In [2]:
data = pd.read_csv('covtype.data',header=-1)
data = shuffle(data)
print(data.shape)

(581012, 55)


In [3]:
X = data.iloc[:6250,:54].values
Y = data.iloc[:6250,54].values
boolean = mode(Y)==Y
for i in range(boolean.size):
    if boolean[i] == True:
        Y[i]= 1
    else:
        Y[i]=-1
print(X.shape)

(6250, 54)


## 80/20 Split

In [4]:
X_train = X[:int(len(X)*.8),:]
Y_train = Y[:int(len(Y)*.8)]
X_test = X[int(len(X)*.8):,:]
Y_test = Y[int(len(Y)*.8):]

### SVM

In [7]:
model = svm.LinearSVC()
C_list = [10**-5,10**-4,10**-3,10**-2,10**-1]
clf = GridSearchCV(model,{'C':C_list},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.62184898 0.52790328 0.51189909 0.51520281 0.51370033] [0.6176 0.5304 0.5086 0.518  0.515 ]


In [12]:
clf = svm.LinearSVC(C=10**-5)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
svm_test_acc_80 = (Y_test.size - errors)/Y_test.size
print(svm_test_acc_80)

0.624


### KNN

In [13]:
model = KNeighborsClassifier()
k = np.arange(1,27)
clf = GridSearchCV(model,{'n_neighbors':k},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[1.         0.89329984 0.89030021 0.85440001 0.84830006 0.82849985
 0.82359985 0.80994944 0.81034968 0.8007492  0.79860008 0.79189993
 0.79044973 0.78734963 0.78249975 0.77834958 0.77520026 0.77224945
 0.77035    0.76675003 0.76305003 0.76240014 0.75859991 0.7576999
 0.75684976 0.7554993 ] [0.7764 0.7452 0.7662 0.7484 0.7588 0.7512 0.7504 0.7464 0.7552 0.746
 0.7522 0.7462 0.7494 0.741  0.7388 0.7368 0.7402 0.7376 0.7342 0.7344
 0.7328 0.7326 0.7298 0.7308 0.7302 0.7298]


In [16]:
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
knn_test_acc_80 = (Y_test.size - errors)/Y_test.size
print(knn_test_acc_80)

0.784


### Decision Trees

In [40]:
model = tree.DecisionTreeClassifier()
D = [5,10,15,20,25]
clf = GridSearchCV(model,{'max_depth':D},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.77656 0.87472 0.9468  0.98592 0.99752] [0.73664 0.75552 0.74368 0.72896 0.72448]


In [20]:
clf = tree.DecisionTreeClassifier(max_depth=10)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
dt_test_acc_80 = (Y_test.size - errors)/Y_test.size
print(dt_test_acc_80)

0.7752


### Random Forest

In [22]:
model = RandomForestClassifier(n_estimators=1024)
clf = GridSearchCV(model,{'min_samples_split':[2,4,6,8,12,16,20]},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[1.         0.99895002 0.99015017 0.97695017 0.95069982 0.9297997
 0.9111995 ] [0.8132 0.81   0.8082 0.8082 0.8032 0.8036 0.7982]


In [25]:
clf =  RandomForestClassifier(n_estimators=1024, min_samples_split=2)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
rf_test_acc_80 = (Y_test.size - errors)/Y_test.size
print(rf_test_acc_80)

0.816


### Neural Network

In [5]:
model = MLPClassifier(solver='sgd')
param = {'hidden_layer_sizes':[(1,),(10,),(100,)],'momentum':[0,0.2,0.5,0.9]}
clf = GridSearchCV(model,param,cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)



[0.49690313 0.49689687 0.49684844 0.50309687 0.4971     0.50024761
 0.50944841 0.51004861 0.55450399 0.52730499 0.52740636 0.50765019] [0.4968 0.4968 0.497  0.5032 0.4968 0.5048 0.5094 0.5108 0.553  0.526
 0.5272 0.508 ]




In [29]:
clf = MLPClassifier(solver='sgd',hidden_layer_sizes=(100,),momentum=0)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
mlp_test_acc_80 = (Y_test.size - errors)/Y_test.size
print(mlp_test_acc_80)

0.56


## 50/50 Split

In [30]:
# Split 50/50
X_train = X[:int(len(X)*.5),:]
Y_train = Y[:int(len(Y)*.5)]
X_test = X[int(len(X)*.5):,:]
Y_test = Y[int(len(Y)*.5):]

### SVM

In [31]:
model = svm.LinearSVC()
C_list = [10**-5,10**-4,10**-3,10**-2,10**-1]
clf = GridSearchCV(model,{'C':C_list},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.60568 0.52536 0.5172  0.49712 0.53664] [0.59712 0.53248 0.51232 0.49824 0.53088]


In [36]:
clf = svm.LinearSVC(C=10**-5)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
svm_test_acc_50 = (Y_test.size - errors)/Y_test.size
print(svm_test_acc_50)

0.58848


### KNN

In [37]:
model = KNeighborsClassifier()
k = np.arange(1,27)
clf = GridSearchCV(model,{'n_neighbors':k},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[1.      0.87696 0.87264 0.83512 0.83472 0.81456 0.81672 0.79736 0.79232
 0.78552 0.7796  0.77352 0.77008 0.76448 0.76328 0.75816 0.7556  0.75488
 0.75184 0.74968 0.74824 0.7456  0.7408  0.73928 0.73696 0.73424] [0.75712 0.7152  0.73952 0.72736 0.7392  0.73024 0.73984 0.7328  0.73536
 0.72416 0.72992 0.72224 0.71936 0.72128 0.71648 0.72096 0.72128 0.7168
 0.7152  0.70976 0.70688 0.7056  0.70048 0.69504 0.696   0.68992]


In [39]:
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
knn_test_acc_50 = (Y_test.size - errors)/Y_test.size
print(knn_test_acc_50)

0.76064


### Decision Trees

In [41]:
model = tree.DecisionTreeClassifier()
D = [5,10,15,20,25]
clf = GridSearchCV(model,{'max_depth':D},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.77656 0.87456 0.94712 0.98536 0.99784] [0.73728 0.75328 0.73888 0.72736 0.72096]


In [44]:
clf = tree.DecisionTreeClassifier(max_depth=10)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
dt_test_acc_50 = (Y_test.size - errors)/Y_test.size
print(dt_test_acc_50)

0.75808


### Random Forest

In [45]:
model = RandomForestClassifier(n_estimators=1024)
clf = GridSearchCV(model,{'min_samples_split':[2,4,6,8,12,16,20]},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[1.      0.99848 0.98768 0.9772  0.95408 0.9308  0.91192] [0.80672 0.80576 0.80288 0.8     0.79968 0.79712 0.79392]


In [48]:
clf =  RandomForestClassifier(n_estimators=1024, min_samples_split=2)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
rf_test_acc_50 = (Y_test.size - errors)/Y_test.size
print(rf_test_acc_50)

0.80928


### Neural Network

In [49]:
model = MLPClassifier(solver='sgd')
param = {'hidden_layer_sizes':[(1,),(10,),(100,)],'momentum':[0,0.2,0.5,0.9]}
clf = GridSearchCV(model,param,cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.50816 0.5136  0.50272 0.4864  0.50504 0.4984  0.50816 0.50272 0.54712
 0.5336  0.5376  0.52128] [0.50816 0.5136  0.50272 0.4864  0.5072  0.49824 0.50816 0.50272 0.5376
 0.52672 0.53472 0.52032]


In [74]:
clf = MLPClassifier(solver='sgd',hidden_layer_sizes=(100,),momentum=0)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
mlp_test_acc_50 = (Y_test.size - errors)/Y_test.size
print(mlp_test_acc_50)

0.5192


## 20/80 Split

In [53]:
# Split 20/80
X_train = X[:int(len(X)*.2),:]
Y_train = Y[:int(len(Y)*.2)]
X_test = X[int(len(X)*.2):,:]
Y_test = Y[int(len(Y)*.2):]

### SVM

In [54]:
model = svm.LinearSVC()
C_list = [10**-5,10**-4,10**-3,10**-2,10**-1]
clf = GridSearchCV(model,{'C':C_list},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.5912 0.521  0.5184 0.5214 0.5286] [0.5824 0.5176 0.5088 0.5224 0.5336]


In [57]:
clf = svm.LinearSVC(C=10**-5)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
svm_test_acc_20 = (Y_test.size - errors)/Y_test.size
print(svm_test_acc_20)

0.6044


### KNN

In [58]:
model = KNeighborsClassifier()
k = np.arange(1,27)
clf = GridSearchCV(model,{'n_neighbors':k},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[1.     0.8584 0.848  0.8116 0.7998 0.7746 0.7774 0.7616 0.76   0.7468
 0.7458 0.7428 0.7362 0.7352 0.7342 0.729  0.7302 0.7248 0.7242 0.7138
 0.7138 0.7086 0.7084 0.7034 0.703  0.6978] [0.7008 0.676  0.6784 0.6728 0.6832 0.6776 0.6696 0.6672 0.6792 0.6784
 0.688  0.6776 0.6712 0.6744 0.6816 0.6752 0.6744 0.6688 0.6688 0.6736
 0.6768 0.6832 0.6824 0.6776 0.6832 0.6744]


In [60]:
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
knn_test_acc_20 = (Y_test.size - errors)/Y_test.size
print(knn_test_acc_20)

0.7098


### Decision Trees

In [61]:
model = tree.DecisionTreeClassifier()
D = [5,10,15,20,25]
clf = GridSearchCV(model,{'max_depth':D},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.8004 0.904  0.964  0.993  1.    ] [0.7504 0.74   0.7256 0.7168 0.712 ]


In [66]:
clf = tree.DecisionTreeClassifier(max_depth=5)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
dt_test_acc_20 = (Y_test.size - errors)/Y_test.size
print(dt_test_acc_20)

0.7474


### Random Forest

In [67]:
model = RandomForestClassifier(n_estimators=1024)
clf = GridSearchCV(model,{'min_samples_split':[2,4,6,8,12,16,20]},cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[1.     0.9974 0.9876 0.978  0.9538 0.9264 0.9098] [0.78   0.7848 0.7776 0.78   0.772  0.7744 0.7656]


In [70]:
clf =  RandomForestClassifier(n_estimators=1024, min_samples_split=2)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
rf_test_acc_20 = (Y_test.size - errors)/Y_test.size
print(rf_test_acc_20)

0.7784


### Neural Network

In [71]:
model = MLPClassifier(solver='sgd')
param = {'hidden_layer_sizes':[(1,),(10,),(100,)],'momentum':[0,0.2,0.5,0.9]}
clf = GridSearchCV(model,param,cv=5)
clf.fit(X_train, Y_train)
train_acc = clf.cv_results_['mean_train_score']
val_acc = clf.cv_results_['mean_test_score']
print(train_acc, val_acc)

[0.4934 0.5072 0.512  0.5024 0.4902 0.4934 0.5032 0.5024 0.5404 0.5232
 0.5232 0.5202] [0.492  0.5072 0.512  0.5024 0.4896 0.4928 0.504  0.5024 0.5216 0.5088
 0.5112 0.5064]


In [78]:
clf = MLPClassifier(solver='sgd',hidden_layer_sizes=(100,),momentum=0)
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)
errors = np.count_nonzero(pred-Y_test)
mlp_test_acc_20 = (Y_test.size - errors)/Y_test.size
print(mlp_test_acc_20)

0.5686
