In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV

In [15]:
cvk=KFold(n_splits=5)
def read(cale):
    file = pd.read_csv(cale)
    df=file.values
    X=df[:,:-1]
    y=df[:,-1:].ravel()
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=1/3,shuffle=True,random_state=1)
    return X_trainval, X_test, y_trainval, y_test

In [42]:
def mlp_classifier(X_trainval, X_test, y_trainval, y_test):
    nn=MLPClassifier()
    nn.fit(X_trainval,y_trainval)
    pred=nn.predict(X_test)
    
    print("Accuracy:",accuracy_score(y_test,pred),"\nF1_score:",f1_score(y_test,pred, average='macro'))
    
    print("cvs train")
    scores = cross_val_score(nn, X_trainval, y_trainval, cv=cvk, scoring="accuracy")
    print(scores)
    print(scores.mean())
    
    print("cvs test")
    scores_test = cross_val_score(nn, X_test, y_test, cv=cvk, scoring="accuracy")
    print(scores_test)
    print(scores_test.mean())
    
    print("Grid Search")
    parameters = {'solver': ['lbfgs'], 
              'max_iter': [400, 500], 
              'alpha': [0.01, 0.0001, 1.0]
             }
    model = GridSearchCV(MLPClassifier(), param_grid=parameters, n_jobs = -1, cv=5, iid=False)
    model.fit(X_trainval, y_trainval)
    print("MLP best score: ", model.best_score_)
    print("MLP best parameters: ", model.best_params_)
    
    print("Random Search")
    parameters = {'solver': ['lbfgs'], 
              'max_iter': [400, 500], 
              'alpha': [0.01, 0.0001, 1.0]
             }
    model = RandomizedSearchCV(MLPClassifier(), param_distributions = parameters, cv=5)
    model.fit(X_trainval, y_trainval)
    
    print("MLP best score: ", model.best_score_)
    print("MLP best parameters: ", model.best_params_)
    
    """scores = cross_val_score(nn, X_trainval, y_trainval, cv=cvk, scoring="f1")
    print(scores)
    print(scores.mean())
    scores_test = cross_val_score(nn, X_test, y_test, cv=cvk, scoring="f1")
    print(scores_test)
    print(scores_test.mean())"""

In [50]:
def support_vector_machines(X_trainval, X_test, y_trainval, y_test):
    clf = svm.SVC(gamma='scale')
    clf.fit(X_trainval,y_trainval)
    pred2=clf.predict(X_test)
    print("Accuracy:",accuracy_score(y_test,pred2),"\nF1_score:",f1_score(y_test,pred2,average='macro'))
    
    print("cvs train")
    scores = cross_val_score(clf, X_trainval, y_trainval, cv=cvk, scoring="accuracy")
    print(scores)
    print(scores.mean())
    print("cvs test")
    scores_test = cross_val_score(clf, X_test, y_test, cv=cvk, scoring="accuracy")
    print(scores_test)
    print(scores_test.mean())
    
    print("Grid Search")
    parameters = {
            'C':[1,5, 10],
            'cache_size':[200, 1000],
            'max_iter':[100, 500, 1000]
    }
    model = GridSearchCV(svm.SVC(gamma='scale'), param_grid=parameters, n_jobs=-1, cv=5, iid=False)
    model.fit(X_trainval, y_trainval)
    print("SVC best score: ", model.best_score_)
    print("SVC best parameters",model.best_params_)
    
    print("Random search")
    parameters = {
            'C':[1,5, 10],
            'cache_size':[200, 1000],
            'max_iter':[100, 500, 1000]
    }
    model = RandomizedSearchCV(svm.SVC(gamma='scale'), param_distributions=parameters, cv=5)
    model.fit(X_trainval, y_trainval)
    print("SVC best score: ", model.best_score_)
    print("SVC best parameters",model.best_params_)
    
    '''scores = cross_val_score(clf, X_trainval, y_trainval, cv=cvk, scoring="f1")
    print(scores)
    print(scores.mean())
    scores_test = cross_val_score(clf, X_test, y_test, cv=cvk, scoring="f1")
    print(scores_test)
    print(scores_test.mean())'''

In [60]:
def decision_tree(X_trainval, X_test, y_trainval, y_test):
    dtc = tree.DecisionTreeClassifier()
    dtc.fit(X_trainval,y_trainval)
    pred3=dtc.predict(X_test)
    print("Accuracy:",accuracy_score(y_test,pred3),"\nF1_score:",f1_score(y_test,pred3,average='macro'))
    
    print("cvs train")
    scores = cross_val_score(dtc, X_trainval, y_trainval, cv=cvk, scoring="accuracy")
    print(scores)
    print(scores.mean())
    print("cvs test")
    scores_test = cross_val_score(dtc, X_test, y_test, cv=cvk, scoring="accuracy")
    print(scores_test)
    print(scores_test.mean())
    
    ''' scores = cross_val_score(dtc, X_trainval, y_trainval, cv=cvk, scoring="f1")
    print(scores)
    print(scores.mean())
    scores_test = cross_val_score(dtc, X_test, y_test, cv=cvk, scoring="f1")
    print(scores_test)
    print(scores_test.mean())'''
    
    print("Grid Search")
    parameters = {
        'min_samples_split':[2,5,8, 10],
        'splitter':['best', 'random'],
        'max_depth':[1, 10, 20, 30]
    }
    model = GridSearchCV(dtc, param_grid=parameters, n_jobs=-1, cv=5, iid=False)
    model.fit(X_trainval, y_trainval)
    print("Decision tree best score: " ,model.best_score_)
    print("Decision tree best parameters: ", model.best_params_)
    
    print("Random search")
    parameters = {
            'min_samples_split':[2,5,8, 10],
            'splitter':['best', 'random'],
            'max_depth':[1, 10, 20, 30]
    }
    model = RandomizedSearchCV(dtc, param_distributions=parameters, cv=5)
    model.fit(X_trainval, y_trainval)
    print("Decision tree best score: " ,model.best_score_)
    print("Decision tree best parameters: ", model.best_params_)

In [76]:
def random_forest_classifier(X_trainval, X_test, y_trainval, y_test):
    rfc = RandomForestClassifier(n_estimators=100)
    rfc = rfc.fit(X_trainval, y_trainval)
    pred_rfc=rfc.predict(X_test)
    print("Accuracy:",accuracy_score(y_test,pred_rfc),"\nF1_score:",f1_score(y_test,pred_rfc,average='macro'))
    
    print("cvs train")
    scores = cross_val_score(rfc, X_trainval, y_trainval, cv=cvk, scoring="accuracy")
    print(scores)
    print(scores.mean())
    print("cvs test")
    scores_test = cross_val_score(rfc, X_test, y_test, cv=cvk, scoring="accuracy")
    print(scores_test)
    print(scores_test.mean())
    
    print("Grid search")
    parameters = {
    'n_estimators'      : [320,330,340],
    'max_depth'         : [8, 9, 10, 11, 12],
    'random_state'      : [0]
    }
    model = GridSearchCV(rfc, param_grid=parameters, n_jobs=-1, cv=5, iid=False)
    model.fit(X_trainval, y_trainval)
    print("Random Forest score: " ,model.best_score_)
    print("Random Forest best parameters: ", model.best_params_)
    
    print("Random search")
    parameters = {
    'n_estimators'      : [320,330,340],
    'max_depth'         : [8, 9, 10, 11, 12],
    'random_state'      : [0]
    }
    model = RandomizedSearchCV(rfc,param_distributions=parameters, cv=5)
    model.fit(X_trainval, y_trainval)
    print("Random Forest score: " ,model.best_score_)
    print("Random Forest best parameters: ", model.best_params_)

    
    ''' scores = cross_val_score(rfc, X_trainval, y_trainval, cv=cvk, scoring="f1")
    print(scores)
    print(scores.mean())
    scores_test = cross_val_score(rfc, X_test, y_test, cv=cvk, scoring="f1")
    print(scores_test)
    print(scores_test.mean())'''

In [58]:
def kneighbors_classifier(X_trainval, X_test, y_trainval, y_test):
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X_trainval, y_trainval)
    pred_neigh=neigh.predict(X_test)
    print("Accuracy:",accuracy_score(y_test,pred_neigh),"\nF1_score:",f1_score(y_test,pred_neigh,average='macro'))
    
    print("cvs train:")
    scores = cross_val_score(neigh, X_trainval, y_trainval, cv=cvk, scoring="accuracy")
    print(scores)
    print(scores.mean())
    print("cvs test:")
    scores_test = cross_val_score(neigh, X_test, y_test, cv=cvk, scoring="accuracy")
    print(scores_test)
    print(scores_test.mean())
    
    print("Grid Search")
    parameters = {'n_neighbors':[4,5,7],
              'leaf_size':[1,3,5],
              'algorithm':['auto', 'kd_tree']
             }

    model = GridSearchCV(KNeighborsClassifier(), param_grid=parameters, cv=5, n_jobs= -1, iid=False)
    model.fit(X_trainval, y_trainval)
    print("KNN best score: ", model.best_score_)
    print("KNN best parameters: ", model.best_params_)
    
    print("Random Search")
    parameters = {'n_neighbors':[4,5,7],
              'leaf_size':[1,3,5],
              'algorithm':['auto', 'kd_tree']
             }

    model = RandomizedSearchCV(KNeighborsClassifier(), param_distributions = parameters, cv=5)
    model.fit(X_trainval, y_trainval)
    print("KNN best score: ", model.best_score_)
    print("KNN best parameters: ", model.best_params_)
    
    '''  scores = cross_val_score(neigh, X_trainval, y_trainval, cv=cvk, scoring="f1")
    print(scores)
    print(scores.mean())
    scores_test = cross_val_score(neigh, X_test, y_test, cv=cvk, scoring="f1")
    print(scores_test)
    print(scores_test.mean())'''

Spambase

In [32]:
X_trainval, X_test, y_trainval, y_test = read('spambase.data')

In [35]:
mlp_classifier(X_trainval, X_test, y_trainval, y_test)

Accuracy: 0.9380704041720991 
F1_score: 0.9356585385883202
cvs train
[0.93322476 0.92659054 0.87438825 0.89396411 0.94453507]
0.9145405465723654
cvs test
[0.93811075 0.91856678 0.90553746 0.92508143 0.91176471]
0.9198122245640927
Grid Search
MLP best score:  0.9129186093547081
MLP best parameters:  {'alpha': 0.0001, 'max_iter': 500, 'solver': 'lbfgs'}
Random Search
MLP best score:  0.9099804305283757
MLP best parameters:  {'solver': 'lbfgs', 'max_iter': 500, 'alpha': 0.0001}


In [49]:
support_vector_machines(X_trainval, X_test, y_trainval, y_test)

Accuracy: 0.7698826597131682 
F1_score: 0.7513302519522684
[0.75407166 0.75856444 0.76998369 0.76998369 0.75693312]
0.7619073175656647
[0.76547231 0.77850163 0.71661238 0.73615635 0.66993464]
0.7333354623065296
Grid Search
SVC best score:  0.8545442932873476
SVC best parameters {'C': 10, 'cache_size': 200, 'max_iter': 1000}
Random search
SVC best score:  0.8545335942596216
SVC best parameters {'max_iter': 1000, 'cache_size': 1000, 'C': 10}


In [61]:
decision_tree(X_trainval, X_test, y_trainval, y_test)

Accuracy: 0.9256844850065189 
F1_score: 0.9225627155390004
cvs train
[0.90228013 0.92006525 0.89233279 0.89885808 0.91517129]
0.9057415072984363
cvs test
[0.88925081 0.89250814 0.87947883 0.8990228  0.89869281]
0.8917906793553468
Grid Search
Decision tree best score:  0.9148740712354432
Decision tree best parameters:  {'max_depth': 10, 'min_samples_split': 2, 'splitter': 'best'}
Random search
Decision tree best score:  0.910958904109589
Decision tree best parameters:  {'splitter': 'best', 'min_samples_split': 2, 'max_depth': 20}


In [77]:
random_forest_classifier(X_trainval, X_test, y_trainval, y_test)

Accuracy: 0.9550195567144719 
F1_score: 0.9528507760872906
cvs train
[0.94299674 0.94616639 0.93964111 0.95758564 0.95758564]
0.9487951070986392
cvs test
[0.93811075 0.95114007 0.91205212 0.95114007 0.94444444]
0.9393774882374231
Grid search
Random Forest score:  0.9442336760980179
Random Forest best parameters:  {'max_depth': 12, 'n_estimators': 340, 'random_state': 0}
Random search
Random Forest score:  0.9442270058708415
Random Forest best parameters:  {'random_state': 0, 'n_estimators': 340, 'max_depth': 12}


In [63]:
kneighbors_classifier(X_trainval, X_test, y_trainval, y_test)

Accuracy: 0.8089960886571056 
F1_score: 0.7995501873773284
cvs train:
[0.78664495 0.7634584  0.78466558 0.79445351 0.78466558]
0.7827776036048482
cvs test:
[0.7752443  0.73941368 0.79478827 0.80456026 0.74836601]
0.7724745055459752
Grid Search
KNN best score:  0.7834397819931447
KNN best parameters:  {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 5}
Random Search
KNN best score:  0.7834311806914547
KNN best parameters:  {'n_neighbors': 5, 'leaf_size': 3, 'algorithm': 'auto'}


Handwriting

In [78]:
data=pd.read_csv("semeion.data", sep=" ")
y_set = data.iloc[:, 256:266].values
x_set = data.iloc[:, :256].values
y_set = np.array([np.argmax(y_set[x]) for x in range(0,y_set.shape[0])])
X_train, X_test1, y_train, y_test1 = train_test_split(x_set, y_set, test_size=1/3,shuffle=True,random_state=1)

In [65]:
mlp_classifier(X_train, X_test1, y_train, y_test1)

Accuracy: 0.9152542372881356 
F1_score: 0.9161710399602336
cvs train
[0.9342723  0.91981132 0.90566038 0.89150943 0.94339623]
0.9189299317920099
cvs test
[0.90654206 0.90566038 0.87735849 0.88679245 0.91509434]
0.898289543290425
Grid Search
MLP best score:  0.9283172014700204
MLP best parameters:  {'alpha': 1.0, 'max_iter': 400, 'solver': 'lbfgs'}
Random Search
MLP best score:  0.9255419415645617
MLP best parameters:  {'solver': 'lbfgs', 'max_iter': 400, 'alpha': 1.0}


In [66]:
support_vector_machines(X_train, X_test1, y_train, y_test1)

Accuracy: 0.9378531073446328 
F1_score: 0.9393595225732776
cvs train
[0.94835681 0.93396226 0.93867925 0.91509434 0.91981132]
0.9311807954646115
cvs test
[0.85981308 0.93396226 0.90566038 0.89622642 0.85849057]
0.8908305413507318
Grid Search
SVC best score:  0.9404719375699786
SVC best parameters {'C': 10, 'cache_size': 200, 'max_iter': 100}
Random search
SVC best score:  0.94062205466541
SVC best parameters {'max_iter': 500, 'cache_size': 1000, 'C': 10}


In [67]:
decision_tree(X_train, X_test1, y_train, y_test1)

Accuracy: 0.7382297551789078 
F1_score: 0.7336210936393488
cvs train
[0.70892019 0.73113208 0.69339623 0.70283019 0.72641509]
0.7125387545398174
cvs test
[0.64485981 0.69811321 0.6509434  0.67924528 0.6509434 ]
0.6648210192205959
Grid Search
Decision tree best score:  0.7285142205081354
Decision tree best parameters:  {'max_depth': 10, 'min_samples_split': 2, 'splitter': 'random'}
Random search
Decision tree best score:  0.7238454288407163
Decision tree best parameters:  {'splitter': 'random', 'min_samples_split': 5, 'max_depth': 10}


In [79]:
random_forest_classifier(X_train, X_test1, y_train, y_test1)

Accuracy: 0.9322033898305084 
F1_score: 0.9338893996055025
cvs train
[0.92488263 0.92924528 0.9245283  0.9245283  0.91509434]
0.9236557711046152
cvs test
[0.87850467 0.91509434 0.89622642 0.90566038 0.83018868]
0.8851348968435901
Grid search
Random Forest score:  0.9321609106280224
Random Forest best parameters:  {'max_depth': 12, 'n_estimators': 320, 'random_state': 0}
Random search
Random Forest score:  0.9321394910461829
Random Forest best parameters:  {'random_state': 0, 'n_estimators': 320, 'max_depth': 12}


In [69]:
kneighbors_classifier(X_train, X_test1, y_train, y_test1)

Accuracy: 0.8907721280602636 
F1_score: 0.8908926869848213
cvs train:
[0.88262911 0.88679245 0.88207547 0.88207547 0.91509434]
0.8897333687660554
cvs test:
[0.81308411 0.89622642 0.83018868 0.83018868 0.80188679]
0.8343149356374537
Grid Search
KNN best score:  0.8926695531857464
KNN best parameters:  {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 5}
Random Search
KNN best score:  0.8925541941564562
KNN best parameters:  {'n_neighbors': 5, 'leaf_size': 1, 'algorithm': 'auto'}


Smartphone

In [80]:
x_train_smartphone = pd.read_csv('final_X_train.txt', header = None)
x_test_smartphone = pd.read_csv('final_X_test.txt', header = None)
y_train_smartphone = pd.read_csv('final_y_train.txt', header = None)
y_test_smartphone = pd.read_csv('final_y_test.txt', header = None)

y_train_smartphone = np.ravel(y_train_smartphone)
y_test_smartphone = np.ravel(y_test_smartphone)


In [None]:
mlp_classifier(x_train_smartphone, x_test_smartphone, y_train_smartphone, y_test_smartphone)

In [None]:
support_vector_machines(x_train_smartphone, x_test_smartphone, y_train_smartphone, y_test_smartphone)

In [None]:
decision_tree(x_train_smartphone, x_test_smartphone, y_train_smartphone, y_test_smartphone)

In [None]:
random_forest_classifier(x_train_smartphone, x_test_smartphone, y_train_smartphone, y_test_smartphone)

In [None]:
kneighbors_classifier(x_train_smartphone, x_test_smartphone, y_train_smartphone, y_test_smartphone)

WIRELESS

In [7]:
data_set = pd.read_csv('wifi_localization.txt', delimiter = '\t', header = None)
data_set=data_set.values
X=data_set[:,:-1]
y=data_set[:,-1:].ravel()
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=1/3,shuffle=True,random_state=1)

In [17]:
mlp_classifier(X_trainval, X_test, y_trainval, y_test)

Accuracy: 0.974512743628186 
F1_score: 0.974490236855597
[0.97752809 0.98876404 0.95505618 0.98120301 0.9887218 ]
0.9782546253273633
[0.97014925 0.96268657 0.98496241 0.97744361 0.96992481]
0.9730333295926383


In [None]:
support_vector_machines(X_trainval, X_test, y_trainval, y_test)

In [None]:
random_forest_classifier(X_trainval, X_test, y_trainval, y_test)

In [None]:
decision_tree(X_trainval, X_test, y_trainval, y_test)

In [None]:
kneighbors_classifier(X_trainval, X_test, y_trainval, y_test)

WINE

In [16]:
wine = pd.read_csv("winequality-white.csv", delimiter=";")
wine.head()
y_set = wine.iloc[:, 11:]
x_set = wine.iloc[:, :11]
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(x_set, y_set, test_size = 1/3)

y_train_wine = np.ravel(y_train_wine)
y_test_wine = np.ravel(y_test_wine)

In [None]:
mlp_classifier(X_train_wine, X_test_wine, y_train_wine, y_test_wine)

In [None]:
support_vector_machines(X_train_wine, X_test_wine, y_train_wine, y_test_wine)

In [None]:
random_forest_classifier(X_train_wine, X_test_wine, y_train_wine, y_test_wine)

In [None]:
decision_tree(X_train_wine, X_test_wine, y_train_wine, y_test_wine)

In [None]:
kneighbors_classifier(X_train_wine, X_test_wine, y_train_wine, y_test_wine)