In [4]:
#load all packages
import operator
import csv
import random
import math
import operator
import pandas as pd
import numpy as np
from itertools import chain
from operator import itemgetter
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score 
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")


In [18]:
#load datasets
def loadDataset (trainfile):
    """Read a whitespace-delimited numeric text file into a list of rows.

    Parameters
    ----------
    trainfile : str
        Path of the text file to read (decoded as ``utf-8-sig``, so a leading
        byte-order mark is tolerated).

    Returns
    -------
    list of list
        One inner list per row of the resulting table, with every field
        converted to a number by ``pd.to_numeric``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a field cannot be parsed as a number.
    """
    # The context manager closes the handle even when parsing fails below.
    with open(trainfile, "r", encoding='utf-8-sig') as fileData:
        data = [line.split() for line in fileData]

    df = pd.DataFrame(data, index=None)
    df = df.apply(pd.to_numeric)
    # Column by column: drop missing entries and pack the remaining values
    # upwards, so short/blank input lines do not leave gaps inside a column.
    df1 = df.apply(lambda x: pd.Series(x.dropna().values))

    return df1.values.tolist()

# Raw rows of the data file exactly as parsed by loadDataset.
trainingSet = loadDataset('FGLASS.DAT')
# NOTE: despite its name, this frame is built straight from the raw rows
# (default integer column labels) and is NOT normalized.
norm_train_ = pd.DataFrame(trainingSet)

# load the dataset into a DataFrame with named columns
def preprocessing(filename):
    """Load ``filename`` via ``loadDataset`` and return it as a labelled frame.

    The ten columns are named 'A' through 'I' followed by 'type'.
    """
    column_names = list('ABCDEFGHI') + ['type']
    rows = loadDataset(filename)
    return pd.DataFrame(rows, columns=column_names)

# calculate mean and standard deviation for the dataframe
def train_mean_std(data):
    """Return ``(mean, std)`` of ``data``, each computed column by column.

    Both values come straight from the frame's own ``mean()`` and ``std()``
    methods, so they are indexed by column label.
    """
    return data.mean(), data.std()

# normalize the dataset
def normalize (data,mean,std):
    """Standardize every column of ``data`` except the 'type' label column.

    Each column is shifted by ``mean`` and divided by ``std``; the 'type'
    column is then restored to its original (unscaled) values.
    """
    # Keep an untouched copy of the labels before scaling the whole frame.
    labels = data['type'].copy()
    scaled = (data - mean) / std
    scaled['type'] = labels
    return scaled

# save it to a text file for further use
def save_df (df,filename,fmt):
    """Write every column of ``df`` to ``filename`` as delimited text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose values are written, one row per line.
    filename : str
        Destination path handed to ``np.savetxt``.
    fmt : str
        Format string applied to each value, e.g. ``'%.6f'``.
    """
    # A DataFrame's values are already a 2-D array, which np.savetxt accepts
    # directly; wrapping them in np.matrix (a class NumPy no longer
    # recommends) is unnecessary.
    np.savetxt(filename, df.values, fmt)
    
# Load the labelled frame, compute per-column mean/std on that same frame,
# and standardize every column except 'type'.
preprocessed_train = preprocessing ('FGLASS.DAT')
mean,std = train_mean_std (preprocessed_train)
norm_train = normalize (preprocessed_train,mean,std)
# print (norm_train)

# Persist the standardized table as text with six decimal places per value.
save_df (norm_train,'fglass.txt','%.6f')


## kNN

In [None]:
#calculate Euclidean distances 
def euclidean_distance(test_sample, train_set, length):
    """Euclidean distance between two samples over their first ``length`` entries.

    Entries at index ``length`` and beyond are ignored in both sequences.
    """
    squared_gaps = (
        np.square(test_sample[idx] - train_set[idx]) for idx in range(length)
    )
    return np.sqrt(sum(squared_gaps))

#select neighbors based on calculation
def neighbors(training_set, test_sample, K):
    """Return the ``K`` training rows closest to ``test_sample``.

    Distances are measured over all entries of ``test_sample`` except the
    last one. Rows are returned nearest first; rows at equal distance keep
    their original relative order.
    """
    feature_count = len(test_sample) - 1
    scored = [
        (row, euclidean_distance(test_sample, row, feature_count))
        for row in training_set
    ]
    # list.sort is stable, so ties preserve training-set order.
    scored.sort(key=itemgetter(1))
    return [scored[rank][0] for rank in range(K)]

# get most frequent element from a list
def most_frquent (List):
    """Return the element that occurs most often in ``List``.

    When several elements share the highest count, the winner is whichever
    of them is met first while iterating over the set of distinct elements.
    """
    distinct = set(List)
    return max(distinct, key=lambda item: List.count(item))
    
# classify test data
def classification(neighbors):
    """Predict a class by majority vote over the given neighbor rows.

    The last entry of each row is taken as that row's vote.
    """
    votes = [row[-1] for row in neighbors]
    return most_frquent(votes)

#run KNN altogether for predictions
def kNN(training_set,test_set,K):
    """Return one predicted class per row of ``test_set``.

    For each test row the ``K`` nearest training rows are selected and their
    majority class becomes the prediction.
    """
    return [
        classification(neighbors(training_set, sample, K))
        for sample in test_set
    ]


# calculate accuracy
def kNN_accuracy(testSet, predictions):
    """Percentage of rows whose last entry equals the matching prediction."""
    hits = sum(
        1 for pos, row in enumerate(testSet) if row[-1] == predictions[pos]
    )
    return (hits / float(len(testSet))) * 100.0

# calculate accuracy for different values of K
def vary_k(trainingSet,testSet):
    """Evaluate kNN accuracy for a range of neighbor counts.

    The candidate counts are 1 through 15 plus the integer part of the
    square root of the training-set size, de-duplicated and sorted.

    Returns ``(K, accuracy)``: the list of counts tried and the matching
    list of accuracy percentages on ``testSet``.
    """
    candidates = list(range(1, 16)) + [int(np.sqrt(len(trainingSet)))]
    # np.unique both removes a duplicated sqrt(n) entry and sorts the values.
    K = np.unique(np.array(candidates)).tolist()
    accuracy = [
        kNN_accuracy(testSet, kNN(trainingSet, testSet, k)) for k in K
    ]
    return K, accuracy

# vary_k (trainingSet,testSet,1)

# flatten list
def flatten(l):
    """Flatten arbitrarily nested lists into a single flat list.

    Parameters
    ----------
    l : object
        A (possibly nested) list, or any other value.

    Returns
    -------
    list
        All non-list leaves in left-to-right order. A non-list argument is
        returned wrapped in a one-element list; empty lists (at any depth)
        contribute nothing.

    Notes
    -----
    Only exact ``list`` instances are descended into; tuples, arrays and
    list subclasses are treated as leaves. The traversal is iterative, so
    long lists do not hit the interpreter's recursion limit.
    """
    if type(l) is not list:
        return [l]
    flat = []
    # Explicit stack of iterators replaces recursion: one entry per nesting
    # level currently being walked.
    stack = [iter(l)]
    while stack:
        for item in stack[-1]:
            if type(item) is list:
                stack.append(iter(item))
                break
            flat.append(item)
        else:
            # Current level exhausted; resume the parent level.
            stack.pop()
    return flat


In [19]:
# K-fold validation on the training set
# split the training set into train and test based on given index
def indexwise_train_test_split (filename):
    """Build leave-one-group-out splits from a file of 1-based row indices.

    Each line of ``filename`` lists the whitespace-separated, 1-based indices
    of one group; they are converted to 0-based indices.

    Parameters
    ----------
    filename : str
        Path of the group file (decoded as ``utf-8-sig``).

    Returns
    -------
    list of [list of int, list of int]
        One ``[train_indices, test_indices]`` pair per line: the test part is
        that line's group, the train part is the concatenation of every
        group that is not equal to it.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a token is not an integer.
    """
    # The context manager closes the handle even if int() fails on a token.
    with open(filename, "r", encoding='utf-8-sig') as fileData:
        groups = [[int(token) - 1 for token in line.split()] for line in fileData]

    index_ = []
    for held_out in groups:
        # Groups are excluded by equality, so any group with identical
        # contents to the held-out one is left out of training as well.
        remaining = list(chain(*[group for group in groups if group != held_out]))
        index_.append([remaining, held_out])

    return index_

In [20]:
# get mean accuracy of all K-folds for different k in kNN
def cross_validation_score(train,n_folds):
    """Cross-validate kNN over the groups listed in 'FGLASS.GRP'.

    Every group in the file is held out once while ``vary_k`` is run on the
    remaining rows. The per-k statistics are then taken over the first
    ``n_folds`` folds.

    Returns ``(mean_all_k, std_acc, K, max_acc)``: mean, standard deviation
    and maximum accuracy for each neighbor count in ``K``.
    """
    X = train.iloc[:, 0:len(train.columns)].values
    fold_splits = indexwise_train_test_split('FGLASS.GRP')

    # acc[fold][pos] is the accuracy of fold `fold` for the pos-th k value.
    acc = []
    for train_index, test_index in tqdm_notebook(fold_splits):
        K, fold_acc = vary_k(X[train_index], X[test_index])
        acc.append(fold_acc)

    mean_all_k, std_acc, max_acc = [], [], []
    for pos in range(len(K)):
        scores = [acc[fold][pos] for fold in range(n_folds)]
        max_acc.append(max(scores))
        std_acc.append(np.array(scores).std())
        mean_all_k.append(sum(scores) / n_folds)

    return mean_all_k, std_acc, K, max_acc


# Run the group-file cross-validation on the standardized frame, taking the
# per-k statistics over 10 folds.
accuracy,std_acc,K,max_acc = cross_validation_score(norm_train,10)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [17]:
#plot validation accuracy with classification accuracy
def kNN_validation_plot(accuracy,std_acc,max_acc,K,sqrt_index=13):
    """Plot mean/best cross-validation accuracy against the neighbor count.

    Parameters
    ----------
    accuracy : sequence of float
        Mean accuracy for each entry of ``K``.
    std_acc : sequence of float
        Standard deviation of the accuracy for each entry of ``K`` (drawn as
        error bars).
    max_acc : sequence of float
        Best fold accuracy for each entry of ``K``.
    K : sequence of int
        Neighbor counts that were evaluated.
    sqrt_index : int or None, optional
        Position in ``K`` of the entry to highlight as "k = sqrt(n)".
        Defaults to 13, the value that used to be hard-coded. Pass ``None``
        to skip the highlight. An index outside ``K`` is skipped instead of
        raising ``IndexError``.

    NOTE(review): index 13 is the 14th entry of ``K``; when ``K`` is 1..15
    that is k = 14. Confirm this matches int(sqrt(n)) for the folds used and
    pass the right index otherwise.
    """
    plt.figure(figsize=(8,7))
    plt.plot(K, max_acc,'ro-',lw = 1,label = "best accuracy for each k")
    # Label the range from K itself so it stays correct if the grid changes.
    plt.errorbar(K, accuracy, np.array(std_acc),
                 color = 'b',marker = 'o',lw = 1,
                 label = 'cross-validation accuracy for k = [%d,%d]' % (min(K), max(K)))
    if sqrt_index is not None and -len(K) <= sqrt_index < len(K):
        plt.errorbar(K[sqrt_index], accuracy[sqrt_index],np.array(std_acc[sqrt_index]),
                     color = 'darkorange',marker = '*',markersize = 10, lw = 1,
                     label = 'cross-validation accuracy for k = sqrt(n)')
    plt.xlabel('k values')
    plt.ylabel('Accuracy')
    plt.legend (loc = 'best')
    plt.title('Performance Curve for Different k values')
    plt.show()
    
# kNN_validation_plot(accuracy,std_acc,max_acc,K)

In [15]:
# # check kNN result with scikit learn
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score
# def KNN_scikit(train,n_fold):
#     col = len (train.columns)
#     X = train.iloc[:,0:col].values

#     index = indexwise_train_test_split ('FGLASS.GRP')
#     mean_k = []
#     K = [x for x in range (1,15)]
#     K.append(int(np.sqrt(len(train))))

#     for k in range (1,len(K)+1):
#         accuracy = 0
#         for train_index, test_index in index:

#             x = X[train_index]
#             X_train  = x[:,:col-1]
#             y_train = x[:,col-1]

#             x = X[test_index]
#             X_test  = x[:,:col-1]
#             y_test = x[:,col-1]

#             clf = KNeighborsClassifier(n_neighbors = k, weights='uniform', algorithm='auto')
#             clf.fit(X_train, y_train) 

#             y_pred = clf.predict(X_test)
#             accuracy += accuracy_score(y_test,y_pred)*100
            
#         avg = accuracy/n_fold
#         print ("Accuracy is ",avg ,"%")
#         mean_k.append(avg)
#     return mean_k

# KNN_scikit(norm_train,10)

## decision tree

In [16]:
# implement DT using sk-learn
def decision_tree(train):
    """Cross-validate a scikit-learn decision tree over the 'FGLASS.GRP' groups.

    Parameters
    ----------
    train : pandas.DataFrame
        Table whose last column is the class label and whose other columns
        are the features.

    Returns
    -------
    (float, list of float)
        Mean accuracy over all folds, and the per-fold accuracies, both in
        percent.
    """
    col = len (train.columns)
    X = train.iloc[:,0:col].values
    accuracy = []
    index = indexwise_train_test_split ('FGLASS.GRP')
    for train_index, test_index in index:

        # Distinct local names so the `train` argument is not shadowed.
        train_rows = X[train_index]
        X_train  = train_rows[:,:col-1]
        y_train = train_rows[:,col-1]

        test_rows = X[test_index]
        X_test  = test_rows[:,:col-1]
        y_test = test_rows[:,col-1]

        # `min_impurity_split` and `presort` are no longer passed: both were
        # removed from DecisionTreeClassifier in later scikit-learn releases
        # and passing them raises TypeError there. `min_impurity_split` was
        # only ever set to None here.
        clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best',
                                          max_depth=None, min_samples_split=8,
                                          min_samples_leaf=4, min_weight_fraction_leaf=0.0,
                                          max_features=None, random_state=0,
                                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                                          class_weight=None)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test,y_pred)*100
        accuracy.append(acc)
    return (sum(accuracy)/len(accuracy)),accuracy

# avg,acc = decision_tree(norm_train_)
# print ('Validation Accuracy: ',avg)

#better accuracy for min_samples_leaf = 2 than 1

In [17]:
# plot accuracy
def DT_accuracy_plot(accuracy,folds):
    """Draw the per-fold accuracy as a connected blue line with markers."""
    fig, ax = plt.subplots(figsize=(8, 7))
    ax.plot(folds, accuracy, 'bo-', lw=1)
    ax.set_xlabel('Folds')
    ax.set_ylabel('Accuracy')
    ax.set_title('Performance Curve for Different Folds')
    plt.show()

# X-axis labels 'Fold1' .. 'Fold10' for the per-fold accuracy plot.
folds = ['Fold' + str(number) for number in range(1, 11)]

# DT_accuracy_plot(acc,folds)

## 3 layer NN

In [73]:
def indexwise_train_test_val_split (filename,n):
    """Split the groups of an index file into train / test / validation parts.

    Each line of ``filename`` lists the whitespace-separated, 1-based row
    indices of one group; they are converted to 0-based indices.

    Parameters
    ----------
    filename : str
        Path of the group file (decoded as ``utf-8-sig``).
    n : int
        Position of the group used as the test part. The validation part is
        the group at position ``n - 1``, so ``n = 0`` wraps around to the
        last group.

    Returns
    -------
    list
        ``[train_indices, test_indices, val_indices]`` where the train part
        is the concatenation of every group equal to neither the test nor
        the validation group.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a token is not an integer.
    IndexError
        If ``n`` is outside the range of groups.
    """
    # The context manager closes the handle even if int() fails on a token.
    with open(filename, "r", encoding='utf-8-sig') as fileData:
        groups = [[int(token) - 1 for token in line.split()] for line in fileData]

    test_group = groups[n]
    val_group = groups[n-1]
    held_out = [test_group, val_group]
    train = list(chain(*[group for group in groups if group not in held_out]))
    return [train, test_group, val_group]



In [None]:
# implement MLP
def NN(train,solver,activation_function):
    """Evaluate single-hidden-layer MLPs with 2..15 hidden nodes over 10 folds.

    For every hidden-layer size an ``MLPClassifier`` is fitted once per fold
    of 'FGLASS.GRP'. Accuracy is averaged over the folds, and the loss curve
    of the first fold's classifier is added to one shared figure.

    Parameters
    ----------
    train : pandas.DataFrame
        Table whose last column is the class label and whose other columns
        are the features.
    solver : str
        Passed to ``MLPClassifier(solver=...)``; also shown in the title.
    activation_function : str
        Passed to ``MLPClassifier(activation=...)``; also shown in the title.

    Returns
    -------
    (list of float, list of float, list of int)
        Mean test accuracy and mean validation accuracy (percent) for each
        hidden-layer size, followed by the list of sizes.

    NOTE(review): each classifier is fitted on the train AND validation rows,
    so the "validation" accuracy is measured on rows seen during fitting.
    ``validation_fraction`` is documented as used only with early stopping,
    which is not enabled here. Confirm whether this is intended.
    """
    n_folds = 10
    n_nodes = list (range (2,16))
    col = len (train.columns)
    X = train.iloc[:,0:col].values

    # The splits depend only on the fold number, so the group file is read
    # once per fold here instead of once per (node count, fold) pair.
    fold_splits = [indexwise_train_test_val_split ('FGLASS.GRP',i)
                   for i in range(n_folds)]

    accuracy_test = []
    accuracy_val = []

    plt.figure(figsize=(8,7))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('solver: '+solver+' & '+'activation_function: '+activation_function)

    for n in n_nodes:
        print ('Number of Nodes: ',n)
        test_acc = 0.0
        val_acc = 0.0
        first_fold_clf = None
        for train_ind, test_ind, val_ind in fold_splits:
            # Both parts are flat lists of row indices, so concatenating them
            # gives the combined fitting set.
            fit_rows = X[train_ind + val_ind]
            X_train  = fit_rows[:,:col-1]
            y_train = fit_rows[:,col-1]

            test_rows = X[test_ind]
            X_test  = test_rows[:,:col-1]
            y_test = test_rows[:,col-1]

            val_rows = X[val_ind]
            X_val  = val_rows[:,:col-1]
            y_val = val_rows[:,col-1]

            clf = MLPClassifier(solver=solver,activation=activation_function,
                                hidden_layer_sizes=(n,),shuffle= False,
                                random_state = 0,validation_fraction = .1,max_iter = 1000)
            clf = clf.fit(X_train, y_train)
            # Only the first fold's model is plotted, so only that one is kept.
            if first_fold_clf is None:
                first_fold_clf = clf

            y_pred_test = clf.predict (X_test)
            test_acc += accuracy_score(y_test,y_pred_test)*100

            y_pred_val = clf.predict (X_val)
            val_acc += accuracy_score (y_val,y_pred_val)*100

        plt.plot(first_fold_clf.loss_curve_,label = n)
        print ('Testing Accuracy: ',test_acc/n_folds,'%')
        print ('Validation Accuracy: ',val_acc/n_folds,'%')
        accuracy_test.append(test_acc/n_folds)
        accuracy_val.append(val_acc/n_folds)

    # One legend call after all curves are drawn.
    plt.legend(loc = 'best')
    return accuracy_test,accuracy_val,n_nodes

# Rebuild the standardized frame from 'FGLASS.DAT' (load, per-column
# mean/std, standardize all columns except 'type') for the NN experiments.
preprocessed_train = preprocessing ('FGLASS.DAT')
mean,std = train_mean_std (preprocessed_train)
norm_train = normalize (preprocessed_train,mean,std)


In [None]:
def NN_accuracy_plot(accuracy_test,accuracy_val,n_nodes):
    """Plot validation (blue) and testing (red) accuracy per hidden-layer size."""
    fig, ax = plt.subplots(figsize=(8, 7))
    ax.plot(n_nodes, accuracy_val, 'bo-', lw=1, label="validation accuracy")
    ax.plot(n_nodes, accuracy_test, 'ro-', lw=1, label="testing accuracy")
    ax.set_xlabel('Number of Hidden Nodes')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='best')
    ax.set_title('Performance Curve for Different Number of Nodes')
    plt.show()
    

In [21]:
# accuracy_test,accuracy_val,n_nodes = NN(norm_train,'adam','relu')
# NN_accuracy_plot(accuracy_test,accuracy_val,n_nodes)

In [20]:
# accuracy_test,accuracy_val,n_nodes = NN(norm_train,'adam','logistic')
# NN_accuracy_plot(accuracy_test,accuracy_val,n_nodes)

In [19]:
# accuracy_test,accuracy_val,n_nodes = NN(norm_train,'sgd','relu')
# NN_accuracy_plot(accuracy_test,accuracy_val,n_nodes)

In [18]:
# accuracy_test,accuracy_val,n_nodes = NN(norm_train,'sgd','logistic')
# NN_accuracy_plot(accuracy_test,accuracy_val,n_nodes)

### arbitrary m-fold train-validation set split (by not using index)

In [None]:
# split train test data arbitrarily
def arbitrary_train_test_split(n_split, x_len):
    """Cut ``range(x_len)`` into ``n_split`` consecutive equal-sized test blocks.

    Each block has ``int(x_len / n_split)`` indices; any leftover indices at
    the end never appear in a test block. Returns one
    ``[train_indices, test_indices]`` pair per block, where the train part is
    every index outside that block.
    """
    block_len = int(x_len / n_split)
    splits = []
    for block in range(n_split):
        start = block * block_len
        held_out = list(range(start, start + block_len))
        kept = list(set(list(range(0, x_len))).difference(set(held_out)))
        splits.append([kept, held_out])
    return splits

# get mean accuracy of all K-folds for different k in kNN
def cross_validation_score(trainingSet, n_split, prior0=None):
    """Mean kNN accuracy per neighbor count over arbitrary consecutive folds.

    NOTE: this reuses the name of the group-file based
    ``cross_validation_score(train, n_folds)`` defined earlier in the
    notebook and replaces it once this cell is executed.

    Parameters
    ----------
    trainingSet : list of list or array-like
        Rows of the data set; wrapped in a DataFrame internally.
    n_split : int
        Number of folds produced by ``arbitrary_train_test_split``.
    prior0 : optional
        Forwarded to ``vary_k`` as a third argument only when it is not
        ``None``. NOTE(review): the ``vary_k`` defined in this notebook takes
        just two arguments, so passing a value here requires a ``vary_k``
        that accepts it.

    Returns
    -------
    list of float
        For each neighbor count tried by ``vary_k``, the accuracy averaged
        over all folds. Empty when there are no folds.
    """
    train = pd.DataFrame(trainingSet)
    X = train.values
    index = arbitrary_train_test_split(n_split, len(X))
    acc = []
    K = []

    for train_index, test_index in index:
        x = X[train_index]
        y = X[test_index]
        if prior0 is None:
            K, ac = vary_k(x, y)
        else:
            K, ac = vary_k(x, y, prior0)
        acc.append(ac)

    # Average over however many folds were produced, not a fixed five.
    mean_k = [sum(fold[k] for fold in acc) / len(acc) for k in range(len(K))]

    return mean_k
