In [None]:
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

from collections import Counter
from collections import OrderedDict

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from keras.utils.vis_utils import plot_model
from tensorflow.keras.layers import Dense


In [None]:
SAMPLESIZE = 500
TESTSIZE = 0.90
VALSIZE = 0.70

## Programming with Supervised Learning

### Generating Data One 

In [None]:
X_small, y_small = make_circles(n_samples=(int(SAMPLESIZE/2),int(SAMPLESIZE/2)), random_state=3, 
noise=0.04, factor = 0.3)
X_large, y_large = make_circles(n_samples=(int(SAMPLESIZE/2),int(SAMPLESIZE/2)), random_state=3, 
noise=0.04, factor = 0.7)
y_large[y_large==1] = 2

df = pd.DataFrame(np.vstack([X_small,X_large]),columns=['x1','x2'])
df['labels'] = np.hstack([y_small,y_large])
dfOne = shuffle(df)
trainOne, valTestOne = train_test_split(dfOne, test_size=TESTSIZE)
valone, testOne = train_test_split(valTestOne, test_size=VALSIZE)


sns.scatterplot(data=df,x='x1',y='x2',hue='labels',palette="bright")

### Generating Data Five

In [None]:
X = []
y = []
size = int(SAMPLESIZE/4)

X.extend(list(np.random.uniform(low=10, high=50, size=(size,))))
y.extend(list(np.random.uniform(low=48, high=50, size=(size,))))
c = list(np.zeros(size))

X.extend(list(np.random.uniform(low=25, high=35, size=(size,))))
y.extend(list(np.random.uniform(low=23, high=25, size=(size,))))
c.extend(np.ones(size))

X.extend(list(np.random.uniform(low=10, high=50, size=(size,))))
y.extend(list(np.random.uniform(low=33, high=35, size=(size,))))
c.extend(np.ones(size)*2)

X.extend(list(np.random.uniform(low=25, high=35, size=(size,))) )
y.extend(list(np.random.uniform(low=43, high=45, size=(size,))))
c.extend(np.ones(size)*3)


dfTwo = pd.DataFrame(data={'x1': X, 'x2': y,'labels':c})
dfTwo = shuffle(dfTwo)

trainTwo, valTestTwo = train_test_split(dfTwo, test_size=TESTSIZE)
valTwo, testTwo = train_test_split(valTestTwo, test_size=VALSIZE)
sns.scatterplot(data=dfTwo,x='x1',y='x2',hue='labels')


### Generating Data Six

In [None]:
size = int(SAMPLESIZE/3)
print(size)
X,y = make_circles(n_samples=(size*2), random_state=3, 
noise=0.04, factor = 0.3)

X1 = list(X[:, 0].flatten()*50)
X2 = list(X[:, 1].flatten()*50)

X = []
y = []


for x in range(0,len(X1)):

    if ((((X1[x] > -20) and (X1[x] < 20))) and (((X2[x] > -20) and (X2[x] < 20)))):
        n=1
    else:
        X.append(X1[x])
        y.append(X2[x])
        
Xc = list(np.zeros(len(X)))
Xc.pop
X.extend(list(np.random.uniform(low=-5, high=5, size=(size,))))
y.extend(list(np.random.uniform(low=-37, high=-30, size=(size,))))
Xc.extend(list(np.ones(size)))

X.extend(list(np.random.uniform(low=-5, high=5, size=(size,))))
y.extend(list(np.random.uniform(low=30, high=37, size=(size,))))
Xc.extend(list(np.ones(size)*2))

print(len( X))
print(len( y))
print(len( Xc))
dfThree= pd.DataFrame(data={'x1': X, 'x2': y,'labels':Xc})
sns.scatterplot(data=dfThree,x='x1',y='x2',hue='labels')


In [None]:
dfThree = shuffle(dfThree)

trainThree, valTestThree = train_test_split(dfThree, test_size=TESTSIZE)
valThree, testThree = train_test_split(valTestThree, test_size=VALSIZE)

### KNN Programmed

In [None]:
#KNN implementation
def KNN(X,y,k,labels,Xtest,ytest):

    X = list(X)
    y = list(y)
    Xtest = list(Xtest)
    ytest= list(ytest)
    labels = list(labels)

    d = OrderedDict()
    preds = {}

    #Go through test examples
    for index in range(0,len(Xtest)):

        #Go through train examples 
        for index2 in range(0,len(X)):

            #Ensure not same value
            if (index2 != index):

                #Compute eucledian distance - save this value to a dictionary with index
                d[str(index2)] = np.sqrt((Xtest[index]- X[index2])**2 + (ytest[index] - y[index2])**2)

        #Sort the eucledian distances
        sortedDistances = dict(sorted(d.items(), key=lambda item: item[1]))


        votes = []

        #Look at K nearest neighbor labels that are in the training data
        #Collect K nearest labels
        for number in range(0,k):
            votes.append(labels[int(list(sortedDistances.keys())[number])])
        
        #Set the label to the most "voted" label
        preds[str(index)] = Counter(votes).most_common(1)[0][0]
    return preds 

In [None]:
#Manually coded accuracy function
def checkAccuracy(pred,test):
    wrong = 0

    for idx in range(len(pred)):
        if (pred[idx] != test[idx]):
            wrong = wrong + 1
    return 1- wrong/len(pred)



In [None]:
#Function to check for best k value in an array of different k values for KNN
def bestK(dfTrain,dfTest):


    k = [1,3,5,7,9]
    accuracies = []

    #Try each k value
    for x in k:

        #Predict labels with KNN with the K 
        predicted = KNN(dfTrain.x1,dfTrain.x2,x,dfTrain.labels,dfTest.x1,dfTest.x2)

        #Save accuracy of this prediction
        accuracies.append(checkAccuracy(list(predicted.values()),list(dfTest.labels)))


    print("Best K: ", k[np.argmax(accuracies)])

    #Return the K value that gives the best accuracy tested
    return k[np.argmax(accuracies)]



In [None]:
def KNNAccuracy(train,valid,test):

    k = bestK(train,valid)
    predicted = KNN(train.x1,train.x2,k,train.labels,valid.x1,valid.x2)
    print("\nValidation Accuracy", checkAccuracy(list(predicted.values()),list(valid.labels)))
    predicted = KNN(train.x1,train.x2,k,train.labels,test.x1,test.x2)
    print("Test Accuracy", checkAccuracy(list(predicted.values()),list(test.labels)))

### KNN on Data

In [None]:
print("Dataset One")
KNNAccuracy(trainOne,valone,testOne)

print("\nDataset Two")
KNNAccuracy(trainTwo,valTwo,testTwo)

print("\nDataset Three")
KNNAccuracy(trainThree,valThree,testThree)

## Comparison of K-NN and Decision Tree

### Coding Decision Tree with Existing Frameworks

In [None]:
#Function to return accuracy of decision tree on data
def decisionTree(train,val,test):

    #Create decision tree
    clf = DecisionTreeClassifier(max_depth = 2)
    clf.fit(X=np.array(train.x1,train.x2).reshape(-1, 1),y=np.array(train.labels).reshape(-1, 1))

    y_pred = clf.predict(np.array(val.x1,val.x2).reshape(-1, 1))
    print("Validation Accuracy", accuracy_score(np.array(val.labels).reshape(-1, 1), y_pred))

    y_pred = clf.predict(np.array(test.x1,test.x2).reshape(-1, 1))
    print("Test Accuracy",accuracy_score(np.array(test.labels).reshape(-1, 1), y_pred))


### Decision Tree on Data vs K-NN

In [None]:
print("Dataset One")
print("\nDecision Tree")
decisionTree(trainOne,valone,testOne)

print("\nKNN")
KNNAccuracy(trainOne,valone,testOne)

print("\nDataset Two")
print("\nDecision Tree")
decisionTree(trainTwo,valTwo,testTwo)

print("\nKNN")
KNNAccuracy(trainTwo,valTwo,testTwo)

print("\nDataset Three")
print("\nDecision Tree")
decisionTree(trainThree,valThree,testThree)

print("\nKNN")
KNNAccuracy(trainThree,valThree,testThree)

## Deep Neural Networks

### Deep Neural Network Coding

In [None]:
def runModelOne(trainData,valData,testData):

    enc = OneHotEncoder()
    labels = np.array(trainData.labels).reshape(-1, 1)
    enc.fit(labels)  
    ytrain = enc.transform(labels).toarray()
    Xtrain = np.array([trainData.x1,trainData.x2]).T
    Xtest = np.array([testData.x1,testData.x2]).T
    XVal = np.array([valData.x1,valData.x2]).T

    encTwo = OneHotEncoder()
    labels = np.array(testData.labels).reshape(-1, 1)
    encTwo.fit(labels)  
    ytest = encTwo.transform(labels).toarray()

    
    encThree = OneHotEncoder()
    labels = np.array(valData.labels).reshape(-1, 1)
    encThree.fit(labels)  
    yval = encThree.transform(labels).toarray()

    model = Sequential()
    model.add(Dense(20, input_shape=(2,), activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(len(ytrain[0]), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(Xtrain, ytrain, epochs=160, batch_size=10,verbose=0)

    _, accuracy = model.evaluate(XVal, yval)
    print('Validation Accuracy: %.2f' % (accuracy*100))    

    _, accuracy = model.evaluate(Xtest, ytest)
    print('Test Accuracy: %.2f' % (accuracy*100))    
    
#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
def runModelTwo(trainData,valData,testData):

    enc = OneHotEncoder()
    labels = np.array(trainData.labels).reshape(-1, 1)
    enc.fit(labels)  
    ytrain = enc.transform(labels).toarray()
    Xtrain = np.array([trainData.x1,trainData.x2]).T
    Xtest = np.array([testData.x1,testData.x2]).T
    XVal = np.array([valData.x1,valData.x2]).T

    encTwo = OneHotEncoder()
    labels = np.array(testData.labels).reshape(-1, 1)
    encTwo.fit(labels)  
    ytest = encTwo.transform(labels).toarray()

    
    encThree = OneHotEncoder()
    labels = np.array(valData.labels).reshape(-1, 1)
    encThree.fit(labels)  
    yval = encThree.transform(labels).toarray()

    model = Sequential()
    model.add(Dense(20, input_shape=(2,), activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(8, activation='relu'))  
    model.add(Dense(8, activation='relu'))  
    model.add(Dense(len(ytrain[0]), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(Xtrain, ytrain, epochs=160, batch_size=10,verbose=0)

    _, accuracy = model.evaluate(XVal, yval)
    print('Validation Accuracy: %.2f' % (accuracy*100))    

    _, accuracy = model.evaluate(Xtest, ytest)
    print('Test Accuracy: %.2f' % (accuracy*100))    
    
    #plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)


### Deep Neural Networks Performance on Data

#### Model One

In [None]:
runModelOne(trainOne,valone, testOne)

In [None]:
runModelOne(trainTwo,valTwo,testTwo)

In [None]:
runModelOne(trainThree,valThree,testThree)

#### Model Two

In [None]:
runModelTwo(trainOne,valone, testOne)

In [None]:
runModelTwo(trainTwo,valTwo,testTwo)

In [None]:
runModelTwo(trainThree,valThree,testThree)

## Noisy Data Generation

### Different Noise Levels 

I chose to create Gaussian noise here. I define a function to generate a % of gaussian noise on some data and return noisyData

In [None]:
def noisySet(data,noiseLevel):
    guess = np.array([data.x1,data.x2])

    noise = np.random.normal(1,noiseLevel, guess.shape)
    new_signal = guess + noise

    # plt.scatter(new_signal[0],new_signal[1],c=data.labels)
    new_signal = pd.DataFrame(data={'x1': new_signal[0], 'x2':  new_signal[1],'labels':data.labels})
    return new_signal

### Generate Data for One and Two + Classification 

Per dataset, do 5%, 10%, 20%, and 25% noise.

In [72]:
#Function to generate noisy train,val,test data then run a decision tree + KNN on these
def noiseDataModel(train,val,test,noise):

    
    for types in noise: 
        noiseTrain= noisySet(train,types)
        noiseVal = noisySet(val,types)
        noiseTest = noisySet(test,types)

        print("\nNoise Level:", (float(types)*99))
        print("\nDecision Tree")
        decisionTree(noiseTrain,noiseVal,noiseTest)
    
        print("\nKNN")
        KNNAccuracy(noiseTrain,noiseVal,noiseTest)


In [82]:
#Define the noise levels we want
noiseLevels = [0.05,0.10,0.20,0.25]

print("Dataset One")
noiseDataModel(trainOne,valone,testOne,noiseLevels)

Dataset One

Noise Level: 4.95

Decision Tree
Validation Accuracy 0.6111111111111112
Test Accuracy 0.5238095238095238

KNN
Best K:  1

Validation Accuracy 0.9518518518518518
Test Accuracy 0.9587301587301588

Noise Level: 9.9

Decision Tree
Validation Accuracy 0.5037037037037037
Test Accuracy 0.4857142857142857

KNN
Best K:  3

Validation Accuracy 0.8777777777777778
Test Accuracy 0.8380952380952381

Noise Level: 19.8

Decision Tree
Validation Accuracy 0.5037037037037037
Test Accuracy 0.4857142857142857

KNN
Best K:  7

Validation Accuracy 0.7592592592592593
Test Accuracy 0.7158730158730159

Noise Level: 24.75

Decision Tree
Validation Accuracy 0.5037037037037037
Test Accuracy 0.4857142857142857

KNN
Best K:  7

Validation Accuracy 0.7037037037037037
Test Accuracy 0.6650793650793652


In [84]:
print("Dataset Two")
factoredUpNoise = [x * 10 for x in noiseLevels]
noiseDataModel(trainTwo,valTwo,testTwo,factoredUpNoise)

Dataset Two

Noise Level: 49.5

Decision Tree
Validation Accuracy 0.4074074074074074
Test Accuracy 0.37142857142857144

KNN
Best K:  1

Validation Accuracy 0.9925925925925926
Test Accuracy 0.9936507936507937

Noise Level: 99.0

Decision Tree
Validation Accuracy 0.4444444444444444
Test Accuracy 0.4095238095238095

KNN
Best K:  1

Validation Accuracy 0.9777777777777777
Test Accuracy 0.9873015873015873

Noise Level: 198.0

Decision Tree
Validation Accuracy 0.3851851851851852
Test Accuracy 0.3873015873015873

KNN
Best K:  5

Validation Accuracy 0.8666666666666667
Test Accuracy 0.9015873015873016

Noise Level: 247.5

Decision Tree
Validation Accuracy 0.4
Test Accuracy 0.37777777777777777

KNN
Best K:  3

Validation Accuracy 0.8666666666666667
Test Accuracy 0.8857142857142857
