[View in Colaboratory](https://colab.research.google.com/github/ntuananh/CS582_MachineLearning/blob/master/MLProject.ipynb)

https://www.youtube.com/watch?v=XOEN9W05_4A - video demo for capturing data

---

In [0]:
#@title Setup Enviroment & Import Data
# Read data from Google Drive
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#List files
#file_list = drive.ListFile({'q': "'11f05x7C5kbWdpTdRJVwbPgV7MOG_D6fv' in parents and trashed=false"}).GetList()
#for file1 in file_list:
#  print('title: %s, id: %s' % (file1['title'], file1['id']))
  
# create Files and pull content to Colab
train_N = drive.CreateFile({'id': '1qAj6Hqqoss0bAuYuVWGSoOZgmvMkZ6k2'})
train_N.GetContentFile('train_N.csv')
test_N = drive.CreateFile({'id': '1hv9hqdvvTMJGPPqMTsd0YS58_0nio97G'})
test_N.GetContentFile('test_N.csv')

train = drive.CreateFile({'id': '1mXbabpIXPB_57pOvam9bjKOBVnQAoeML'})
train.GetContentFile('train.csv')
test = drive.CreateFile({'id': '1JjEDt8q_VPWfVCN7MOexIwY1I1GtTLfC'})
test.GetContentFile('test.csv')

# read files'content into dataframe
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data_N = pd.read_csv('train_N.csv')
test_data_N = pd.read_csv('test_N.csv')

In [0]:
#@title Import libraries
#import Libs
import sklearn
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
import seaborn as sns
import matplotlib.pyplot as plt  

In [0]:
#@title KDTree
#https://en.wikipedia.org/wiki/K-d_tree
  
from collections import namedtuple
from operator import itemgetter
from pprint import pformat

class Node(namedtuple('Node', 'location left_child right_child')):
    def __repr__(self):
        return pformat(tuple(self))

def kdtree(point_list, depth=0):
    try:
        k = len(point_list[0]) # assumes all points have the same dimension
    except IndexError as e: # if not point_list:
        return None
    # Select axis based on depth so that axis cycles through all valid values
    axis = depth % k
 
    # Sort point list and choose median as pivot element
    point_list.sort(key=itemgetter(axis))
    median = len(point_list) // 2 # choose median
 
    # Create node and construct subtrees
    return Node(
        location=point_list[median],
        left_child=kdtree(point_list[:median], depth + 1),
        right_child=kdtree(point_list[median + 1:], depth + 1)
    )

def main():
    """Example usage"""
    point_list = [(2,3), (5,4), (9,6), (4,7), (8,1), (7,2)]
    tree = kdtree(point_list)
    print(tree)

if __name__ == '__main__':
    main()

In [0]:
#@title KNearestNeighbour Class
# Custom KNN 
import time
import collections

class KNearestNeighbour:
    def __init__(self, data: pd.DataFrame, label_column: str, data_dimension: int=-1):
        self.label_column = label_column
        self.labels = data.pop(label_column).tolist()
        self.properties = data.values.tolist()
        self.size = len(self.labels)
        # as a rule of thumb implemented in many library, but may not yield an outstanding result compare to others
        self.max_k = int(self.size ** 0.5)
        self.data_dimension = data_dimension
        if data_dimension > 0:
            red_dim = self.pca(self.properties, data_dimension)
            self.properties = red_dim[0]
            self.mean = red_dim[1]
            self.ev = red_dim[2]

    @classmethod
    def euclidean_distance(cls, p1: list, p2: list) -> float:
        return sum(np.square(np.array(p1) - np.array(p2))) ** 0.5

    @classmethod
    def manhattan_distance(cls, p1: list, p2: list) -> float:
        return sum(np.abs(np.array(p1) - np.array(p2)))

    def analyze(self, test_data: list) -> list:
        if self.data_dimension > 0:
            test_data = np.transpose(np.dot(np.transpose(self.ev), np.transpose(np.array(test_data) - self.mean)))
        r = []
        for i in range(self.size):
            r.append((KNearestNeighbour.manhattan_distance(test_data, self.properties[i]), i))
        r.sort(key=lambda e: e[0])
        return [self.labels[r[i][1]] for i in range(min(self.max_k, len(r)))]

    @classmethod
    def verdict(cls, nearest_neighbours: list, k: int):
        return collections.Counter(nearest_neighbours[:min(k, len(nearest_neighbours))]).most_common(1)[0][0]

    @classmethod
    def explain(cls, nearest_neighbours: list, k: int):
        d = dict(collections.Counter(nearest_neighbours[:min(k, len(nearest_neighbours))]).most_common())
        return {e: d[e]/k for e in d}

    @classmethod
    def pca(cls, data, n_components):
        mean = np.mean(data, axis=0)
        data -= mean
        cov_matrix = np.cov(np.transpose(data))

        eigen_values, eigen_vectors = np.linalg.eig(cov_matrix)
        indices = np.argsort(eigen_values)[::-1]
        eigen_vectors = eigen_vectors[:, indices]
        eigen_vectors = eigen_vectors[:, :n_components]
        for i in range(np.shape(eigen_vectors)[1]):
            eigen_vectors[:, i] /= np.linalg.norm(eigen_vectors[:, i])

        transformed_data = np.transpose(np.dot(np.transpose(eigen_vectors), np.transpose(data)))
        # original_data = np.transpose(np.dot(evecs, x)) + m
        return transformed_data, mean, eigen_vectors

    @classmethod
    def test_model(cls, model, k: int, tests: pd.DataFrame, build_confusion_matrix: bool=False):
        expected_labels = tests.pop(model.label_column)
        tests = tests.values.tolist()

        success_count = 0
        failed_tests = []
        total_time = 0

        confusion_matrix = None
        labels_mapping = None
        if build_confusion_matrix:
            labels = np.unique(np.array(knn.labels))
            confusion_matrix = np.zeros((len(labels), len(labels)))
            labels_mapping = {l: i for i, l in enumerate(labels)}

        for i, test in enumerate(tests):
            start = time.clock()
            ar = knn.analyze(test)
            result = KNearestNeighbour.verdict(ar, k)
            end = time.clock()
            total_time += end - start
            if expected_labels[i] == result:
                success_count += 1
            else:
                failed_tests.append((test, KNearestNeighbour.explain(ar, k), expected_labels[i]))
            if build_confusion_matrix:
                confusion_matrix[labels_mapping[expected_labels[i]], labels_mapping[result]] += 1

        success_rate = success_count / len(tests)

        return total_time, success_rate, failed_tests, confusion_matrix, labels_mapping

In [0]:
#@title KNearestNeighbour with PCA=10, KNN=3, Accu=0.969
# Run custom KNN

train = train_data_N.copy()
test  = test_data_N.copy()

knn = KNearestNeighbour(train, 'activity', 10)
t = KNearestNeighbour.test_model(knn, 3, test, build_confusion_matrix=True)

print('Acuracy')
print(t[1])

print('Confusion Maxtrix')
print(t[3])
sns.heatmap(t[3])

In [0]:
#@title KNearestNeighbour with PCA=1, KNN=1, Accuracy= 0.927
knn = KNearestNeighbour(data, 'activity', 1)
t = KNearestNeighbour.test_model(knn, 1, tests, build_confusion_matrix=True)

print('Acuracy')
print(t[1])

print('Confusion Maxtrix')
print(t[3])
sns.heatmap(t[3])

In [0]:
#@title KNeighborsClassifier PCA=10, KNN=3, Accu=0.971
#Run scikitlearn with pca, KNN

# split out activity (target column) to Y
train = train_data_N.copy()
test  = test_data_N.copy()
Y = train.pop('activity')
X = train

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

#Apply PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X1 = pca.fit_transform(X_train)


#-------------
#default training method
knn_model = KNeighborsClassifier(algorithm='brute',n_neighbors=3)
knn_model.fit(X1, Y_train)

#test predict
pca = PCA(n_components=10)
actualResult = test.pop('activity')

Xtest = pca.fit_transform(test)

predictions = knn_model.predict(Xtest)
#print(predictions)
knn_model.score(X1,Y_train)
# 0.9866999168744804

# Making the Confusion Matrix

cm = accuracy_score(actualResult, predictions)
print(cm)



In [0]:
#@title KNearestNeighbour table { display-mode: "code" }
# Run custom KNN

train = train_data_N.copy()
test  = test_data_N.copy()

# pca=500,knn=3
knn = KNearestNeighbour(train, 'activity', 500)
t = KNearestNeighbour.test_model(knn, 3, test, build_confusion_matrix=True)

print('Acuracy:')
print(t[1])
print('Confusion Maxtrix')
print(t[3])

train = train_data_N.copy()
test  = test_data_N.copy()

# pca=250,knn=3
knn = KNearestNeighbour(train, 'activity', 250)
t = KNearestNeighbour.test_model(knn, 3, test, build_confusion_matrix=True)

print('Acuracy:')
print(t[1])
print('Confusion Maxtrix')
print(t[3])


train = train_data_N.copy()
test  = test_data_N.copy()

# pca=100,knn=3
knn = KNearestNeighbour(train, 'activity', 100)
t = KNearestNeighbour.test_model(knn, 3, test, build_confusion_matrix=True)

print('Acuracy:')
print(t[1])
print('Confusion Maxtrix')
print(t[3])


train = train_data_N.copy()
test  = test_data_N.copy()

# pca=50,knn=3
knn = KNearestNeighbour(train, 'activity', 50)
t = KNearestNeighbour.test_model(knn, 3, test, build_confusion_matrix=True)

print('Acuracy:')
print(t[1])
print('Confusion Maxtrix')
print(t[3])


train = train_data_N.copy()
test  = test_data_N.copy()

# pca=10,knn=3
knn = KNearestNeighbour(train, 'activity', 10)
t = KNearestNeighbour.test_model(knn, 3, test, build_confusion_matrix=True)

print('Acuracy:')
print(t[1])
print('Confusion Maxtrix')
print(t[3])


# Backup

##Upload files to Colab
```
from google.colab import files
files.upload()
files.os.listdir()
```

##Download dataset from Kaggle
```
!pip install kaggle
!mkdir .kaggle

import json
token = {"username":"nguyentuananh","key":"44e2c22b301712101fdafe3d03ff7898"}
with open('.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
!chmod 600 .kaggle/kaggle.json

!kaggle datasets download -d mboaglio/simplifiedhuarus
```

In [0]:
#@title
!pip install umap-learn
#First we reduce original train_data with UMAP
#Then run the fit with KNN model

train_data = pd.read_csv('train.csv')
y = train_data.pop('activity')
x = train_data

# initialize UMAP
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(x)

# fit model based on the embedding
knn_model = KNeighborsClassifier(algorithm='brute',n_neighbors=3)
knn_model.fit(embedding,y)

test_data = pd.read_csv('test.csv')
#????not sure trying to transform the test data before testing is correct, but just give it a try
embedding_test = reducer.fit_transform(test_data)

predictions = knn_model.predict(embedding_test)
print(predictions)
#not as expected - FAILED
#how to use the test data, if the original train data already reduced?
knn_model.score(embedding,y)
#0.9542809642560266 - not as expected - FAILED

In [0]:
#@title
!ls -a
#cd ..
#!pwd
#cd /content
