In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math
import random as rd
import glob

In [9]:
allFiles = glob.glob("data/*_va3.csv")
df = pd.DataFrame()
list_ = []

for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0, sep=',')
    list_.append(df)
    
df = pd.concat(list_, sort = True)
#df.head()
len(df)

In [11]:
y = df['Phase'].values.tolist()
#type(y); y.describe(); y.head(3); y.shape; y.unique() # (165632, )
dic = {'D':0, 'P':1, 'S':2, 'H':3, 'R':4}
y = [dic.get(n, n) for n in y]

In [12]:
X = df.drop(labels=['Phase'], axis=1, inplace = True)
X = df.astype(float)
X = df.values
X

array([[ 1.93940e-04, -1.48040e-04,  1.18210e-04, ...,  1.19730e-04,
        -5.71800e-05, -8.19000e-05],
       [ 2.54420e-04, -1.08820e-04,  5.33600e-05, ...,  1.68180e-04,
        -1.42560e-04, -9.39600e-05],
       [ 2.76360e-04, -5.26300e-05, -3.34800e-05, ...,  1.94360e-04,
        -1.97630e-04, -9.29200e-05],
       ...,
       [ 3.07408e-03,  4.13420e-04, -1.13780e-04, ...,  2.78558e-03,
         5.03483e-03, -6.06410e-04],
       [ 3.29680e-03,  5.79910e-04, -3.80930e-04, ...,  2.94716e-03,
         5.38513e-03, -6.51760e-04],
       [ 2.04320e-04,  6.44270e-04, -4.80730e-04, ...,  2.28860e-04,
         3.19000e-06,  6.14900e-05]])

In [13]:
class K_Means:
    def __init__(self, k=5, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self,data):

        self.centroids = {}

        for i in range(self.k):
            self.centroids[i] = data[i]

        for i in range(self.max_iter):
            self.classifications = {}

            for i in range(self.k):
                self.classifications[i] = []

            for featureset in data:
                distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)

            prev_centroids = dict(self.centroids)

            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification],axis=0)

            optimized = True

            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                if np.sum((current_centroid-original_centroid)/original_centroid*100.0) > self.tol:
                    #print(np.sum((current_centroid-original_centroid)/original_centroid*100.0))
                    optimized = False

            if optimized:
                break

    def predict(self,data):
        distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

6911
2962
6911
2962


In [15]:
clf = K_Means()
clf.fit(X_train)
correct = 0
for i in range(len(X_test)):
    predict_me = np.array(X_test[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    #print("Predict: ", prediction, " || Actual: ", y_test[i])
    if prediction == y_test[i]:
        correct += 1

print("Accuracy",correct/len(X_test))

Accuracy 0.3649561107359892
