In [1]:
import pandas as pd
import sklearn
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import manifold
from sklearn.preprocessing import StandardScaler  
import matplotlib.pyplot as plt

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from sklearn.model_selection import cross_val_score

init_notebook_mode(connected=True)

# Jaki procent całego zbioru ma być wykorzystany do uczenia
TRAIN_DATASET_PRECENTAGE = 0.5

##
## 11 - running
## 22 - walking
## 33 - car
## 44 - bike
## 55 - upstairs
## 66 - downstairs

In [2]:
# Załadowanie zbioru cech z csv
dataset = pd.read_csv("features.csv", index_col=None)

# Stworzenie kolumn dla wektora cech. Nazwy to liczby od 1 do 25.
cols = [str(x) for x in range(1, 26)]
#Ostatnia kolumna to etykieta aktywności
cols.append("label")

dataset.columns = cols


for label in (11,22,33,44,55,66):
    print(label, dataset.loc[dataset["label"] == label].shape)

## 11 - running
## 22 - walking
## 33 - car
## 44 - bike
## 55 - upstairs
## 66 - downstairs

11 (845, 26)
22 (881, 26)
33 (1377, 26)
44 (53, 26)
55 (258, 26)
66 (91, 26)


In [3]:
# Mieszamy wczytane wiersze tak, żeby kolejność cech nie miała wpływu na trenowanie i testowanie
dataset = sklearn.utils.shuffle(dataset)

In [4]:
# Obliczenie liczby wektorów cech potrzebynch do uczenia
train_size = int(len(dataset)*TRAIN_DATASET_PRECENTAGE)

#Podział na zbiór uczący i testowy
train_dataset = dataset[:train_size]
test_dataset = dataset[train_size:]

# Oddzielamy cechy od etykiety i przypisujemy do X - zbioru danych uczących
feature_cols = dataset.columns[:-1]
X = train_dataset.loc[:, feature_cols]

# Zbiór poprawnych etykiet dla zbioru uczącego
y = train_dataset.label

In [5]:
# Skalowanie wektora cech
scaler = StandardScaler()
scaler.fit(X)

X = scaler.transform(X)

In [6]:
# Podział na X i y dla danych testowych
X_test = test_dataset.loc[:, feature_cols]
X_test = scaler.transform(X_test)
y_test = test_dataset.label

In [194]:
# Tworzymy listę klasyfikatorów które testujemy
classifiers = []

classifiers.append({"name":"SVM", "cls":SVC(C=11)})
classifiers.append({"name":"MLP", "cls":MLPClassifier(solver='adam', alpha=1e-5, 
                                                      hidden_layer_sizes=(200, 200), 
                                                      max_iter=2500, shuffle=True)})
classifiers.append({"name":"KNN", "cls":KNeighborsClassifier(n_neighbors=3)})
classifiers.append({"name":"RF", "cls":RandomForestClassifier()})


In [195]:
# Cross validate

X_cv = np.concatenate((X, X_test))
y_cv = np.concatenate((y, y_test))

cv = 2

print("Cross validation folds=%d" % (cv))

for clsf in classifiers:
    scores = cross_val_score(clsf["cls"], X_cv, y_cv, cv=cv)
    print(clsf["name"] + ": %0.2f" % (scores.mean()*100) + "%")
    print(scores)

Cross validation folds=2
SVM: 86.25%
[0.86552707 0.85942857]
MLP: 87.22%
[0.87464387 0.86971429]
KNN: 80.88%
[0.80569801 0.812     ]
RF: 81.23%
[0.81538462 0.80914286]


In [196]:
# Każdy klasyfikator uczymy i testujemy, a następnie rysujemy heatmap z wynikami
for clsf in classifiers:
    # Uczenie klasyfikatora
    clsf["cls"].fit(X, y)
    # Sprawdzenie dla danych testowych
    predictions = clsf["cls"].predict(X_test)
    # Zapis wyniku
    acc = accuracy_score(y_test, predictions)
    
    # Znormalizowana macierz
    c_matrix = confusion_matrix(y_test, predictions, labels = [11,22,33,44,55,66])
    c_matrix = c_matrix.astype('float') / c_matrix.sum(axis=1)[:, np.newaxis]
    c_matrix = c_matrix*100.0
    c_matrix = c_matrix.round(2)
    
    # Tytuły wykresów
    title = clsf["name"] + " - overall accuracy: " + str(round(acc*100,2)) + "%"
    labels = ["running", "walking", "car", "bike", "upstairs", "downstairs"]
    
    fig = ff.create_annotated_heatmap(z=c_matrix, x=labels, y=labels)
    fig.layout.update({"title": title})
    
    fig.layout.update({"yaxis":dict(autorange='reversed')})
    
    iplot(fig)


In [10]:
# # clf = SVC()
# clf = MLPClassifier(solver='lbfgs', alpha=1e-7, hidden_layer_sizes=(26, 10, 6), random_state=1, max_iter=700)
# # clf = KNeighborsClassifier(n_neighbors=7)
# # clf = SGDClassifier(loss="log")
# clf.fit(X,y)

# predictions = clf.predict(X_test)
# acc = accuracy_score(y_test, predictions)
# print("Accuracy: " + str(round(acc*100,2)) + "%")

In [11]:
# tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
# Y = tsne.fit_transform(X_test)

In [12]:
# plt.scatter(Y[:, 0], Y[:, 1], c=y_test, cmap=plt.cm.get_cmap("jet", len(list(set(y_test)))))
# plt.colorbar(ticks=range(len(list(set(y_test)))))

# plt.show()