# Classification attempt using ch7_classification

In [3]:
import sys
# !conda install --yes --prefix {sys.prefix} scikit-learn
# !conda install --yes --prefix {sys.prefix} xgboost
# !conda install --yes --prefix {sys.prefix} seaborn
# !conda install --yes --prefix {sys.prefix} matplotlib

%matplotlib inline

In [4]:
import os
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from util import util
from util.VisualizeDataset import VisualizeDataset

In [5]:
dataset = pd.read_csv("./A3_dataset/COMBINED_ch4_result.csv")
print(dataset.columns)
dataset.head()

Index(['Unnamed: 0', 'acc_x', 'acc_y', 'acc_z', 'grav_x', 'grav_y', 'grav_z',
       'lin_x', 'lin_y', 'lin_z', 'gyr_x', 'gyr_y', 'gyr_z',
       'time_since_start', 'timestamp_date', 'timestamp_ms', 'label-jogging',
       'label-cycling', 'label-football', 'label-jumprope', 'label-pushups',
       'label-sitting', 'label-swimming', 'label-tennis', 'label-walking',
       'lin_x_max_freq_ws20', 'lin_y_max_freq_ws20', 'lin_z_max_freq_ws20',
       'lin_x_max_freq_ws40', 'lin_y_max_freq_ws40', 'lin_z_max_freq_ws40'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,acc_x,acc_y,acc_z,grav_x,grav_y,grav_z,lin_x,lin_y,lin_z,...,label-sitting,label-swimming,label-tennis,label-walking,lin_x_max_freq_ws20,lin_y_max_freq_ws20,lin_z_max_freq_ws20,lin_x_max_freq_ws40,lin_y_max_freq_ws40,lin_z_max_freq_ws40
0,0,5.0087,3.5865,6.0478,3.0025,4.0308,8.4207,2.0062,-0.4442,-2.373,...,0,0,0,0,,,,,,
1,1,9.2979,-9.0417,3.4058,6.11765,3.60885,3.77155,1.2018,2.6037,9.4019,...,0,0,0,0,,,,,,
2,2,13.5871,-21.6699,0.7638,9.2328,3.1869,-0.8776,22.8759,-0.5101,20.9578,...,0,0,0,0,,,,,,
3,3,5.62875,0.4956,10.9619,8.4778,-1.9642,-4.5211,4.7023,23.7443,25.207,...,0,0,0,0,,,,,,
4,4,-2.3296,22.6611,21.16,8.3773,-2.7779,-4.2748,1.3718,25.8676,28.1546,...,0,0,0,0,,,,,,


# Knn classification

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Making a single label column
labels = ['label-jogging','label-cycling', 'label-football', 'label-jumprope', 'label-pushups', 'label-sitting', 'label-swimming', 'label-tennis', 'label-walking']
activities = ['jogging','cycling', 'football', 'jumprope', 'pushups', 'sitting', 'swimming', 'tennis', 'walking']

activity_labels = []

for i in range(len(dataset)):
    for j in range(len(labels)):
        if dataset[labels[j]][i] == 1:
            activity_labels.append(activities[j])


In [7]:
from collections import Counter
Counter(activity_labels)

Counter({'jogging': 303,
         'cycling': 6764,
         'football': 9549,
         'jumprope': 2902,
         'pushups': 418,
         'sitting': 17195,
         'swimming': 3034,
         'tennis': 14360,
         'walking': 1435})

In [8]:
# Quickly impute every instance of missing values with median

dataset_filled = dataset.copy()
dataset_columns = dataset_filled.columns
dataset_filled[dataset_columns] = dataset_filled[dataset_columns].fillna(dataset_filled[dataset_columns].mean())

  dataset_filled[dataset_columns] = dataset_filled[dataset_columns].fillna(dataset_filled[dataset_columns].mean())


In [9]:
dataset_filled = dataset_filled.drop(labels + ["Unnamed: 0", "timestamp_date", "timestamp_ms", "time_since_start", 

       # 'lin_x_max_freq_ws20', 'lin_y_max_freq_ws20', 'lin_z_max_freq_ws20', 'lin_x_max_freq_ws40',
       # 'lin_y_max_freq_ws40', 'lin_z_max_freq_ws40'
       
       ], axis=1)

In [10]:
dataset_filled.columns

Index(['acc_x', 'acc_y', 'acc_z', 'grav_x', 'grav_y', 'grav_z', 'lin_x',
       'lin_y', 'lin_z', 'gyr_x', 'gyr_y', 'gyr_z', 'lin_x_max_freq_ws20',
       'lin_y_max_freq_ws20', 'lin_z_max_freq_ws20', 'lin_x_max_freq_ws40',
       'lin_y_max_freq_ws40', 'lin_z_max_freq_ws40'],
      dtype='object')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(dataset_filled, activity_labels, test_size=0.33, shuffle=True)

In [12]:
knn_clf=KNeighborsClassifier()
knn_clf.fit(X_train,y_train)
ypreds=knn_clf.predict(X_test)

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sn
import matplotlib.pyplot as plt

conf_matrix = confusion_matrix(list(ypreds), y_test)
df_cm = pd.DataFrame(conf_matrix, index = activities, columns = activities)

plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, cmap="BuPu")
plt.savefig("./daim_figures/knn_classification.png")
plt.show()

In [14]:
conf_matrix

array([[1969,   95,   21,  101,   17,    4,   36,   89,   12],
       [  73, 2490,   30,   88,    0,    4,    9,  488,   99],
       [   2,    3,   16,    4,    0,    0,    0,    1,    0],
       [  27,   54,   10,  675,    1,    2,   14,   60,   11],
       [   5,    4,    2,    4,  114,    3,   10,   10,    7],
       [   3,    6,    0,    0,    0, 5698,    9,    1,    1],
       [  10,    6,    6,    9,    9,    2,  900,    6,    3],
       [  59,  450,   12,   92,    3,    0,   12, 4024,   42],
       [   5,   57,    3,   17,    2,    1,    7,   30,  318]])

In [28]:
print(classification_report(y_test, ypreds, target_names=activities))

              precision    recall  f1-score   support

     jogging       0.66      0.47      0.55      2153
     cycling       0.52      0.86      0.65      3165
    football       0.14      0.67      0.23       100
    jumprope       0.48      0.12      0.19       990
     pushups       0.25      0.84      0.38       146
     sitting       1.00      0.96      0.98      5714
    swimming       0.83      0.71      0.76       997
      tennis       0.79      0.59      0.67      4709
     walking       0.36      0.42      0.39       493

    accuracy                           0.72     18467
   macro avg       0.56      0.63      0.53     18467
weighted avg       0.76      0.72      0.72     18467



# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

clf= RandomForestClassifier()
clf.fit(X_train,y_train)
ypreds=clf.predict(X_test)

In [18]:
conf_matrix = confusion_matrix(list(ypreds), y_test)
df_cm = pd.DataFrame(conf_matrix, index = activities, columns = activities)

plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, cmap="BuPu")
plt.savefig("./daim_figures/r_forest_classification.png")
plt.show()

In [19]:
conf_matrix

array([[1973,   30,   23,   79,   16,   14,   29,   36,    7],
       [  27, 2813,   17,   74,    0,    2,    4,  271,   51],
       [   0,    0,   25,    0,    0,    0,    0,    0,    0],
       [  15,   12,    2,  691,    0,    0,    0,   27,    8],
       [   4,    1,    0,    1,  104,    4,    3,    6,   11],
       [   1,    0,    0,    0,    0, 5693,    2,    1,    0],
       [  15,    2,    8,   11,   21,    1,  948,    3,    2],
       [ 108,  302,   18,  122,    3,    0,    8, 4353,   40],
       [  10,    5,    7,   12,    2,    0,    3,   12,  374]])

In [None]:
print(classification_report(y_test, ypreds, target_names=activities))

# Boosting

In [20]:
from xgboost import XGBClassifier

clf = XGBClassifier()
clf.fit(X_train,y_train)
ypreds=clf.predict(X_test)

  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




In [21]:
conf_matrix = confusion_matrix(list(ypreds), y_test)
df_cm = pd.DataFrame(conf_matrix, index = activities, columns = activities)

plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, cmap="BuPu")
plt.savefig("./daim_figures/xgboost_classification.png")
plt.show()

In [22]:
conf_matrix

array([[1988,   31,   10,   71,   17,    8,   25,   33,    8],
       [  28, 2754,   10,   60,    1,    2,    1,  319,   28],
       [   1,    4,   46,    2,    0,    0,    0,    1,    0],
       [  26,   17,    4,  727,    0,    0,    3,   30,    7],
       [   3,    0,    0,    2,  103,    1,    5,    1,    4],
       [   0,    2,    1,    1,    0, 5697,    4,    1,    0],
       [   8,    2,    3,    7,   20,    2,  945,    3,    0],
       [  93,  343,   19,  110,    3,    2,   11, 4295,   27],
       [   6,   12,    7,   10,    2,    2,    3,   26,  419]])

In [None]:
print(classification_report(y_test, ypreds, target_names=activities))

# Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train).predict(X_test)
ypreds= gnb.predict(X_test)

In [24]:
conf_matrix = confusion_matrix(list(ypreds), y_test)
df_cm = pd.DataFrame(conf_matrix, index = activities, columns = activities)

plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, cmap="BuPu")
plt.savefig("./daim_figures/NB_classification.png")
plt.show()

In [None]:
conf_matrix

In [None]:
print(classification_report(y_test, ypreds, target_names=activities))

# SVM

In [32]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
ypreds= clf.predict(X_test)

In [33]:
conf_matrix = confusion_matrix(list(ypreds), y_test)
df_cm = pd.DataFrame(conf_matrix, index = activities, columns = activities)

plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, cmap="BuPu")
plt.savefig("./daim_figures/SVM_classification.png")
plt.show()

In [34]:
conf_matrix

array([[1835,   72,    7,   85,   19,   22,   38,   38,    6],
       [  37, 2434,   54,   96,    4,    6,    7,  535,   59],
       [   1,    0,   19,    0,    0,    0,    3,    0,    0],
       [  10,   15,    4,  552,    0,    0,    2,   28,    6],
       [   4,    0,    0,    1,   78,    3,    4,    2,    7],
       [  20,   17,    1,    0,    0, 5679,    7,    5,    0],
       [  62,    9,    1,   18,   30,    1,  913,    3,    6],
       [ 176,  602,    7,  230,    1,    3,   18, 4066,   40],
       [   8,   16,    7,    8,   14,    0,    5,   32,  369]])

In [35]:
print(classification_report(y_test, ypreds, target_names=activities))

              precision    recall  f1-score   support

     jogging       0.86      0.85      0.86      2153
     cycling       0.75      0.77      0.76      3165
    football       0.83      0.19      0.31       100
    jumprope       0.89      0.56      0.69       990
     pushups       0.79      0.53      0.64       146
     sitting       0.99      0.99      0.99      5714
    swimming       0.88      0.92      0.90       997
      tennis       0.79      0.86      0.83      4709
     walking       0.80      0.75      0.78       493

    accuracy                           0.86     18467
   macro avg       0.84      0.71      0.75     18467
weighted avg       0.87      0.86      0.86     18467

