## Data Cleaning

In [1]:
import os, 
import pandas as pd
import numpy as np

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

In [3]:
files_input = ['Activity-Data/Elevator & Stair/1/1Activity.csv', 'Activity-Data/Elevator & Stair/2/2Activity.csv',
         'Activity-Data/Elevator & Stair/3/3Activity.csv', 'Activity-Data/Elevator & Stair/4/4Activity.csv', 
         'Activity-Data/Elevator & Stair/5/5Activity.csv']
files_labels = ['Activity-Data/Elevator & Stair/1/1ActivityLog.csv', 'Activity-Data/Elevator & Stair/2/2ActivityLog.csv',
         'Activity-Data/Elevator & Stair/3/3ActivityLog.csv', 'Activity-Data/Elevator & Stair/4/4ActivityLog.csv', 
         'Activity-Data/Elevator & Stair/5/5ActivityLog.csv']

In [8]:
# usecols=['pressureDerivative','pressureIQR', 'timestamp']
input_list = []
for fn in files_input:
    df = pd.read_csv(fn, delimiter=',', usecols = [12,13,26], header=0)
    input_list.append(df)
input_frame = pd.DataFrame()
input_frame = pd.concat(input_list)
input_frame.head()

Unnamed: 0,pressureDerivative,pressureIQR,timestamp
0,-2.035256,1.695796,1498222478520
1,-1.107091,1.26496,1498222479928
2,-0.652795,-0.095003,1498222481447
3,-0.263341,-0.778249,1498222482915
4,0.099831,-0.887046,1498222484432


In [11]:
output_list = []
for fn in files_labels:
    df = pd.read_csv(fn, delimiter=',', header=0)
    output_list.append(df)
output_frame = pd.DataFrame()
output_frame = pd.concat(output_list)
output_frame.head()

Unnamed: 0,prediction,timestamp
0,walking,1498222478583
1,walking,1498222479933
2,walking,1498222481452
3,walking,1498222482917
4,walking,1498222484436


In [12]:
len(input_frame),len(output_frame)

(212, 212)

In [13]:
pd.value_counts(output_frame['prediction'])

walking downstairs    174
walking                38
Name: prediction, dtype: int64

In [21]:
data = pd.concat([input_frame,output_frame], axis=1)
data.head()

Unnamed: 0,pressureDerivative,pressureIQR,timestamp,prediction,timestamp.1
0,-2.035256,1.695796,1498222478520,walking,1498222478583
1,-1.107091,1.26496,1498222479928,walking,1498222479933
2,-0.652795,-0.095003,1498222481447,walking,1498222481452
3,-0.263341,-0.778249,1498222482915,walking,1498222482917
4,0.099831,-0.887046,1498222484432,walking,1498222484436


In [26]:
data['prediction'] = data['prediction'].astype('category')

In [29]:
data['prediction'] = data['prediction'].cat.codes

In [30]:
data.head()

Unnamed: 0,pressureDerivative,pressureIQR,timestamp,prediction,timestamp.1
0,-2.035256,1.695796,1498222478520,0,1498222478583
1,-1.107091,1.26496,1498222479928,0,1498222479933
2,-0.652795,-0.095003,1498222481447,0,1498222481452
3,-0.263341,-0.778249,1498222482915,0,1498222482917
4,0.099831,-0.887046,1498222484432,0,1498222484436


In [32]:
data.columns

Index(['pressureDerivative', ' pressureIQR', ' timestamp ', 'prediction',
       ' timestamp '],
      dtype='object')

In [33]:
data_clean = data[['pressureDerivative', ' pressureIQR','prediction']]
data_clean.head()

Unnamed: 0,pressureDerivative,pressureIQR,prediction
0,-2.035256,1.695796,0
1,-1.107091,1.26496,0
2,-0.652795,-0.095003,0
3,-0.263341,-0.778249,0
4,0.099831,-0.887046,0


In [35]:
pd.value_counts(data_clean['prediction'])

1    174
0     38
Name: prediction, dtype: int64

In [40]:
X = data_clean[['pressureDerivative', ' pressureIQR']].values
X.shape

(212, 2)

In [56]:
Y = data_clean['prediction'].values
Y = Y.reshape(-1,1)
Y.shape

(212, 1)

In [57]:
print(X[:5])
print(Y[:5])

[[-2.03525571  1.69579643]
 [-1.10709071  1.26496003]
 [-0.65279495 -0.09500334]
 [-0.26334071 -0.77824894]
 [ 0.09983143 -0.88704601]]
[[0]
 [0]
 [0]
 [0]
 [0]]


## Simple Classification

In [47]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [65]:
kf = KFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in kf.split(X):
    XTrain, XTest, YTrain, YTest = X[train], X[test], Y[train], Y[test]
    clf = RandomForestClassifier(n_estimators=500, random_state=seed )
    clf = clf.fit(XTrain, YTrain.ravel())
    YPred = clf.predict(XTest)
#     print(YPred.shape)
    diff = YPred - YTest.ravel()
#     print(diff.shape)
#     print(diff)
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

34 43
79.06976744186046
38 43
88.37209302325581
41 42
97.61904761904762
36 42
85.71428571428571
39 42
92.85714285714286


In [79]:
kf = KFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in kf.split(X):
    XTrain, XTest, YTrain, YTest = X[train], X[test], Y[train], Y[test]
    clf = svm.SVC(kernel='linear')
    clf = clf.fit(XTrain, YTrain.ravel())
    YPred = clf.predict(XTest)
#     print(YPred.shape)
    diff = YPred - YTest.ravel()
#     print(diff.shape)
#     print(diff)
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

35 43
81.3953488372093
39 43
90.69767441860465
39 42
92.85714285714286
36 42
85.71428571428571
42 42
100.0


In [90]:
kf = KFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in kf.split(X):
    XTrain, XTest, YTrain, YTest = X[train], X[test], Y[train], Y[test]
    clf = KNeighborsClassifier(n_neighbors=3)
    clf = clf.fit(XTrain, YTrain.ravel())
    YPred = clf.predict(XTest)
#     print(YPred.shape)
    diff = YPred - YTest.ravel()
#     print(diff.shape)
#     print(diff)
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

34 43
79.06976744186046
39 43
90.69767441860465
39 42
92.85714285714286
36 42
85.71428571428571
39 42
92.85714285714286


## Stratified Cross Validation

In [71]:
skf = StratifiedKFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in skf.split(X, Y.ravel()):
    XTrain, XTest, YTrain, YTest = X[train], X[test], Y[train], Y[test]
    clf = RandomForestClassifier(n_estimators=500, random_state=seed )
    clf = clf.fit(XTrain, YTrain.ravel())
    YPred = clf.predict(XTest)
    diff = YPred - YTest.ravel()
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

36 43
83.72093023255815
38 43
88.37209302325581
40 43
93.02325581395348
38 42
90.47619047619048
37 41
90.2439024390244


In [80]:
skf = StratifiedKFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in skf.split(X, Y.ravel()):
    XTrain, XTest, YTrain, YTest = X[train], X[test], Y[train], Y[test]
    clf = svm.SVC(kernel='linear')
    clf = clf.fit(XTrain, YTrain.ravel())
    YPred = clf.predict(XTest)
    diff = YPred - YTest.ravel()
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

37 43
86.04651162790698
38 43
88.37209302325581
39 43
90.69767441860465
39 42
92.85714285714286
38 41
92.6829268292683


In [89]:
skf = StratifiedKFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in skf.split(X, Y.ravel()):
    XTrain, XTest, YTrain, YTest = X[train], X[test], Y[train], Y[test]
    clf = KNeighborsClassifier(n_neighbors=3)
    clf = clf.fit(XTrain, YTrain.ravel())
    YPred = clf.predict(XTest)
    diff = YPred - YTest.ravel()
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

36 43
83.72093023255815
38 43
88.37209302325581
39 43
90.69767441860465
37 42
88.0952380952381
36 41
87.8048780487805


## One Hot Encoded Outputs

In [73]:
ohe = OneHotEncoder()
YO = ohe.fit_transform(Y)
print(YO.shape)

(212, 2)


In [76]:
YO = YO.todense()
YO.shape

(212, 2)

In [77]:
kf = KFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in kf.split(X):
    XTrain, XTest, YTrain, YTest = X[train], X[test], YO[train], YO[test]
    clf = RandomForestClassifier(n_estimators=500, random_state=seed )
    clf = clf.fit(XTrain, YTrain)
    YPred = clf.predict(XTest)
#     print(YPred.shape)
    diff = YPred - YTest
#     print(diff.shape)
#     print(diff)
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

68 86
79.06976744186046
76 86
88.37209302325581
82 84
97.61904761904762
72 84
85.71428571428571
78 84
92.85714285714286


In [91]:
kf = KFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in kf.split(X):
    XTrain, XTest, YTrain, YTest = X[train], X[test], YO[train], YO[test]
    clf = KNeighborsClassifier(n_neighbors=3)
    clf = clf.fit(XTrain, YTrain)
    YPred = clf.predict(XTest)
#     print(YPred.shape)
    diff = YPred - YTest
#     print(diff.shape)
#     print(diff)
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

68 86
79.06976744186046
78 86
90.69767441860465
78 84
92.85714285714286
72 84
85.71428571428571
78 84
92.85714285714286


In [82]:
from sklearn.svm import LinearSVC
kf = KFold(n_splits=5)
# print(list(kf.split(X)))
# print(list(kf.split(Y)))
for train, test in kf.split(X):
    XTrain, XTest, YTrain, YTest = X[train], X[test], YO[train], YO[test]
    clf = LinearSVC(multi_class='crammer_singer')
    clf = clf.fit(XTrain, YTrain)
    YPred = clf.predict(XTest)
#     print(YPred.shape)
    diff = YPred - YTest
#     print(diff.shape)
#     print(diff)
    score = diff[diff == 0].size
    print(score, YPred.size)
    print((100.0 * score)/(YPred.size))

ValueError: bad input shape (169, 2)