# Human Activity Recognition using smartphones

The Human Activity Recognition database was built from the recordings of 30 study participants performing activities of daily living (ADL) while carrying a waist-mounted smartphone with embedded inertia sensors. The objective of this project is to classify activities into one of the six activities performed.

The classification models used are:
1. Logistic Regression
2. Random Forest Classifier
3. KNN
4. Decision Tree with GridSearchCV

In [1]:
#importing libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# Import train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING


In [6]:
# Check counts of Activities in train data
train.Activity.value_counts()

WALKING               209
STANDING              179
LAYING                164
WALKING_UPSTAIRS      159
WALKING_DOWNSTAIRS    145
SITTING               143
Name: Activity, dtype: int64

In [7]:
# Check counts of Activities in test data
test.Activity.value_counts()

WALKING               185
LAYING                183
STANDING              178
SITTING               170
WALKING_UPSTAIRS      149
WALKING_DOWNSTAIRS    134
Name: Activity, dtype: int64

In [8]:
#Check shape of data
train.shape, test.shape

((999, 563), (999, 563))

In [9]:
# Check the columns of train data
train.columns

Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
       'tBodyAcc-max()-X',
       ...
       'fBodyBodyGyroJerkMag-kurtosis()', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'subject', 'Activity'],
      dtype='object', length=563)

In [10]:
#Summary of train set
train.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,0.272522,-0.017315,-0.106699,-0.564767,-0.421911,-0.601705,-0.596136,-0.439798,-0.591796,-0.411679,...,-0.219581,-0.541213,0.015125,0.002728,0.004282,-0.013146,-0.545935,0.058899,-0.03347,2.936937
std,0.070183,0.041918,0.056029,0.428018,0.501715,0.340713,0.40018,0.486153,0.351317,0.530312,...,0.351241,0.360838,0.349059,0.469766,0.629273,0.481251,0.471809,0.349956,0.168279,1.636577
min,-0.361205,-0.684097,-1.0,-0.9993,-0.998359,-0.999454,-0.999407,-0.998077,-0.999808,-0.971348,...,-0.944282,-0.999595,-0.939598,-0.976454,-0.995222,-0.994877,-1.0,-0.875487,-0.980143,1.0
25%,0.258468,-0.025925,-0.122726,-0.990822,-0.968894,-0.973927,-0.992207,-0.970928,-0.972885,-0.933391,...,-0.476273,-0.815733,-0.144342,-0.31622,-0.51215,-0.392773,-0.795959,0.030145,-0.103119,1.0
50%,0.277054,-0.017185,-0.108829,-0.464909,-0.208523,-0.486962,-0.509215,-0.242165,-0.465163,-0.297181,...,-0.264434,-0.629299,0.010903,0.017954,0.012891,-0.016025,-0.7173,0.223164,0.030593,3.0
75%,0.290635,-0.007523,-0.093717,-0.234604,0.04561,-0.314822,-0.289766,0.012828,-0.302503,-0.008596,...,-0.015262,-0.361251,0.178357,0.332586,0.538985,0.350713,-0.606748,0.281283,0.082679,5.0
max,0.498177,0.32413,0.346658,0.543347,0.532506,0.364114,0.495926,0.50226,0.554965,0.680338,...,0.989538,0.956845,0.955207,0.998425,0.994519,0.971511,0.799174,0.385117,0.265795,6.0


## Logistic Regression

In [26]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

Use a funcion to avoid importing the data again

In [27]:
def get_all_data():
    train_values = train_df.values
    test_values = test_df.values
    np.random.shuffle(train_values)
    np.random.shuffle(test_values)
    X_train = train_values[:, :-1]
    X_test = test_values[:, :-1]
    y_train = train_values[:,-1]
    y_test = test_values[:, -1]
    return X_train, X_test, y_train, y_test

In [28]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = get_all_data()

In [29]:
model = LogisticRegression()
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
#Fit model
model.fit(X_test, y_test)

#Get score
model.score(X_test, y_test)

0.997997997997998

The Logistic Regression produces an accuracy of 99.8%

## Principal Component Analysis and Feature Scaling

In [38]:
from sklearn.decomposition import PCA
X_train, X_test, y_train, y_test = get_all_data()

#Instantiate class object
pca = PCA(n_components=200)

#Fit to data
pca.fit(X_train)

print(pca.components_)


[[ 1.69050781e-04 -4.44212412e-04 -3.51434954e-05 ...  3.60473015e-02
   1.81230971e-02 -1.82097540e-02]
 [ 7.38946061e-04  7.15791678e-04  1.12505270e-03 ... -1.29486130e-02
   4.67550235e-03  8.52259563e-01]
 [-7.34071586e-03  7.09003443e-04  2.29145703e-03 ...  6.28070119e-02
   1.46603696e-02 -2.78034066e-01]
 ...
 [ 4.56854439e-02  1.09271844e-02  2.75511899e-02 ...  1.36408623e-02
  -8.98290821e-03  8.72882158e-04]
 [-2.02503989e-02  1.84064934e-02  3.54089846e-02 ...  9.70384512e-03
   4.72168458e-03 -4.82442096e-03]
 [ 1.90287517e-02  3.32159030e-02  1.45884106e-02 ...  3.01440464e-02
  -1.43562194e-02 -1.04735495e-03]]


In [33]:
print(pca.explained_variance_)

[3.26828494e+01 3.29185944e+00 2.81964845e+00 1.82417022e+00
 9.17162621e-01 7.54880860e-01 6.72620154e-01 5.69026191e-01
 5.46259311e-01 4.56198804e-01 4.33537144e-01 3.87110814e-01
 3.30682413e-01 3.17879118e-01 2.76705101e-01 2.61502406e-01
 2.58752305e-01 2.44371533e-01 2.30450839e-01 2.06005508e-01
 2.03844683e-01 1.88636504e-01 1.80148115e-01 1.74783897e-01
 1.67164409e-01 1.61293153e-01 1.55676548e-01 1.45678053e-01
 1.41094924e-01 1.31585503e-01 1.30438282e-01 1.26758072e-01
 1.22143328e-01 1.17014773e-01 1.10498787e-01 1.07778478e-01
 1.06850513e-01 1.03765349e-01 9.76851413e-02 9.60614644e-02
 9.34307746e-02 8.97794761e-02 8.44509467e-02 8.30375096e-02
 7.70234556e-02 7.61351491e-02 7.36644054e-02 7.14232569e-02
 6.98479559e-02 6.63275471e-02 6.56889385e-02 6.25868123e-02
 6.03134809e-02 5.85536001e-02 5.77508266e-02 5.56442337e-02
 5.34982119e-02 5.23770104e-02 5.17796152e-02 4.98879866e-02
 4.90057409e-02 4.75740001e-02 4.60806834e-02 4.53941644e-02
 4.45880742e-02 4.303311

In [39]:
#Transform model
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

#Fit to model
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8798798798798799

## Feature scaling

In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train, X_test, y_train, y_test = get_all_data()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8408408408408409

## Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = get_all_data()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators = 500)

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9059059059059059

## KNN

In [43]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

#Separate features and labels
X_train = train.drop('Activity', axis = 1).values
y_train = train.Activity.values

X_test = test.drop('Activity', axis = 1).values
y_test = test.Activity.values

#Encoding y
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()

#Encode y_train
encoder.fit(y_train)
y_train_e = encoder.transform(y_train)

#Encode y_test
encoder.fit(y_test)
y_test_e = encoder.transform(y_test)

In [46]:
clf = KNeighborsClassifier(n_neighbors=24)

knn = clf.fit(X_train, y_train_e)
y_preds=clf.predict(X_test)

acc = accuracy_score(y_test_e, y_preds)
print('KNN accuracy: %.2f' % (acc))

KNN accuracy: 0.76


## Decision Tree 

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dtc = DecisionTreeClassifier()

tree = dtc.fit(X_train, y_train_e)
y_preds=dtc.predict(X_test)

acc = accuracy_score(y_test_e, y_preds)
print('Decision Tree accuracy: %.2f'% (acc))

Decision Tree accuracy: 0.75


Classification models
1. Logistic regression : 0.99
2. Random forest : 0.91
3. KNN accuracy : 0.76
4. Decision Tree Accuracy : 0.75

End of Notebook!!!