In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
import time
from IPython.display import display 

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

In [3]:
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing
from sklearn import tree

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import RandomForestClassifier,  AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [5]:
# Load the recorded sensor data. Forward slashes resolve on Windows, Linux and
# macOS alike, whereas the previous backslash literal only worked on Windows.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
dt = pd.read_pickle('data/data.pkl')

In [6]:
# Group the loaded frame by its activity label.
activities = dt.groupby(by='activity')

Just keep 4 activities



In [7]:
# Activities retained for the experiments; every other row is discarded.
vals = ['standing', 'walking', 'running', 'cycling']
# Work on an independent copy so later in-place edits do not touch `dt`.
data = dt[dt['activity'].isin(vals)].copy()

In [8]:
# Number of rows per retained activity.
data['activity'].value_counts()

walking     229709
standing    188984
cycling     163302
running      95641
Name: activity, dtype: int64

Cut off the first and last 1000 samples of each activity, because the recording contains the moments where the activity starts and ends.



In [9]:
# For every activity, drop its first 1000 rows and then the last 1000 of the
# rows that remain. Rows are removed by index label, in place.
for act_name in ('running', 'walking', 'cycling', 'standing'):
    for edge_rows in (slice(None, 1000), slice(-1000, None)):
        data.drop(data[data.activity == act_name].iloc[edge_rows].index, inplace=True)

In [10]:
# Row counts per activity after trimming the edges.
data['activity'].value_counts()

walking     227709
standing    186984
cycling     161302
running      93641
Name: activity, dtype: int64

# Features

## Absolute Acceleration


absolute acceleration: $|a|=\sqrt{a_x^2 + a_y^2 + a_z^2}$

to get rid of the orientation of the device

In [11]:
def absacc(row):
    """Return the magnitude of the chest acceleration vector, scaled by 1/9.806.

    `row` is anything that can be indexed with the keys 'IMU_chest_ax1',
    'IMU_chest_ay1' and 'IMU_chest_az1' (a DataFrame row, a dict, or a whole
    DataFrame). The divisor 9.806 is presumably standard gravity in m/s^2,
    which would express the result in units of g - confirm against the data.
    """
    ax = row['IMU_chest_ax1']
    ay = row['IMU_chest_ay1']
    az = row['IMU_chest_az1']
    squared_norm = ax**2 + ay**2 + az**2
    return np.sqrt(squared_norm)/9.806

In [12]:
# absacc only indexes columns by name and uses NumPy arithmetic, so it can be
# evaluated on the whole frame in one vectorized call instead of being invoked
# once per row through DataFrame.apply(axis=1).
data['absacc'] = absacc(data)

## Max-Min Difference of absolute Acceleration

In [13]:
# Rolling-window length in samples, used by the feature cells below.
ws=512
# Sampling interval in seconds: 1/dt = 100 Hz, so the spectrum built in
# fft_amplitude reaches up to 50 Hz.
# NOTE(review): this rebinds `dt`, which previously held the loaded DataFrame.
dt = 1.0/100.0 # sampling interval for a 100 Hz sampling rate


In [14]:
# Trailing-window maximum and minimum of the absolute acceleration
# (window of `ws` samples ending at the current row).
data['accmax'] = data['absacc'].rolling(ws, center=False).max()
data['accmin'] = data['absacc'].rolling(ws, center=False).min()

# Spread of the absolute acceleration inside each window.
data['accmaxmindiff'] = data['accmax'] - data['accmin']

## Fourier Transform of Rotation Rates


In [15]:
def fft_amplitude(s, kind='peak', sample_interval=None):
    """Summarise the Hann-windowed amplitude spectrum of a signal window.

    Parameters
    ----------
    s : array-like
        One-dimensional window of samples (NumPy array or pandas Series).
    kind : {'peak', 'periodicity', 'full'}
        'peak'        -> largest amplitude in the one-sided spectrum.
        'periodicity' -> largest amplitude divided by the mean amplitude.
        'full'        -> tuple ``(f, Y)`` of frequency axis and amplitudes.
    sample_interval : float, optional
        Time between two samples, only needed for ``kind='full'``. When
        omitted, the notebook-level variable ``dt`` is used, as before.

    Returns
    -------
    float, or ``(f, Y)`` for ``kind='full'``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values (previously the
        function silently returned None in that case).
    """
    if kind not in ('peak', 'periodicity', 'full'):
        raise ValueError("kind must be 'peak', 'periodicity' or 'full', got %r" % (kind,))

    # don't forget the windowing to get rid of the leakage effect
    hann = np.hanning(len(s))

    # do the FFT with Hanning Window
    Yhann = np.fft.fft(hann*s)

    # right half is enough info (positive freqs only)
    N = len(Yhann)//2 + 1
    Y = 2*(np.abs(Yhann[:N])/N)

    if kind == 'peak':
        return np.max(Y) # just return the maximum peak amplitude
    if kind == 'periodicity':
        return np.max(Y) / np.mean(Y) # return periodicity

    # kind == 'full': the frequency axis is only built here, so the scalar
    # modes no longer depend on (or evaluate) the global `dt`.
    if sample_interval is None:
        sample_interval = dt
    fa = 1.0/sample_interval
    f = np.linspace(0, fa/2.0, N, endpoint=True)
    return f, Y # return the full spectrum

In [16]:
# Peak spectral amplitude of the chest z rotation rate over each trailing
# window of `ws` samples. raw=True hands every window to fft_amplitude as a
# plain NumPy array instead of building a Series per window; the resulting
# values are the same, it is only faster.
data['fftamppeak'] = data['IMU_chest_rotz'].rolling(window=ws, center=False).apply(fft_amplitude, raw=True)


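Because the features are computed with rolling windows, the first rows of each activity are calculated from a window that still reaches back into the preceding activity, so those feature values do not match their labels. We therefore delete the first `ws - 1` rows (one window length) of each activity before using a classifier.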



In [17]:
# Remove the first ws-1 rows of each retained activity ID, in place.
# (The table shown below pairs activityID 3 with 'standing'; IDs 4-6
# presumably correspond to the other three kept activities - confirm.)
for activity_id in (3, 4, 5, 6):
    warmup_rows = data[data.activityID == activity_id].iloc[:int(ws) - 1]
    data.drop(warmup_rows.index, inplace=True)

In [18]:
# Preview the first five rows including the new feature columns.
data.head(n=5)

Unnamed: 0,activityID,activity,heartrate,IMU_hand_temp,IMU_hand_ax1,IMU_hand_ay1,IMU_hand_az1,IMU_hand_ax2,IMU_hand_ay2,IMU_hand_az2,...,IMU_ankle_roty,IMU_ankle_rotz,IMU_ankle_magx,IMU_ankle_magy,IMU_ankle_magz,absacc,accmax,accmin,accmaxmindiff,fftamppeak
55106,3,standing,101,32.6875,0.549558,7.3542,6.75881,0.48419,7.79716,6.62053,...,0.01035,-0.053009,-88.223999,32.996899,-4.05403,1.003447,1.048333,0.963532,0.084801,0.146737
55107,3,standing,101,32.6875,0.513651,6.86262,6.91388,0.588009,7.3285,6.89326,...,0.040042,-0.028382,-87.089302,32.827202,-2.57978,0.999723,1.048333,0.963532,0.084801,0.146762
55108,3,standing,101,32.6875,0.851353,6.7079,6.88097,0.707032,6.95026,7.09028,...,-0.024165,-0.043116,-87.096298,32.838799,-3.44605,1.005541,1.048333,0.963532,0.084801,0.146781
55109,3,standing,101,32.6875,0.765876,6.81962,6.57141,0.750087,6.72339,7.06053,...,0.006977,-0.037264,-87.5457,32.640499,-3.9396,1.002116,1.048333,0.963532,0.084801,0.146793
55110,3,standing,101,32.6875,0.798051,6.78005,6.41808,0.853185,6.63185,6.86432,...,0.012983,0.010073,-87.654099,32.099201,-4.4396,1.003846,1.048333,0.963532,0.084801,0.146798


In [19]:
# Persist the frame with the engineered features. Forward slashes keep the
# path valid on every operating system (the backslash form was Windows-only).
data.to_pickle('data/data_feats.pkl')

# Classification

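10-fold cross-validation is a standard technique used to estimate the performance of any machine learning algorithm on unseen data, and a `KFold` splitter for it is configured below. Note that the comparison in the following cells scores each algorithm on a single random 70/30 train/test split rather than on these folds.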

In [20]:
# Shared seed for the splitter and the seeded estimators.
seed = 7
# random_state only has an effect when shuffling is enabled; current
# scikit-learn versions raise a ValueError if it is set while shuffle=False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# Number of estimators used by the ensemble methods.
num_trees = 100
# Number of features considered per split by the forest-style ensembles.
max_features = 2

In [21]:
# Name -> unfitted estimator. batch_classify walks this dict in insertion order.
# Every estimator with a stochastic fit receives `seed` so that repeated runs
# are comparable.
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors =3),
    #"Linear SVM": SVC(kernel='rbf', C=1.0, gamma=0.5),
    "Decision Tree": tree.DecisionTreeClassifier(random_state=seed),
    "Random Forest10": RandomForestClassifier(n_estimators=10, random_state=seed),
    "Neural Net": MLPClassifier(alpha = 1, random_state=seed),
    "Naive Bayes": GaussianNB(),
    
    #Bagging Algos
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while the
    # first positional slot is the same in both.
    "Bagged Decision Trees": BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=num_trees, random_state=seed),
    "RF": RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed),
    "Extra Trees": ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed),
    #Boosting Algos
    "AdaBoost": AdaBoostClassifier(n_estimators=num_trees, random_state=seed),
    "Stochastic Gradient Boosting": GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
        
}

In [22]:
def get_train_test(df, y_col, x_cols, ratio, random_state=None):
    """
    Randomly split a dataframe into a train and a test set.

    Every row is assigned independently: it goes to the train set when a
    uniform random draw in [0, 1) is below `ratio`, otherwise to the test set.
    The realised train fraction therefore only approximates `ratio`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    y_col : label
        Column with the Y values.
    x_cols : list of labels
        Columns used as features.
    ratio : float
        Approximate share of rows placed in the train set (usually 0.7).
    random_state : int, optional
        Seed for a private random generator, making the split reproducible.
        With the default None the draw comes from NumPy's global random
        state, exactly as before.

    Returns
    -------
    df_train, df_test, X_train, Y_train, X_test, Y_test
        The two sub-frames followed by their feature / target arrays.

    NOTE(review): rows are sampled one by one. If the features come from
    overlapping rolling windows, neighbouring rows are strongly correlated and
    can land on both sides of the split, which would make test scores
    optimistic - consider splitting by contiguous segment instead.
    """
    if random_state is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.RandomState(random_state).rand(len(df))
    mask = draws < ratio
    df_train = df[mask]
    df_test = df[~mask]

    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    X_train = df_train[x_cols].values
    X_test = df_test[x_cols].values
    return df_train, df_test, X_train, Y_train, X_test, Y_test



In [23]:
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 5, verbose = True, classifiers = None):
    """
    Fit a batch of classifiers on the train set and score them on both sets.

    Parameters
    ----------
    X_train, Y_train : feature matrix and labels used for fitting.
    X_test, Y_test : feature matrix and labels used for evaluation.
    no_classifiers : int
        Only the first `no_classifiers` entries (in dict order) are trained.
    verbose : bool
        Print the training time of each classifier.
    classifiers : dict, optional
        Mapping name -> unfitted estimator. Defaults to the notebook-level
        `dict_classifiers`.

    Returns
    -------
    dict
        name -> {'model', 'train_score', 'test_score', 'train_time',
        'y_true', 'accuracy_score', 'y_pred'}. A dictionary is used because
        it is easy to save as a whole with the pickle module.

    Each estimator is deep-copied before fitting. Previously the shared
    instances were refitted in place, so a second call silently replaced the
    models referenced by the result of the first call.

    'y_true' holds the true test labels and 'y_pred' the predictions; these
    two entries used to be stored the wrong way round.

    Usually, the SVM, Random Forest and Gradient Boosting Classifier take quite
    some time to train. So it is best to train them on a smaller dataset first
    and decide whether you want to comment them out or not based on the test
    accuracy score.
    """
    import copy  # local import keeps this cell runnable on its own

    if classifiers is None:
        classifiers = dict_classifiers

    dict_models = {}
    for classifier_name, template in list(classifiers.items())[:no_classifiers]:
        classifier = copy.deepcopy(template)

        # process_time measures CPU time of the process, not wall-clock time.
        t_start = time.process_time()
        classifier.fit(X_train, Y_train)
        t_end = time.process_time()
        t_diff = t_end - t_start

        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        Y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)

        dict_models[classifier_name] = {'model': classifier, 'train_score': train_score, 'test_score': test_score, 'train_time': t_diff, 'y_true': Y_test, 'accuracy_score': accuracy, 'y_pred': Y_pred}
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
    return dict_models

In [24]:
def display_dict_models(dict_models, sort_by='test_score'):
    """
    Show the results collected by batch_classify as a sorted table.

    Parameters
    ----------
    dict_models : dict
        Mapping classifier name -> result dict providing the keys
        'train_score', 'test_score', 'train_time' and 'accuracy_score'.
    sort_by : str
        Column to sort by, in descending order.

    The table is rendered with IPython's display(); nothing is returned.
    'accuracy_score' is shown as a percentage (value * 100).
    """
    columns = ['classifier', 'train_score', 'test_score', 'train_time', 'accuracy_score']
    # Build the frame from records in one go. The earlier approach filled a
    # float-zeros frame cell by cell, which writes strings into a float column
    # (an incompatible-dtype assignment that newer pandas versions reject).
    rows = [
        {
            'classifier': name,
            'train_score': result['train_score'],
            'test_score': result['test_score'],
            'train_time': result['train_time'],
            'accuracy_score': result['accuracy_score']*100,
        }
        for name, result in dict_models.items()
    ]
    df_ = pd.DataFrame(rows, columns=columns)

    display(df_.sort_values(by=sort_by, ascending=False))

In [25]:
# Target vector: the activity name of every remaining row.
labels = data['activity'].values
labels.shape

(667592,)

### Classification with Features

In [26]:
# The two engineered features used for this experiment.
featurevector = ['accmaxmindiff', 'fftamppeak']

features = data.loc[:, featurevector].values
features.shape

(667592, 2)

In [27]:
# Target column and approximate share of rows used for training.
y_col = 'activity'

train_test_ratio = 0.7

# Random row-wise split on the engineered feature columns.
df_train, df_test, features_train, labels_train, features_test, labels_test = get_train_test(
    df=data,
    y_col=y_col,
    x_cols=featurevector,
    ratio=train_test_ratio,
)

In [28]:
#features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.4, random_state=42)

In [29]:
# Train and score the first 11 entries of dict_classifiers on the two engineered features.
dict_models = batch_classify(
    X_train=features_train,
    Y_train=labels_train,
    X_test=features_test,
    Y_test=labels_test,
    no_classifiers=11,
)

trained Logistic Regression in 2.64 s
trained Nearest Neighbors in 0.94 s
trained Decision Tree in 2.95 s
trained Random Forest10 in 10.16 s
trained Neural Net in 134.00 s
trained Naive Bayes in 0.83 s
trained Bagged Decision Trees in 131.84 s
trained RF in 146.53 s
trained Extra Trees in 41.92 s
trained AdaBoost in 93.62 s
trained Stochastic Gradient Boosting in 226.42 s


In [30]:
# Result table for the engineered-feature run, best test score first.
display_dict_models(dict_models, sort_by='test_score')

Unnamed: 0,classifier,train_score,test_score,train_time,accuracy_score
8,Extra Trees,1.0,0.997704,41.921875,99.770372
6,Bagged Decision Trees,0.999996,0.996995,131.84375,99.699487
7,RF,0.999996,0.99693,146.53125,99.692997
2,Decision Tree,1.0,0.996401,2.953125,99.640083
3,Random Forest10,0.999581,0.991504,10.15625,99.150376
1,Nearest Neighbors,0.989255,0.982943,0.9375,98.294263
10,Stochastic Gradient Boosting,0.945783,0.945738,226.421875,94.57379
9,AdaBoost,0.91724,0.917154,93.625,91.715421
4,Neural Net,0.915639,0.915672,134.0,91.567161
5,Naive Bayes,0.875412,0.875092,0.828125,87.509235


In [31]:
import pickle
# Store data (serialize). Forward slashes keep the path valid on every
# operating system (the backslash form only resolved on Windows).
with open('data/2featclass.pickle', 'wb') as handle:
    pickle.dump(dict_models, handle)

# Load data (deserialize)
#with open('2featclass.pickle', 'rb') as handle:
 #   unserialized_data = pickle.load(handle)

### Classification without Features

In [32]:
# Use every column as an input except the labels and the engineered features.
unwanted = {'activity','activityID','absacc','accmin','accmax','accmaxmindiff','fftamppeak'}
x_cols = [col for col in data.columns if col not in unwanted]
# NOTE(review): this rebinds `dt`, the name fft_amplitude reads as the sampling
# interval - re-running the FFT cell after this one would use the wrong value.
dt = data[x_cols].values
dt.shape

(667592, 40)

In [33]:
# Random row-wise split on the raw sensor columns.
df_train, df_test, x_train, y_train, x_test, y_test = get_train_test(
    df=data,
    y_col=y_col,
    x_cols=x_cols,
    ratio=train_test_ratio,
)

In [34]:
#x_train, x_test, y_train, y_test = train_test_split(dt, labels, test_size=0.4, random_state=42)

In [35]:
# Train and score the same classifiers on the raw sensor columns, then show the table.
dict_models1 = batch_classify(
    X_train=x_train,
    Y_train=y_train,
    X_test=x_test,
    Y_test=y_test,
    no_classifiers=11,
)
display_dict_models(dict_models1, sort_by='test_score')



trained Logistic Regression in 557.06 s
trained Nearest Neighbors in 4.05 s
trained Decision Tree in 51.94 s
trained Random Forest10 in 53.86 s
trained Neural Net in 324.31 s
trained Naive Bayes in 0.94 s
trained Bagged Decision Trees in 4087.00 s
trained RF in 201.14 s
trained Extra Trees in 61.23 s
trained AdaBoost in 418.14 s
trained Stochastic Gradient Boosting in 1178.33 s


Unnamed: 0,classifier,train_score,test_score,train_time,accuracy_score
8,Extra Trees,1.0,0.999995,61.234375,99.9995
7,RF,1.0,0.999965,201.140625,99.996502
1,Nearest Neighbors,0.999981,0.99992,4.046875,99.992004
3,Random Forest10,1.0,0.999885,53.859375,99.988506
10,Stochastic Gradient Boosting,0.99994,0.999885,1178.328125,99.988506
6,Bagged Decision Trees,1.0,0.999615,4087.0,99.961518
2,Decision Tree,1.0,0.999085,51.9375,99.908544
4,Neural Net,0.996616,0.996847,324.3125,99.684651
0,Logistic Regression,0.980295,0.980869,557.0625,98.086918
5,Naive Bayes,0.955801,0.955596,0.9375,95.559631


In [36]:
#classifier.fit(x_train, y_train)
#train_score = classifier.score(x_train, y_train)
#test_score = classifier.score(x_test, y_test)
#Y_true = classifier.predict(x_test)
#accuracy = accuracy_score(Y_true, y_test)
#print(accuracy)

In [38]:
# Store data (serialize)
# This file belongs to the run on the raw sensor columns, whose results live in
# `dict_models1`; the cell used to dump `dict_models` (the two-feature run) a
# second time. Forward slashes keep the path valid on every operating system.
with open('data/class.pickle', 'wb') as handle:
    pickle.dump(dict_models1, handle)

# Load data (deserialize)
#with open('2featclass.pickle', 'rb') as handle:
 #   unserialized_data = pickle.load(handle)