In [1]:
import numpy as np 
import pandas as pd 
import os 
import scipy.stats
from tqdm.notebook import tqdm

In [2]:
classes = ['running', 'idle', 'walking', 'stairs']

In [3]:
# read each file separatelly 
def read_file(path_to_file):
    data = pd.read_csv(path_to_file)
    return data

###  Generate some numeric statictical features

In [4]:
def calculate_simple_features(data):
    # columns  ['accelerometer_X'...]
    new_line = pd.DataFrame(columns=['min_x'])
    new_line.loc[0,'min_x'] = np.min(data['accelerometer_X'])
    new_line.loc[0,'mean_x'] = np.mean(data['accelerometer_X'])
    new_line.loc[0, 'max_x'] = np.max(data['accelerometer_X'])
    new_line.loc[0,'idr_x'] = scipy.stats.iqr(data['accelerometer_X'])
    new_line.loc[0, 'median_x'] = np.median(data['accelerometer_X'])
    new_line.loc[0,'std_x'] = np.std(data['accelerometer_X'])
    new_line.loc[0, 'curt_x'] = scipy.stats.kurtosis(data['accelerometer_X'])
    new_line.loc[0,'min_y'] = np.min(data['accelerometer_Y'])
    new_line.loc[0,'mean_y'] = np.mean(data['accelerometer_Y'])
    new_line.loc[0, 'max_y'] = np.max(data['accelerometer_Y'])
    new_line.loc[0,'idr_y'] = scipy.stats.iqr(data['accelerometer_Y'])
    new_line.loc[0, 'median_y'] = np.median(data['accelerometer_Y'])
    new_line.loc[0,'std_y'] = np.std(data['accelerometer_Y'])
    new_line.loc[0, 'curt_y'] = scipy.stats.kurtosis(data['accelerometer_Y'])
    new_line.loc[0,'min_z'] = np.min(data['accelerometer_Z'])
    new_line.loc[0,'mean_z'] = np.mean(data['accelerometer_Z'])
    new_line.loc[0, 'max_z'] = np.max(data['accelerometer_Z'])
    new_line.loc[0,'idr_z'] = scipy.stats.iqr(data['accelerometer_Z'])
    new_line.loc[0, 'median_z'] = np.median(data['accelerometer_Z'])
    new_line.loc[0,'std_z'] = np.std(data['accelerometer_Z'])
    new_line.loc[0, 'curt_z'] = scipy.stats.kurtosis(data['accelerometer_Z'])
    return new_line

In [5]:
# Process data in file and return it as numpy array
def process_file_data(data):
    new_data = calculate_simple_features(data) 
    return new_data.to_numpy().flatten()

In [6]:
full_data = pd.DataFrame(columns=['class', 'samples'])
for class_name in classes: 
    folder_name = os.path.join('data', class_name)
    class_samples = os.listdir(folder_name)
    for sample in tqdm(class_samples):
        if not sample.startswith('.'): 
            sample_data = read_file(os.path.join(folder_name, sample))
            np_sample = process_file_data(sample_data)
            full_data = full_data.append({'class':class_name, 'samples':np_sample}, ignore_index=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3408.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1039.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1850.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=165.0), HTML(value='')))




In [7]:
full_data.head(10)

Unnamed: 0,class,samples
0,running,"[-5.295974, 7.409415866666667, 27.087997, 10.5..."
1,running,"[-7.656657000000001, 5.468037566666667, 23.879..."
2,running,"[-5.2241480000000005, 7.893204099999999, 38.19..."
3,running,"[-8.729258999999999, 6.006892633333335, 27.092..."
4,running,"[-7.522581, 8.3963057, 38.19709, 14.747085, 5...."
5,running,"[-4.0845080000000005, 5.823337066666667, 22.79..."
6,running,"[-39.188293, -6.708233933333332, 10.357317, 13..."
7,running,"[-9.308656, 5.2417059, 28.88365, 16.300922, 4...."
8,running,"[-8.944737, 3.9925710333333337, 28.88365, 13.3..."
9,running,"[-27.207708, -1.9991583666666661, 14.283808, 1..."


In [8]:
full_data.tail(10)

Unnamed: 0,class,samples
6452,stairs,"[-11.238382, -3.2609025333333332, 3.969586, 5...."
6453,stairs,"[-10.036493, -3.4401484666666664, 2.987964, 3...."
6454,stairs,"[-4.76925, 1.2304601666666668, 5.88016, 2.8526..."
6455,stairs,"[-9.528923, -0.44659826666666647, 5.272033, 5...."
6456,stairs,"[-7.426813, -0.1813206666666666, 5.272033, 4.8..."
6457,stairs,"[-7.426813, -0.2681503333333332, 5.272033, 4.8..."
6458,stairs,"[-9.528923, -0.4102063999999999, 5.272033, 5.2..."
6459,stairs,"[-4.76925, 1.0251972999999999, 5.88016, 4.1084..."
6460,stairs,"[-10.036493, -3.485957566666666, 2.987964, 3.4..."
6461,stairs,"[-11.238382, -3.4840421333333333, 3.969586, 4...."


### Now, as we have the full dataset created, we can make simple analysis. Let's check number of samples per class. 

In [9]:
# Number of data per class
full_data['class'].value_counts()

running    3408
walking    1850
idle       1039
stairs      165
Name: class, dtype: int64

As we can see the dataset is unbalanced, with majority of samples in 'running' and the smallest amount of data in 'stairs'.

### Now we need to create train / test split

In [10]:
train=full_data.sample(frac=0.8,random_state=200) #random state is a seed value
test=full_data.drop(train.index)

In [11]:
train['class'].value_counts()

running    2726
walking    1464
idle        850
stairs      130
Name: class, dtype: int64

In [12]:
test['class'].value_counts()

running    682
walking    386
idle       189
stairs      35
Name: class, dtype: int64

In [13]:
# Save train/test data 
train.to_csv('train.csv')
test.to_csv('test.csv')

In [14]:
X = np.array(train['samples'].tolist())
y = train['class'].factorize()[0]

In [15]:
X_test = np.array(test['samples'].tolist())
y_test = test['class'].factorize()[0]

### Gaussian Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [17]:
model = GaussianNB()

In [18]:
model.fit(X, y)

GaussianNB()

In [21]:
y_train_model = model.predict(X)

In [22]:
accuracy_score(y, y_train_model)

0.9849129593810445

In [19]:
y_model = model.predict(X_test)

In [20]:
accuracy_score(y_test, y_model)

0.02631578947368421

We have a good classification accuracy on train, but it's terrible on test. 

### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [49]:
X, y = make_classification(n_samples=1000, n_features=21,
                           n_redundant=0,
                           random_state=0, shuffle=False)

In [50]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [51]:
clf.fit(X, y)

RandomForestClassifier(max_depth=2, random_state=0)

In [52]:
y_test_model = clf.predict(X_test)

In [53]:
accuracy_score(y_test, y_test_model)

0.19659442724458204

In [54]:
y_train_model = model.predict(X)

In [55]:
accuracy_score(y, y_train_model)

0.239

Here the results are worse :( 