In [28]:
import os
import time
from datetime import datetime

import librosa
import numpy as np
import pandas as pd
import skl2onnx
import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.svm import SVC

In [35]:
def feature_extraction(data, sr=100, window_length = 256, hop_length = 128):
    """Compute framed RMS and zero-crossing-rate features for every column.

    Each column of ``data`` is converted to a float64 array and handed to
    ``librosa.feature.zero_crossing_rate`` and ``librosa.feature.rms`` using
    the same frame and hop sizes.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; every column is processed independently.
    sr : int, optional
        Accepted for interface compatibility; not used by this function.
    window_length : int, optional
        Forwarded as ``frame_length`` to both librosa calls.
    hop_length : int, optional
        Forwarded as ``hop_length`` to both librosa calls.

    Returns
    -------
    pandas.DataFrame
        Two columns per input column, named ``"<col> RMS"`` and
        ``"<col> ZCR"``, added in that order.
    """
    framing = dict(frame_length=window_length, hop_length=hop_length)
    features = pd.DataFrame()

    for name in data.columns:
        signal = np.array(data[name], dtype=np.float64)

        crossing_rate = librosa.feature.zero_crossing_rate(y=signal, **framing)
        energy = librosa.feature.rms(y=signal, **framing)

        # squeeze() removes the length-1 axes of the librosa results before
        # they are stored as columns.
        features[name + " RMS"] = energy.squeeze()
        features[name + " ZCR"] = crossing_rate.squeeze()

    return features


def save_metrics(metrics_path,date_and_time,model_name,accuracy_score,c_val,gamma,kernel,rows,time,seed):
    """Append one training-run record to a metrics CSV, creating it if needed.

    Parameters
    ----------
    metrics_path : str
        Path of the CSV file that accumulates the run history.
    date_and_time : str
        Timestamp stored in the ``Date Time`` column.
    model_name : str
        Stored in the ``Name`` column.
    accuracy_score : float
        Stored in the ``Accuracy`` column.
    c_val, gamma, kernel
        Hyper-parameters stored in the ``C``, ``Gamma`` and ``Kernel`` columns.
    rows : int
        Stored in the ``Rows`` column.
    time : float
        Training duration stored in the ``Training Time`` column.
    seed : int
        Stored in the ``Seed`` column.

    Returns
    -------
    None
        The record is written to ``metrics_path`` as a side effect.
    """
    metrics_frame = pd.DataFrame({
        'Date Time': date_and_time,
        'Name': model_name,
        'Accuracy': accuracy_score,
        'C': c_val,
        'Gamma': gamma,
        'Kernel': kernel,
        'Rows': rows,
        # Use the caller-supplied duration. Reading a notebook-level global
        # here would raise NameError or silently record a stale value.
        'Training Time': time,
        'Seed': seed
    }, index=[0])

    # Make sure the target directory exists so the record is not lost after a
    # (possibly long) training run.
    parent_dir = os.path.dirname(metrics_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # If a metrics CSV already exists, append to its history instead of
    # overwriting it.
    if os.path.exists(metrics_path):
        history = pd.read_csv(metrics_path, index_col=0)
        metrics_frame = pd.concat([history, metrics_frame], ignore_index=True)

    metrics_frame.to_csv(metrics_path)

In [3]:
data_path = "./data/"

# PRE PROCESSING

# Columns that stay idle for the current task. They are dropped further down
# so the models train on fewer inputs.
cols_to_ignore = ["Timestamp", "D-Pad", "Touch", "L3", "R3", "L1", "R1", "L2"]

# Row window kept from every recording: trims the slow start and finish and
# gives every recording the same length.
start_slice = 500
end_slice = 5500

# Numeric class id -> activity name
labels_dict = dict(enumerate(["idle", "low_activity", "medium_activity", "high_activity"]))

# Raw Data: one recording per activity level, each with a constant label vector
idle_frame_raw = pd.read_csv(data_path + "idle.csv").iloc[start_slice:end_slice]
idle_labels_raw = np.zeros(len(idle_frame_raw))

low_activity_frame_raw = pd.read_csv(data_path + "low_activity.csv").iloc[start_slice:end_slice]
low_activity_labels_raw = np.full(len(low_activity_frame_raw), 1.0)

medium_activity_frame_raw = pd.read_csv(data_path + "medium_activity.csv").iloc[start_slice:end_slice]
medium_activity_labels_raw = np.full(len(medium_activity_frame_raw), 2.0)

high_activity_frame_raw = pd.read_csv(data_path + "high_activity.csv").iloc[start_slice:end_slice]
high_activity_labels_raw = np.full(len(high_activity_frame_raw), 3.0)

# Feature Extract: framed RMS / ZCR per recording, labelled per frame
idle_frame_features = feature_extraction(idle_frame_raw)
idle_labels_features = np.zeros(len(idle_frame_features))

low_activity_frame_features = feature_extraction(low_activity_frame_raw)
low_activity_labels_features = np.full(len(low_activity_frame_features), 1.0)

medium_activity_frame_features = feature_extraction(medium_activity_frame_raw)
medium_activity_labels_features = np.full(len(medium_activity_frame_features), 2.0)

high_activity_frame_features = feature_extraction(high_activity_frame_raw)
high_activity_labels_features = np.full(len(high_activity_frame_features), 3.0)

# Concatenate Raw Dataset (rows re-indexed 0..N-1)
labels_raw = np.concatenate([
    idle_labels_raw,
    low_activity_labels_raw,
    medium_activity_labels_raw,
    high_activity_labels_raw,
])
data_raw = pd.concat(
    [idle_frame_raw, low_activity_frame_raw, medium_activity_frame_raw, high_activity_frame_raw],
    ignore_index=True,
)

# Concatenate Extracted Features Dataset (rows re-indexed 0..N-1)
labels_features = np.concatenate([
    idle_labels_features,
    low_activity_labels_features,
    medium_activity_labels_features,
    high_activity_labels_features,
])
data_features = pd.concat(
    [idle_frame_features, low_activity_frame_features, medium_activity_frame_features, high_activity_frame_features],
    ignore_index=True,
)

# Remove unwanted features.
# A column is dropped when its name contains any of the ignore tokens. The
# names are collected first and dropped in one call, so a column matching
# several tokens is not dropped twice (which would raise KeyError).
raw_cols_to_drop = [
    col for col in data_raw.columns
    if any(header in col for header in cols_to_ignore)
]
data_raw = data_raw.drop(columns=raw_cols_to_drop)

feature_cols_to_drop = [
    col for col in data_features.columns
    if any(header in col for header in cols_to_ignore)
]
data_features = data_features.drop(columns=feature_cols_to_drop)

# There is a leading whitespace in each header. This removes it.
# Only names that start with a space are touched.
data_raw = data_raw.rename(
    columns={col: col.strip() for col in data_raw.columns if col.startswith(" ")}
)
data_features = data_features.rename(
    columns={col: col.strip() for col in data_features.columns if col.startswith(" ")}
)

# Persist the cleaned datasets next to the source recordings.
data_features.to_csv(data_path + "data_features.csv")
data_raw.to_csv(data_path + "data_raw.csv")
pd.Series(labels_features).to_csv(data_path + "labels_features.csv")
pd.Series(labels_raw).to_csv(data_path + "labels_raw.csv")
          

In [4]:
seed = 5550

# Both datasets use the same stratified 80/20 split with the same seed so the
# runs are reproducible and every class keeps its proportion in each part.
# No separate validation split is made here; the previously disabled
# prototypes (kept as string literals) were removed because the last one was
# echoed as cell output and assigned raw_data_train twice.
#
# NOTE(review): feature frames are extracted with hop_length < window_length
# by default, so neighbouring frames overlap; a random split can therefore put
# overlapping frames in both train and test. Confirm this is acceptable.

# Training using features
feat_data_train, feat_data_test, feat_lab_train, feat_lab_test = sklearn.model_selection.train_test_split(
    data_features,
    labels_features,
    train_size=0.8,
    random_state=seed,
    stratify=labels_features,
)

# Training using raw samples
raw_data_train, raw_data_test, raw_lab_train, raw_lab_test = sklearn.model_selection.train_test_split(
    data_raw,
    labels_raw,
    train_size=0.8,
    random_state=seed,
    stratify=labels_raw,
)

In [5]:
# Standardise the feature split. The scaler is fitted on the training part
# only and then applied to both parts.
scaler = sklearn.preprocessing.StandardScaler().fit(feat_data_train)

feat_train, feat_test = (
    scaler.transform(split) for split in (feat_data_train, feat_data_test)
)

In [6]:
from sklearn.svm import SVC

# RBF-kernel SVM fitted on the standardised training features
model_SVM = SVC(kernel='rbf', C=10, gamma=0.1).fit(feat_train, feat_lab_train)

# Predictions for the held-out (test) features
lab_predict = model_SVM.predict(feat_test)

# Report how many test samples were misclassified, and the resulting accuracy
print('Number of mislabeled samples %d out of %d' % (np.count_nonzero(feat_lab_test != lab_predict), feat_lab_test.size))
print('Accuracy:',sklearn.metrics.accuracy_score(feat_lab_test, lab_predict))

Number of mislabeled samples 5 out of 32
Accuracy: 0.84375


In [19]:
# Export ONNX for Unity
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
initial_type = [('float_input', FloatTensorType([None, 500]))]

model_name = "dsc_sklearn_svm_feat"

onnx_model_svm_feat = skl2onnx.convert_sklearn(model_SVM, 
                                            initial_types=initial_type,
                                            name=model_name, 
                                            target_opset=9,
                                            verbose=0)


skl2onnx.helpers.onnx_helper.save_onnx_model(onnx_model_svm_feat, model_name + ".onnx")

b'\x08\x04\x12\x08skl2onnx\x1a\x061.14.0"\x07ai.onnx(\x002\x00:\xc8\x99\x01\n\xbc\x84\x01\n\x0bfloat_input\x12\x05label\x12\x05SVM02\x1a\x04SVMc"\rSVMClassifier*\x1d\n\x10classlabels_ints@\x00@\x01@\x02@\x03\xa0\x01\x07*\xfc\x0b\n\x0ccoefficients=\x81v\x90>=\xbe\xdb\x8c?=:\x18\r?=\x08a,?=.6\x07?=\xd99u?=x|\xff>=\xb83Q>=hO\xd7>=\x00\x00\x00\x80=K\x7fY\xbe=\x05^X\xbe=~\xe5p\xba=\xd1 \xb8\xbe=f[\xad\xbe=\x04\x82Y\xbe=\x00\x00\x00\x80=\xef}d\xbd=+\xdd\xf3\xbd=y\xe6\xfb\xbe=\xa8\xfe\x04\xbe=\x00\x00\x00\x80=G\x9c\xff\xbb=z\x8a\xae\xbe=\xfb4\xad\xbe=\x00\x00\x00\x80=H\x05\xd6\xbb=\x00\x00\x00\x80=\x7f*\xe8\xbd=\xba\x1f\x0e\xbe=5\x1a\x9c\xbd=\xcd@X\xbe=>i\x0c\xbf=\x00\x00\x00\x80=\x00\x00\x00\x80=f\xe1\x83\xbe=\x86\xa4\x9f\xbe=8V\xb7\xbe=j\xb6\t\xbe=\xbcVw\xbe=`\x8e\xd6\xbe=\x00\x00\x00\x80=%\xfd\x03\xbf=\x04\xcd\xb8\xbe=\xa9\xad\x8a\xbe=\x00\x00\x00\x80=\x0c\x99v\xbc=0 \xf4\xbd=\x00\x00\x00\x80=\xff6M\xbe=`L\xf2\xbe=\x00\x00\x00\x80=]\xd2%\xbd=LZ\xd3\xbc=\x00\x00\x00\x80=y\x0b\xc1\xbe=nG+\xb

In [42]:
# Raw Data Model
model_name = "dsc_sklearn_svm_raw"
svm_kernel = "rbf"
svm_gamma = 0.15
c_value = 8

scaler = sklearn.preprocessing.StandardScaler()

scaler.fit(raw_data_train)

raw_train = scaler.transform(raw_data_train)
raw_test = scaler.transform(raw_data_test)

model_SVM_raw = SVC(kernel=svm_kernel, C=c_value, gamma=svm_gamma)

start_time_training = time.time()
model_SVM_raw.fit(raw_train, raw_lab_train)
total_training_time = time.time() - start_time_training

lab_predict =  model_SVM_raw.predict(raw_test)

accuracy_score = sklearn.metrics.accuracy_score(raw_lab_test, lab_predict)
#print the number of misclassified samples, accuracy and complete report (using scikit learn metric tools) 
print('Number of mislabeled samples %d out of %d' % ((raw_lab_test != lab_predict).sum(),raw_lab_test.size))
print('Accuracy:',accuracy_score)

# Save Metrics
now = datetime.now()

# dd/mm/YY H:M:S
date_and_time = now.strftime("%d/%m/%Y %H:%M:%S")

save_metrics("./models/metrics.csv", date_and_time, model_name, accuracy_score,c_value,svm_gamma,svm_kernel,raw_lab_test.size, total_training_time, seed)

onnx_model_svm_raw = skl2onnx.convert_sklearn(model_SVM_raw, 
                                            initial_types=initial_type,
                                            name=model_name, 
                                            target_opset=9,
                                            verbose=0)

saved_model = skl2onnx.helpers.onnx_helper.save_onnx_model(onnx_model_svm_raw, "./models/" + model_name + ".onnx")

Number of mislabeled samples 1376 out of 4000
Accuracy: 0.656
