In [1]:
from pipeline.data_access.dao.sussex_huawei_dao import SussexHuaweiDAO
from pipeline.feature_engineering.preprocessing.sussex_huawei_preprocessor import SussexHuaweiPreprocessor
from pipeline.feature_engineering.feature_extraction.baseline_extractor import BaselineExtractor
from pipeline.machine_learning.model.sklearn_model_factory import SklearnModelFactory
from pipeline.machine_learning.model.tslearn_model_factory import TslearnModelFactory
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pandas
import pickle
import numpy as np
from scipy.stats import randint as sp_randint

In [2]:
# 0. Initialize pipeline objects (TODO: put into a pipeline facade)
# Data access, preprocessing and feature extraction stages.
dao = SussexHuaweiDAO()
preprocessor = SussexHuaweiPreprocessor()
extractor = BaselineExtractor()

# One model factory per backing library.
tslearn_factory = TslearnModelFactory()
sklearn_factory = SklearnModelFactory()

In [None]:
# 1. Load Data
# Names assigned to the motion-file columns selected by use_data_cols
# (TODO: pack in config/.env).
data_column_names = [
    'time',
    'acceleration_x', 'acceleration_y', 'acceleration_z',
    'orientation_w', 'orientation_x', 'orientation_y', 'orientation_z',
    'gravity_x', 'gravity_y', 'gravity_z',
]
label_column_names = ['coarse_label', 'fine_label', 'road_label']

# Trip identifiers handed to the DAO; presumably each one is substituted into
# the "{}" placeholder of the path templates below - verify against the DAO.
# bad trips: 310517, 260417, 200617, 160517, 150317, 090517, 050517
trips = [
    '010317', '010617', '020317',
    '020517', '020617', '030317', '030517', '030617', '030717',
    '040517', '040717', '050617', '050717', '060317', '060617',
    '070317', '070617', '080317', '080517', '080617', '090317',
    '090617', '100317', '100517', '110517', '120517', '120617',
    '130317', '130617', '140317', '140617', '150517', '150617',
    '160317', '170317', '170517', '180417', '190417', '190517',
    '200317', '200417', '200517', '210317', '220317', '220517',
    '220617', '230317', '230517', '230617', '240417', '240517',
    '250317', '250417', '250517', '260517', '260617', '270317',
    '270417', '270617', '280317', '280417', '280617', '290317',
    '290517', '290617', '300317', '300517', '300617',
]

# trips = random.sample(trips, len(trips)//2)

data_string = "./data_sets/sussex_huawei/User1/{}/Hips_Motion.txt"
label_string = "./data_sets/sussex_huawei/User1/{}/Label.txt"
# Column positions read from each file; motion columns currently not used:
# 4,5,6,7,8,9,17,18,19
use_data_cols = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15, 16]
use_label_cols = [1, 2, 3]

# Every list argument carries the motion-data entry first and the label entry
# second; the result is unpacked as (labels, data).
labels, data = dao.bulk_read_data(
    file_path=[data_string, label_string],
    identifiers=trips,
    column_names=[data_column_names, label_column_names],
    use_columns=[use_data_cols, use_label_cols],
)


In [None]:
# 2. Preprocessing
# 2.1 Convert the unix timestamps (unit: ms) in the 'time' column to datetimes
data = preprocessor.convert_unix_to_datetime(
    data,
    column='time',
    unit='ms',
)

In [None]:
# 2.2 Attach the labels to the data, then remove NaNs with
#     replacement_mode='del_row'
data = preprocessor.remove_nans(
    preprocessor.label_data(data, labels),
    replacement_mode='del_row',
)

In [None]:
# 2.3 Normalization
acelerometer_columns = ['acceleration_x', 'acceleration_y', 'acceleration_z']
gravity_columns = ['gravity_x', 'gravity_y', 'gravity_z']
orientation_columns = ['orientation_x', 'orientation_y', 'orientation_z', 'orientation_w']

# Project the accelerometer columns twice, in this order: first using the
# gravity columns, then using the orientation columns.
projection_steps = (
    ('gravity', gravity_columns),
    ('orientation', orientation_columns),
)
for projection_mode, reference_columns in projection_steps:
    data = preprocessor.project_accelerometer_to_global_coordinates(
        data,
        mode=projection_mode,
        target_columns=acelerometer_columns,
        args=reference_columns,
    )

# Alternative normalizations, currently disabled:
#data = preprocessor.znormalize_quantitative_data(data, data_column_names[1:])
#data = preprocessor.min_max_normalize_quantitative_data(data, data_column_names[1:])
print(data.shape)

In [None]:
# 2.4 Segment data
# Coarse Label: Null=0, Still=1, Walking=2, Run=3, Bike=4, Car=5, Bus=6, Train=7, Subway=8
# Road Label: City=1, Motorway=2, Countryside=3, Dirt road=4, Null=0
selected_coarse_labels = [5]
selected_road_labels = [1, 3]

# First pass: segments selected by the coarse label.
car_segments = preprocessor.segment_data(
    data,
    mode='labels',
    label_column='coarse_label',
    args=selected_coarse_labels,
)

# Second pass: split every car segment by road label and collect the pieces
# in encounter order.
data_segments = []
for car_segment in car_segments:
    data_segments.extend(
        preprocessor.segment_data(
            car_segment,
            mode='labels',
            label_column='road_label',
            args=selected_road_labels,
        )
    )

print(len(data_segments))

In [None]:
# 2.5 Resampling: index every segment by its 'time' column and resample it
# with freq='1000ms'.
# NOTE(review): a 1000 ms period corresponds to 1 Hz, not to the 40 Hz / 10 Hz
# figures used in earlier notes for this step - confirm the intended rate.
for ind, segment in enumerate(data_segments):
    data_segments[ind] = preprocessor.resample_quantitative_data(
        segment.set_index('time'),
        freq='1000ms',
    )

In [None]:
# 2.6 Outlier removal:
# Run the quantile-based outlier replacement on the accelerometer columns of
# every segment, storing the result back in place.
for ind, segment in enumerate(data_segments):
    data_segments[ind] = preprocessor.remove_outliers_from_quantitative_data(
        segment,
        replacement_mode='quantile',
        columns=acelerometer_columns,
        quantile=0.95,  # current run @0.99
    )

In [None]:
# 2.7 Dimensionality reduction:
# Reduce the accelerometer columns of each segment to one 'acceleration_abs'
# column (mode='euclidean'), then min-max normalize that column.
for ind, segment in enumerate(data_segments):
    reduced_segment = preprocessor.reduce_quantitativ_data_dimensionality(
        data=segment,
        mode='euclidean',
        columns=acelerometer_columns,
        reduced_column_name='acceleration_abs',
    )
    data_segments[ind] = preprocessor.min_max_normalize_quantitative_data(
        reduced_segment,
        ['acceleration_abs'],
    )

In [None]:
# 2.8 Prepare for Baseline Extractor
# Merge the processed segments back into `data`, passing only the
# acceleration magnitude and the road label as selected columns.
selected_columns = ['acceleration_abs', 'road_label']
data = preprocessor.de_segment_data(
    data_segments,
    selected_columns,
)
# Global normalization alternatives, currently disabled:
#data = preprocessor.znormalize_quantitative_data(data, ['acceleration_abs'])
#data = preprocessor.min_max_normalize_quantitative_data(data, ['acceleration_abs'])

In [None]:
# Visual analysis of the segments:
#sns.set(rc={'figure.figsize':(15, 4)})
#fig, ax = plt.subplots(figsize=(15,4*len(data_segments)), ncols=1, nrows=len(data_segments)+1)
#for ind in range(len(data_segments)): 
#    sns.lineplot(y='acceleration_abs', x='time', data = data_segments[ind], ax=ax[ind])
#    ax[ind].legend("Road" if data_segments[ind]['road_label'].iloc[0] < 2.0 else "City" )
    

In [None]:
# Line plot of the acceleration magnitude together with the road label.
#plt.figure(figsize=(16, 6))
plot_columns = ['acceleration_abs', 'road_label']
sns.lineplot(data=data[plot_columns])

In [None]:
# 3. Feature Extraction
# 3.1 Encode categorical to binary
# The encoding function maps road labels greater than 2.0 to 1 and all other
# labels to 0 (0 City, 1 Countryside).
data = preprocessor.encode_categorical_features(
    data=data,
    mode='custom_function',
    columns=['road_label'],
    encoding_function=lambda x: (x > 2.0).astype(int),
)

# 3.2
# Generate label vector y and feature matrix X.
# We need at least 2 classes to learn features for tsfresh
y = data[['road_label']].reset_index(drop=True)
data['id'] = range(1, len(data) + 1)
# Copy the ids by position. `y` was re-indexed above, so assigning the
# `data['id']` Series directly would align on index labels and could insert
# NaNs whenever `data` does not carry a plain 0..n-1 index.
y['id'] = data['id'].values
# Build the id-indexed label Series explicitly. Overwriting `.index` on the
# result of `y['road_label']` only mutates a temporary Series and reaches a
# later `y['road_label']` lookup solely through pandas' internal item cache.
y_by_id = pandas.Series(
    y['road_label'].values,
    index=y['id'].values,
    name='road_label',
)

# 3.3 Extract feature matrix
# Read https://github.com/blue-yonder/tsfresh/issues/444 for info about the warnings
X = extractor.extract_features(data=data, args=['id', y_by_id, 32, None, 0.1])

In [24]:
# 3.3.1 Read/Write extracted features
# Uncomment the two writes to persist freshly extracted features.
#dao.write_features('./data_sets/X.pkl', X)
#dao.write_features('./data_sets/y.pkl', y)
X = dao.load_features('./data_sets/X.pkl')
y = dao.load_features('./data_sets/y.pkl')
print(len(y))
# Keep only the keys of X whose name contains "acceleration_abs".
keys = [key for key in X.keys() if "acceleration_abs" in key]

88846


In [25]:
# 3.4 combine feature rows
# Join features and labels column-wise, remove NaNs and make the label integral.
X_join = pandas.concat([X, y], axis=1)
X_join = preprocessor.remove_nans(X_join, replacement_mode='del_row')
X_join[['road_label']] = X_join[['road_label']].astype('int')
X_segments = preprocessor.segment_data(X_join, mode='labels',
                                       label_column='road_label',
                                       args=[0, 1])


segment_length = 20 #60s best in paper, 90 best in my evaluation, tested 30, 60, 90, 120
# Cut every label segment into fixed-interval pieces; extend() avoids
# re-copying the accumulated list on each iteration.
X_segments_new = []
for X_segment in X_segments:
    X_segments_new.extend(
        preprocessor.segment_data(
            X_segment,
            mode='fixed_interval',
            args=[segment_length, True, True]
        )
    )


print(len(X_segments_new))
# Separate feature names from the label name explicitly. An unconditional
# keys.append('road_label') made this cell non-idempotent: on a second run
# keys[:-1] already contained 'road_label', so the label ended up inside the
# feature matrix.
feature_keys = [key for key in keys if key != 'road_label']
if 'road_label' not in keys:
    keys.append('road_label')  # kept so `keys` still ends with the label name
X_combined = preprocessor.de_segment_data(X_segments_new, feature_keys + ['road_label'])
X_combined, y_combined = X_combined[feature_keys], X_combined['road_label']

59


In [26]:
#import matplotlib.pyplot as plt
#X_combined.hist(figsize=(15,15)) #check distribution -> normal

In [27]:
#plt.figure(figsize=(10,10))
#plt.matshow(X_combined.corr(), fignum=1)

In [28]:
#plt.figure(figsize=(10,10))
#plt.matshow(X_combined.cov(), fignum=1)

In [29]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from tslearn.clustering import TimeSeriesKMeans
from tslearn.clustering import GlobalAlignmentKernelKMeans
from sklearn.ensemble import IsolationForest

# Seed for the stochastic steps of this cell (isolation forest and train/test
# split) so that a re-run reproduces the same filtered data and the same split.
RANDOM_STATE = 42

# 3.5 Read/Write combined features
# NOTE(review): this cell overwrites the pickles with the in-memory features
# and then filters them, so running it twice filters already filtered data.
#print(type(y_combined))
dao.write_features('./data_sets/X_combined.pkl', X_combined)
dao.write_features('./data_sets/y_combined.pkl', y_combined)
# 3.6 Reload the combined features that were just written
X_combined = dao.load_features('./data_sets/X_combined.pkl')
y_combined = dao.load_features('./data_sets/y_combined.pkl')

# Shape and share of class 0 before outlier filtering.
print(X_combined.shape)
print(list(y_combined).count(0)/len(y_combined))

# fit_predict labels inliers with 1 and outliers with -1.
# NOTE(review): `behaviour` was deprecated in scikit-learn 0.22 and removed in
# 0.24 - drop the argument when upgrading scikit-learn.
y_clustering = IsolationForest(behaviour='new',
                               max_samples=10,
                               n_jobs=-1,
                               contamination=0.45,
                               max_features=1.0,
                               n_estimators=1750,
                               random_state=RANDOM_STATE
                              ).fit_predict(X_combined)

# A plain boolean array selects rows by position, so the filter does not
# depend on X_combined / y_combined carrying a 0..n-1 index.
inlier_mask = y_clustering == 1
X_combined = X_combined.loc[inlier_mask]
y_combined = y_combined.loc[inlier_mask]

# Shape and share of class 0 after outlier filtering.
print(X_combined.shape)
print(list(y_combined).count(0)/len(y_combined))

X_combined = X_combined.reset_index(drop=True)
y_combined = y_combined.reset_index(drop=True)

# Stratified 70/30 split; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_combined,
                                                    y_combined,
                                                    test_size=0.3,
                                                    stratify=y_combined,
                                                    random_state=RANDOM_STATE
                                                    )

#X_combined.hist(figsize=(15,15)) #check distribution -> normal

(3826, 20)
0.5846837428123366
(2104, 20)
0.5960076045627376


In [None]:
#print(X_test)
#num_cols = len(list(X_combined))
#X_combined.columns = np.arange(num_cols)
#X_combined  = (X_combined-X_combined.mean())/X_combined.std()

#num_cols = len(list(X_test))
#X_test.columns = np.arange(num_cols)
#X_test = (X_test-X_test.mean())/X_test.std()
#print(X_test)

In [None]:
# 4.1 Produce models from a given hyper parameter search space
# TODO: Put model types, parameters and search spaces into config.
#
# The models are fitted on the training split only. Fitting on
# X_combined / y_combined, which contain X_test / y_test, would make the
# reported test scores optimistic.
from sklearn.metrics import confusion_matrix


def log_scale_grid(n_exponents):
    """Return 10**0 ... 10**-(n_exponents-1) followed by 10**1 ... 10**(n_exponents-1)."""
    return np.concatenate((10.0 ** -np.arange(0, n_exponents),
                           10.0 ** np.arange(1, n_exponents)))


def report_test_performance(title, model):
    """Print the test-split score and confusion matrix of a created model.

    Args:
        title: Banner line printed before the metrics.
        model: Mapping returned by a factory's create_model(); its 'clf'
            entry is scored and used for prediction.

    Returns:
        Tuple (y_pred, conf) with the predictions for X_test and the
        confusion matrix of y_test against those predictions.
    """
    print(title)
    clf = model['clf']
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    conf = confusion_matrix(y_test, y_pred)
    print(conf)
    print("\n\n")
    return y_pred, conf


# Number of feature columns; used to bound several search ranges.
n_features = X_train.shape[1]

print('------------------Sklearn-----------------')
model = sklearn_factory.create_model(
    model_type='svc',
    X=X_train,
    y=y_train,
    model_params={
        'kernel': ['rbf', 'linear', 'poly'],
        'degree': sp_randint(2, n_features * 3),
        'gamma': log_scale_grid(10),
        'C': sp_randint(2, 5000),
        'max_iter': sp_randint(2, 5000),
        'shrinking': [True, False],
        'probability': [True, False],
        'random_state': sp_randint(1, 10),
    },
    search_params=[-1, 0, 10, 2500, True, "svc_rs.pickle", 0.1]
)
y_pred, conf = report_test_performance('------------------SVC-----------------', model)

model = sklearn_factory.create_model(
    model_type='cart_tree',
    X=X_train,
    y=y_train,
    model_params={
        "max_depth": sp_randint(1, 128),
        "max_features": sp_randint(1, n_features),
        "min_samples_leaf": sp_randint(1, n_features),
        "criterion": ["gini", "entropy"],
        'random_state': sp_randint(1, 10),
        'splitter': ['best', 'random'],
        'min_samples_split': sp_randint(2, 10)
    },
    search_params=[-1, 0, 10, 2500, True, "dt_rs.pickle", 0.1]
)
y_pred, conf = report_test_performance('------------------CART-Tree-----------------', model)

model = sklearn_factory.create_model(
    model_type='random_forrest',
    X=X_train,
    y=y_train,
    model_params={
        'n_estimators': sp_randint(1, 100),
        'max_depth': sp_randint(1, 128),
        'max_features': sp_randint(1, n_features),
        # 'min_samples_split' used to appear twice in this dict; only the
        # last entry, sp_randint(2, 10), ever took effect and is kept here.
        'min_samples_split': sp_randint(2, 10),
        'bootstrap': [True, False],
        "criterion": ["gini", "entropy"],
        'random_state': sp_randint(1, 10),
    },
    search_params=[-1, 0, 10, 2500, True, "rf_rs.pickle", 0.1]
)
y_pred, conf = report_test_performance('------------------Random Forrest----------------', model)

model = sklearn_factory.create_model(
    model_type='mlp_classifier',
    X=X_train,
    y=y_train,
    model_params={
        # NOTE(review): 'adam' is listed twice, so it is presumably sampled
        # twice as often as the other solvers - confirm this is intended.
        'solver': ['adam', 'lbfgs', 'sgd', 'adam'],
        'max_iter': sp_randint(1, 250),
        'alpha': log_scale_grid(10),
        # One to four equally wide layers for each width. Single-layer
        # entries are real 1-tuples, e.g. (128,), where the bare (128) was
        # just the int 128.
        'hidden_layer_sizes': [(width,) * depth
                               for width in (128, 64, 32, 16)
                               for depth in (4, 3, 2, 1)],
        'random_state': sp_randint(1, 10),
        'activation': ["logistic", "relu", "tanh"],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'learning_rate_init': log_scale_grid(10),
        'batch_size': sp_randint(1, 10),
        'shuffle': [True, False],
        'early_stopping': [True, False],
    },
    search_params=[-1, 0, 10, 250, True, "mlp_rs.pickle", 0.1]
)
y_pred, conf = report_test_performance('------------------MLP----------------', model)


model = tslearn_factory.create_model(
    model_type='tssvc',
    X=X_train,
    y=y_train,
    model_params={
        'kernel': ['rbf', 'linear', 'poly', 'gak'],
        'degree': sp_randint(2, n_features * 2),
        'gamma': log_scale_grid(5),
        'max_iter': sp_randint(2, 5),
        'shrinking': [True, False],
        'probability': [True, False],
        'random_state': sp_randint(1, 10),
    },
    search_params=[32, 0, 10, 250, True, "tssv_rs.pickle", 0.1]
)

print('------------------Tslearn-----------------')
y_pred, conf = report_test_performance('------------------TSSVC----------------', model)

model = tslearn_factory.create_model(
    model_type='knn_classifier',
    X=X_train,
    y=y_train,
    model_params={
        'n_neighbors': sp_randint(2, n_features * 2),
        'metric': ['dtw', 'softdtw', 'euclidean', 'sqeuclidean', 'cityblock']
    },
    search_params=[32, 0, 10, 250, True, "tsknn_rs.pickle", 0.1]
)
y_pred, conf = report_test_performance('------------------KNNC----------------', model)

------------------Sklearn-----------------


In [None]:
# Reload every persisted search object and print its score on the full
# X_combined / y_combined set together with the selected hyper parameters.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles produced by this project (presumably the searches of section 4.1).
persisted_searches = [
    ("----------------sklearn----------------", [
        ("MLP", 'mlp_rs.pickle'),
        ("CART Tree", 'dt_rs.pickle'),
        ("Random Forrest", 'rf_rs.pickle'),
        ("SVC", 'svc_rs.pickle'),
    ]),
    ("----------------tslearn----------------", [
        ("TSSVC", 'tssv_rs.pickle'),
        ("TSKNN", 'tsknn_rs.pickle'),
    ]),
]

for library_banner, searches in persisted_searches:
    print(library_banner)
    for model_title, pickle_path in searches:
        print(model_title)
        with open(pickle_path, 'rb') as f:
            clf = pickle.load(f)
        print(clf.score(X_combined, y_combined))
        print(clf.best_params_)
        print("\n\n")