In [230]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot,iplot
from scipy.stats import norm, kurtosis
import os
from scipy.signal import butter, lfilter, freqz
from scipy import signal
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from joblib import dump, load
import pandas as pd
from processing_funcs import *

In [231]:
# Load the raw measurement files matched by the glob pattern via the project helper.
# NOTE(review): the second argument (0.01) is presumably the sampling period in
# seconds -- the `time` column shown further down advances in 0.01 steps; confirm
# against processing_funcs.read_measurement.
df = read_measurement("../data/raw_data_train/rsq_q2/*", 0.01)

In [232]:
def kurtosis_time(x):
    """Return the excess (Fisher) kurtosis of ``x``; a normal distribution scores 0."""
    # axis=0 and bias=True are scipy's defaults, spelled out for the reader.
    excess_kurtosis = kurtosis(x, axis=0, fisher=True, bias=True)
    return excess_kurtosis

def rms_100(x):
    """Return the root-mean-square of ``x``: the square root of the mean squared sample.

    The ``_100`` suffix is only part of the name; no window length is applied here.
    """
    mean_square = np.mean(x ** 2)
    return np.sqrt(mean_square)

def crest(x):
    """Return the crest factor of ``x``: peak absolute value divided by the RMS.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Samples of one signal window.

    Returns
    -------
    float
        ``max(|x|) / sqrt(mean(x**2))``. NaN for an all-zero window, where
        the ratio is 0/0.
    """
    # Vectorised peak. The Python builtin ``max`` walks the samples one by one
    # and its result depends on where a NaN sits in the window; the array /
    # Series ``max`` treats NaN the same way the mean below does.
    peak = np.abs(x).max()
    rms = np.sqrt(np.mean(x ** 2))
    if rms == 0:
        # Flat-zero window: same NaN the division would produce, minus the
        # "invalid value" RuntimeWarning.
        return np.nan
    return peak / rms

def create_aggregated(df):
    """Create an aggregated time-domain feature dataframe, one row per ``id_gps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements containing an ``id_gps`` column and the nine signal
        columns listed in ``signals`` below.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id_gps`` with two-level columns ``(signal, statistic)``.
        The statistic order is kept exactly as before because the notebook
        feeds the values positionally into a scaler and a model.
    """
    signals = ['x_lin_acc', 'y_lin_acc', "z_lin_acc",
               'x_acc', 'y_acc', 'z_acc',
               'x_gyro', 'y_gyro', 'z_gyro']

    def mad(x):
        # Mean absolute deviation around the mean. The "mad" string alias was
        # deprecated in pandas 1.5 and removed in 2.0; a callable named
        # ``mad`` yields the same value and the same output column label.
        return (x - x.mean()).abs().mean()

    # "quantile" is called without arguments, i.e. pandas' default q=0.5.
    statistics = ["sum", "mean", mad,
                  "median", "min", "max",
                  "std", "var", "sem",
                  "skew", "quantile",
                  kurtosis_time, rms_100,
                  crest]

    agg_df = df.groupby(["id_gps"]).agg({x: list(statistics) for x in signals})

    return agg_df

In [233]:
# filtered_df = filter_acc(df)
# new_df = df.drop(filtered_df.columns, axis=1)
# df = pd.concat([new_df.reset_index(),filtered_df.reset_index()], axis=1)

In [234]:
# One feature row per id_gps; reset_index() moves id_gps from the index back
# into a regular column.
feature_df = create_aggregated(df).reset_index()

In [235]:
# Rescale every feature column before prediction.
# NOTE(review): the scaler is fitted on the very data being scored. If the
# loaded model was trained on features scaled by a different (training-time)
# scaler, that fitted scaler should be loaded and only .transform() applied
# here -- confirm against the training notebook.
scaler = MinMaxScaler()
# level=0 addresses the top level of the two-level columns directly, which
# avoids pandas' "dropping on a non-lexsorted multi-index" PerformanceWarning.
scaled = scaler.fit_transform(feature_df.drop("id_gps", axis=1, level=0))


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



In [236]:
# Load the persisted model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
model = load("../models/rfc_v07.joblib")

In [237]:
# Predict one label per row of the scaled feature matrix (one row per id_gps).
preds = model.predict(scaled)

In [238]:
# Attach the predictions as a "label" column next to the features; both frames
# carry a default 0..n-1 index, so rows line up positionally.
result = pd.concat([feature_df,pd.DataFrame(preds, columns=["label"])],axis = 1)

In [239]:
# Keep only the GPS id and its predicted label, with flat column names.
# Built directly from feature_df and preds: selecting with the mixed list
# [('id_gps', ''), "label"] makes NumPy build a ragged array, which it has
# deprecated, and rebuilding from the inputs keeps this cell safe to re-run.
result = pd.DataFrame({
    "id_gps": feature_df[("id_gps", "")].to_numpy(),
    "label": preds,
})


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray



In [240]:
# Inspect the raw measurements (the rendered table is truncated to the first and last rows).
df

Unnamed: 0,time,lat,lon,height,velocity,direction,h_accuracy,v_accuracy,id_gps,x_lin_acc,y_lin_acc,z_lin_acc,x_gyro,y_gyro,z_gyro,x_acc,y_acc,z_acc
15,0.15,47.686814,17.653652,110.883219,-1.000000,-1.00000,65.000000,65.000000,1,-0.049348,0.084456,-0.160482,-0.003136,0.009028,-0.005300,-0.218546,-0.374371,9.670789
16,0.16,47.686814,17.653652,110.883219,-1.000000,-1.00000,65.000000,65.000000,1,0.077977,0.111099,-0.018121,0.002095,0.011682,-0.007544,-0.302970,-0.439636,9.762399
17,0.17,47.686814,17.653652,110.883219,-1.000000,-1.00000,65.000000,65.000000,1,-0.098842,0.208655,-0.313301,-0.004548,-0.010443,-0.011471,-0.240699,-0.306263,9.581575
18,0.18,47.686814,17.653652,110.883219,-1.000000,-1.00000,65.000000,65.000000,1,-0.124909,0.033006,-0.035912,-0.001665,0.005436,-0.012404,-0.471519,-0.396525,9.637110
19,0.19,47.686814,17.653652,110.883219,-1.000000,-1.00000,65.000000,65.000000,1,-0.107322,0.009384,-0.110293,-0.009603,-0.007755,-0.010834,-0.383353,-0.495919,9.712852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18826,188.26,47.685595,17.679963,116.472371,10.812548,2.78943,6.000906,6.000906,186,0.549217,-0.061458,-0.607367,0.005606,0.009188,0.000091,0.201032,-0.748294,9.251661
18827,188.27,47.685595,17.679963,116.472371,10.812548,2.78943,6.000906,6.000906,186,0.329389,-0.066965,-0.430904,0.002500,-0.005380,-0.005648,-0.142354,-0.743204,9.480385
18828,188.28,47.685595,17.679963,116.472371,10.812548,2.78943,6.000906,6.000906,186,-0.005401,-0.129137,-0.356178,-0.000041,-0.008369,-0.015584,-0.444426,-0.725541,9.335187
18829,188.29,47.685595,17.679963,116.472371,10.812548,2.78943,6.000906,6.000906,186,-0.365050,-0.114036,-0.492048,-0.000269,-0.001953,-0.022030,-0.518073,-0.573457,9.212892


In [241]:
# Broadcast each id_gps prediction onto every raw sample of that id.
# validate="many_to_one" makes pandas raise a MergeError if `result` ever
# contains a duplicated id_gps, instead of silently multiplying rows.
final_result = df.merge(result, on="id_gps", how="outer", validate="many_to_one")

###  Testing the merge

If the `id_gps` values obtained by grouping the merged dataframe (`final_result`) match the `id_gps` column of `result` one-to-one, then the merge is correct.

In [242]:
# Non-null row count per id_gps for every column; the "label" count equalling the
# other columns shows that every sample received a prediction.
final_result.groupby("id_gps").count()

Unnamed: 0_level_0,time,lat,lon,height,velocity,direction,h_accuracy,v_accuracy,x_lin_acc,y_lin_acc,z_lin_acc,x_gyro,y_gyro,z_gyro,x_acc,y_acc,z_acc,label
id_gps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111
2,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240
3,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
4,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
5,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
183,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
184,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
185,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100


In [243]:
# Number of id_gps values in `result` that equal, position by position, the
# group keys of the merged frame; this equals len(result) when every id matches.
num = (result["id_gps"] == final_result.groupby("id_gps").count().index).sum()

In [244]:
# Collapse the per-sample rows to one row per id_gps (column means); id_gps
# becomes the index. Note that this rebinds `final_result`, so the per-sample
# merged frame is no longer available after this cell.
final_result  = final_result.groupby("id_gps").mean()

In [245]:
# Plot one marker per id_gps at its mean position, coloured by predicted label.
fig = px.scatter_mapbox(
    final_result,
    lat="lat",
    lon="lon",
    color="label",
    color_continuous_scale=["green", "blue", "red"],
    zoom=12,
    height=500,
)
# Single layout update: base map style, zero margins, hidden colour bar.
fig.update_layout(
    mapbox_style="carto-positron",
    margin=dict(r=0, t=0, l=0, b=0),
    coloraxis_showscale=False,
)
fig

In [246]:
# Reference labels: a constant 1 for each of the `num` matched ids.
# NOTE(review): presumably the whole recording belongs to class 1 -- confirm.
real = [1] * int(num)

In [247]:
# Predicted label for each id_gps, as a NumPy array.
final_result["label"].values

array([0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 1, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0,
       1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2,
       0, 0, 1, 2, 1, 0, 0, 0, 2, 2, 2, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       2, 1, 1, 2, 0, 0, 0, 1, 2, 0, 1, 0, 2, 2, 1, 1, 1, 0, 1, 0, 2, 0,
       1, 1, 2, 1, 1, 1, 1, 2, 0, 1, 0, 1, 0, 2, 0, 1, 0, 2, 0, 2, 0, 0,
       0, 1, 0, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 0, 2, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 0, 1, 2, 1, 0, 0,
       2, 1, 1, 2, 0, 0, 0, 2, 2, 0], dtype=int64)

In [248]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
from sklearn.metrics import accuracy_score

# Fraction of per-id_gps predictions equal to the constant reference label in `real`.
accuracy_score(real, final_result["label"].values)

0.2903225806451613

In [249]:
# Number of id_gps groups (rows) in the aggregated result.
len(final_result)

186

In [190]:
# Frequency of each predicted label, most common first.
# NOTE(review): this cell's execution count (190) is out of sequence and its
# recorded counts sum to 631, not the 186 rows reported by len(final_result);
# the stored output is stale -- re-run after Restart & Run All.
Counter(final_result["label"]).most_common()

[(0, 542), (1, 62), (2, 27)]