In [330]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot,iplot
from scipy.stats import norm, kurtosis
import os
from scipy.signal import butter, lfilter, freqz
from scipy import signal
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from joblib import dump, load
import pandas as pd
from processing_funcs import *

In [331]:
df = read_measurement("../data/raw_data_train/rsq_q3/*", 0.01)

In [332]:
def butter_high(cutoff, fs, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='high', analog=False)
    return b, a

def butter_high_filter(data, cutoff, fs, order=5):
    b, a = butter_high(cutoff, fs, order=order)
    y = lfilter(b, a, data)
    return y

In [333]:
def filter_acc(df, cutoff=2, fs=50, order=2):

    signals = [ 'x_lin_acc', 'y_lin_acc', 'z_lin_acc','x_acc', 'y_acc', 'z_acc',]

    new_df = pd.DataFrame(columns=signals)
    list_signals = []
    

    for j in signals:

        filtered_signal = butter_high_filter(df[j], cutoff=cutoff, fs=fs, order=order)

        list_signals.append(filtered_signal)

    new_df = pd.concat([new_df, pd.DataFrame(np.array(list_signals).T,columns=signals)])

    return new_df

In [334]:
def kurtosis_time(x):

    return kurtosis(x, fisher=True)

def rms_100(x):

    return np.sqrt(np.mean(x**2))

def crest(x):

    return max(abs(x))/np.sqrt(np.mean(x**2))

def create_aggregated(df):
    """Create a aggregated dataframe in time domain"""
    signals = ['x_lin_acc', 'y_lin_acc', 
               'x_acc', 'y_acc', 'z_acc',
               'x_gyro', 'y_gyro', 'z_gyro']

    agg_df = df.groupby(["id_gps"]).agg({x: ["sum", "mean", "mad",
                                                "median", "min", "max",
                                                "std", "var", "sem",
                                                "skew", "quantile",
                                                kurtosis_time, rms_100,
                                                crest] for x in signals})

    return agg_df

In [335]:
filtered_df = filter_acc(df)
new_df = df.drop(filtered_df.columns, axis=1)
df = pd.concat([new_df.reset_index(),filtered_df.reset_index()], axis=1)

In [336]:
df

Unnamed: 0,index,time,lat,lon,height,velocity,direction,h_accuracy,v_accuracy,id_gps,x_gyro,y_gyro,z_gyro,index.1,x_lin_acc,y_lin_acc,z_lin_acc,x_acc,y_acc,z_acc
0,15,0.15,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.001077,0.001447,-0.000870,0,-0.138141,-0.007515,-0.062320,-0.140590,-0.188205,8.147690
1,16,0.16,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.002516,0.002074,0.000089,1,-0.012304,0.044320,-0.076306,-0.150044,-0.063339,5.254127
2,17,0.17,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.001361,0.004807,0.000367,2,0.217978,-0.017727,-0.051084,0.125886,-0.039097,2.941766
3,18,0.18,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.001860,0.002107,0.000095,3,0.044372,-0.040788,0.003532,0.147784,-0.064750,1.208950
4,19,0.19,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.000987,-0.002325,-0.000523,4,-0.152205,0.017539,0.025334,-0.111785,-0.005717,-0.061156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12860,12875,128.75,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.027666,-0.029211,-0.012400,12860,-0.004389,0.001195,-0.021298,0.016121,0.020779,-0.031281
12861,12876,128.76,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.035673,-0.008810,-0.015961,12861,-0.119446,0.182275,-0.065013,-0.127598,0.103127,-0.002817
12862,12877,128.77,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.041637,0.014038,-0.008203,12862,0.139896,0.206572,-0.535646,0.032854,0.242796,-0.356745
12863,12878,128.78,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.061041,0.035687,0.000275,12863,0.425744,0.121085,-0.630134,0.306075,0.179230,-0.602915


In [337]:
feature_df = create_aggregated(df).reset_index()

In [338]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(feature_df.drop("id_gps", axis=1)) 


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



In [339]:
model = load("../models/rfc_v03.joblib")

In [340]:
preds = model.predict(scaled)

In [341]:
result = pd.concat([feature_df,pd.DataFrame(preds, columns=["label"])],axis = 1)

In [342]:
result = result[[('id_gps', ''),"label"]]
result.columns = ["id_gps", "label"]


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray



In [343]:
df

Unnamed: 0,index,time,lat,lon,height,velocity,direction,h_accuracy,v_accuracy,id_gps,x_gyro,y_gyro,z_gyro,index.1,x_lin_acc,y_lin_acc,z_lin_acc,x_acc,y_acc,z_acc
0,15,0.15,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.001077,0.001447,-0.000870,0,-0.138141,-0.007515,-0.062320,-0.140590,-0.188205,8.147690
1,16,0.16,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.002516,0.002074,0.000089,1,-0.012304,0.044320,-0.076306,-0.150044,-0.063339,5.254127
2,17,0.17,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.001361,0.004807,0.000367,2,0.217978,-0.017727,-0.051084,0.125886,-0.039097,2.941766
3,18,0.18,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.001860,0.002107,0.000095,3,0.044372,-0.040788,0.003532,0.147784,-0.064750,1.208950
4,19,0.19,47.686343,17.679912,113.113219,-1.000000,-1.000000,65.000000,65.000000,2,-0.000987,-0.002325,-0.000523,4,-0.152205,0.017539,0.025334,-0.111785,-0.005717,-0.061156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12860,12875,128.75,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.027666,-0.029211,-0.012400,12860,-0.004389,0.001195,-0.021298,0.016121,0.020779,-0.031281
12861,12876,128.76,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.035673,-0.008810,-0.015961,12861,-0.119446,0.182275,-0.065013,-0.127598,0.103127,-0.002817
12862,12877,128.77,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.041637,0.014038,-0.008203,12862,0.139896,0.206572,-0.535646,0.032854,0.242796,-0.356745
12863,12878,128.78,47.694844,17.675054,117.642292,11.712904,85.817919,6.000906,6.000906,129,0.061041,0.035687,0.000275,12863,0.425744,0.121085,-0.630134,0.306075,0.179230,-0.602915


In [344]:
final_result = df.merge(result, on="id_gps", how = "outer")

###  Testing the merge

If the groupped result dataframe id_gps is the same as the result then the merge is correct.

In [345]:
final_result.groupby("id_gps").count()

Unnamed: 0_level_0,index,time,lat,lon,height,velocity,direction,h_accuracy,v_accuracy,x_gyro,y_gyro,z_gyro,index,x_lin_acc,y_lin_acc,z_lin_acc,x_acc,y_acc,z_acc,label
id_gps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25
3,160,160,160,160,160,160,160,160,160,160,160,160,160,160,160,160,160,160,160,160
4,151,151,151,151,151,151,151,151,151,151,151,151,151,151,151,151,151,151,151,151
5,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
6,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
126,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
127,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
128,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100


In [346]:
(result["id_gps"] == final_result.groupby("id_gps").count().index).sum()

128

In [347]:
final_result  = final_result.groupby("id_gps").mean()

In [348]:
fig = px.scatter_mapbox(final_result, 
                        lat="lat", 
                        lon="lon",
                        zoom = 12, 
                        height=500, 
                        color = "label", 
                        color_continuous_scale=["green","blue","red"])
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.layout.coloraxis.showscale = False
fig

In [349]:
real = [0 for  i in range(631)]

In [350]:
from sklearn.metrics import accuracy_score

accuracy_score(real,final_result["label"])

ValueError: Found input variables with inconsistent numbers of samples: [631, 128]

In [None]:
len(final_result)

In [252]:
Counter(final_result["label"]).most_common()

[(0, 517), (1, 93), (2, 21)]