In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from joblib import dump, load
from plotly.offline import plot,iplot
from scipy import signal
from scipy.signal import butter, lfilter, freqz
from scipy.stats import norm, kurtosis
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from processing_funcs import *

In [106]:
# Load the raw measurement files matching the glob pattern into one frame.
# NOTE(review): 0.01 is presumably the sampling period in seconds (the `time`
# column advances in 0.01 steps) — confirm against read_measurement.
df = read_measurement(
    "../data/raw_data_train/rsq_q1/*",
    0.01,
)

In [107]:
def kurtosis_time(x):
    """Return the excess (Fisher) kurtosis of the samples in ``x``.

    Thin wrapper around ``scipy.stats.kurtosis`` with ``fisher=True``, i.e.
    3.0 is subtracted from the raw fourth standardized moment.
    """
    excess_kurtosis = kurtosis(x, fisher=True)
    return excess_kurtosis

def rms_100(x):
    """Return the root mean square of ``x``, i.e. ``sqrt(mean(x**2))``.

    ``x`` must support elementwise ``**`` (for example a NumPy array or a
    pandas Series).
    NOTE(review): the ``_100`` suffix presumably refers to the ~100 samples
    per ``id_gps`` window — confirm; the function itself has no fixed length.
    """
    mean_square = np.mean(x ** 2)
    return np.sqrt(mean_square)

def crest(x):
    """Return the crest factor of ``x``: peak absolute value divided by RMS.

    Parameters
    ----------
    x : array-like supporting elementwise arithmetic
        For example a NumPy array or a pandas Series.

    Returns
    -------
    float
        ``max(|x|) / sqrt(mean(x**2))``, or NaN when the RMS is zero
        (all-zero input), where the ratio is undefined.
    """
    # Vectorized reduction instead of the Python built-in ``max``: the
    # built-in walks the values one by one and gives an order-dependent
    # result when a NaN is present.
    peak = np.max(np.abs(x))
    rms = np.sqrt(np.mean(x ** 2))

    # 0 / 0 would yield NaN together with a RuntimeWarning; return the NaN
    # explicitly so an all-zero window does not flood the output.
    if rms == 0:
        return np.nan

    return peak / rms

def create_aggregated(df):
    """Create an aggregated dataframe in the time domain.

    Groups ``df`` by ``id_gps`` and computes the same list of statistics for
    each of the nine sensor signal columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_gps`` and every column listed in ``signals``.

    Returns
    -------
    pandas.DataFrame
        One row per ``id_gps`` (as the index) with two-level columns
        ``(signal, statistic)``.
    """
    signals = ['x_lin_acc', 'y_lin_acc', "z_lin_acc",
               'x_acc', 'y_acc', 'z_acc',
               'x_gyro', 'y_gyro', 'z_gyro']

    def mad(x):
        """Mean absolute deviation of ``x`` around its mean."""
        return (x - x.mean()).abs().mean()

    # The "mad" string aggregation was deprecated in pandas 1.5 and removed in
    # pandas 2.0. The local callable is named ``mad`` on purpose: pandas labels
    # the output column after the function name, so the feature layout stays
    # the same as before.
    # "quantile" runs with pandas' default q=0.5 and therefore repeats
    # "median"; it is kept because the order and number of features must not
    # change (presumably the persisted model expects this layout — confirm).
    statistics = ["sum", "mean", mad,
                  "median", "min", "max",
                  "std", "var", "sem",
                  "skew", "quantile",
                  kurtosis_time, rms_100,
                  crest]

    agg_df = df.groupby(["id_gps"]).agg({x: list(statistics) for x in signals})

    return agg_df

In [108]:
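# NOTE(review): optional acceleration-filtering step, currently disabled, so the
# cells below run without it. Re-enable or remove it deliberately.
# filtered_df = filter_acc(df)
# new_df = df.drop(filtered_df.columns, axis=1)
# df = pd.concat([new_df.reset_index(),filtered_df.reset_index()], axis=1)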

In [109]:
# Build the per-id_gps feature table; reset_index turns the id_gps group key
# back into a regular column.
feature_df = (
    create_aggregated(df)
    .reset_index()
)

In [110]:
# Scale every feature column; the id_gps key is excluded from the matrix.
# NOTE(review): the scaler is fitted on this measurement itself. If the loaded
# model was trained on features scaled by a different fitted scaler, the inputs
# will not be on the same scale — confirm and load the training-time scaler.
scaler = MinMaxScaler()
# level=0 addresses the outer column level directly, which avoids the
# "dropping on a non-lexsorted multi-index" PerformanceWarning.
scaled = scaler.fit_transform(
    feature_df.drop("id_gps", axis=1, level=0)
)


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



In [111]:
# Load the persisted classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
model = load(filename="../models/rfc_v04.joblib")

In [112]:
# Run the loaded model on the scaled feature matrix
# (presumably one predicted label per id_gps row — confirm for this model type).
preds = model.predict(scaled)

In [113]:
# Put the predictions into a one-column frame and attach it to the feature
# table column-wise (rows are aligned on the shared positional index).
label_df = pd.DataFrame(preds, columns=["label"])
result = pd.concat([feature_df, label_df], axis = 1)

In [114]:
# Reduce the result to the two columns needed downstream: id_gps and label.
# The frame is rebuilt from `preds` and `feature_df` instead of indexing the
# concatenated frame with a mixed tuple/string key list: that key list makes
# NumPy build a ragged array (deprecated, an error from NumPy 1.24), and the
# old form also failed when the cell was run a second time.
result = pd.DataFrame(preds, columns=["label"])
result.insert(0, "id_gps", feature_df[("id_gps", "")].to_numpy())


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray



In [115]:
# Display the raw measurement frame for a quick visual check
df

Unnamed: 0,time,lat,lon,height,velocity,direction,h_accuracy,v_accuracy,id_gps,x_lin_acc,y_lin_acc,z_lin_acc,x_gyro,y_gyro,z_gyro,x_acc,y_acc,z_acc
14,0.14,47.696266,17.726061,118.349480,-1.000000,-1.00000,418.967358,418.967358,1,-0.068991,0.116175,-0.231500,-0.010804,0.001345,-0.033967,0.010029,-0.066162,9.487570
15,0.15,47.696266,17.726061,118.349480,-1.000000,-1.00000,418.967358,418.967358,1,0.119553,0.105636,-0.053966,0.001346,0.008800,-0.035701,-0.054936,-0.025597,9.624087
16,0.16,47.696266,17.726061,118.349480,-1.000000,-1.00000,418.967358,418.967358,1,0.013766,-0.061842,-0.085847,-0.007075,0.005302,-0.027966,0.181722,-0.081730,9.736653
17,0.17,47.696266,17.726061,118.349480,-1.000000,-1.00000,418.967358,418.967358,1,0.063374,0.128633,-0.271607,-0.007945,0.006663,-0.028829,0.057930,-0.128134,9.693692
18,0.18,47.696266,17.726061,118.349480,-1.000000,-1.00000,418.967358,418.967358,1,0.089934,0.096254,-0.355428,-0.004748,-0.002140,-0.035702,0.103884,-0.026944,9.488020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63221,632.21,47.705009,17.716236,118.863225,0.121996,182.29248,8.001208,8.001208,631,0.066170,0.229759,-0.042592,0.001690,-0.000458,0.001453,0.424817,-0.269290,9.759106
63222,632.22,47.705009,17.716236,118.863225,0.121996,182.29248,8.001208,8.001208,631,0.115775,0.162368,-0.001455,0.003026,0.003908,0.000451,0.422422,-0.327369,9.798923
63223,632.23,47.705009,17.716236,118.863225,0.121996,182.29248,8.001208,8.001208,631,-0.089811,0.176668,-0.001088,-0.001056,-0.001030,-0.000730,0.202978,-0.290546,9.785601
63224,632.24,47.705009,17.716236,118.863225,0.121996,182.29248,8.001208,8.001208,631,-0.091544,0.223200,-0.026698,0.002019,0.003375,-0.000645,0.279319,-0.257315,9.762100


In [116]:
# Broadcast the per-id_gps label back onto every raw sample.
# validate="many_to_one" makes pandas raise a MergeError if `result` ever holds
# a duplicated id_gps, which would otherwise silently duplicate raw rows.
final_result = df.merge(result, on="id_gps", how="outer", validate="many_to_one")

###  Testing the merge

If the `id_gps` values of the grouped merged dataframe match the `id_gps` values in `result`, the merge is correct.

In [117]:
# Non-null row count per id_gps for every column of the merged frame
rows_per_gps = final_result.groupby("id_gps").count()
rows_per_gps

Unnamed: 0_level_0,time,lat,lon,height,velocity,direction,h_accuracy,v_accuracy,x_lin_acc,y_lin_acc,z_lin_acc,x_gyro,y_gyro,z_gyro,x_acc,y_acc,z_acc,label
id_gps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,163,163,163,163,163,163,163,163,163,163,163,163,163,163,163,163,163,163
2,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
4,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
5,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
628,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
629,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
630,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100


In [118]:
# Merge check: the id_gps keys of the merged frame must line up one-to-one,
# in order, with the id_gps column of `result`.
# size() is enough to obtain the group keys; no per-column count is needed.
merged_ids = final_result.groupby("id_gps").size().index
num = (result["id_gps"] == merged_ids).sum()
# Fail loudly here instead of only computing the number of matches.
assert num == len(result), (
    f"merge check failed: only {num} of {len(result)} id_gps values match"
)

In [119]:
# Collapse the merged samples to one row per id_gps by averaging every column
# (id_gps becomes the index of the resulting frame).
per_gps_groups = final_result.groupby("id_gps")
final_result = per_gps_groups.mean()

In [120]:
# Map of the per-id_gps mean positions, colored by the predicted label
fig = px.scatter_mapbox(
    final_result,
    lat="lat",
    lon="lon",
    color="label",
    color_continuous_scale=["green","blue","red"],
    zoom=12,
    height=500,
)
# Base map style, no outer margins, and no color bar next to the map
fig.update_layout(
    mapbox_style="carto-positron",
    margin={"r":0,"t":0,"l":0,"b":0},
    coloraxis_showscale=False,
)
fig

In [122]:
# Reference labels: a list of `num` zeros, used as the ground truth below.
# NOTE(review): presumably the whole rsq_q1 recording belongs to class 0 — confirm.
real = [0 for _ in range(num)]

In [123]:
# Show the per-id_gps labels as a raw array
predicted_labels = final_result["label"]
predicted_labels.values

array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [124]:
# Share of id_gps rows whose label equals the reference label in `real`.
# accuracy_score is imported in the import cell at the top of the notebook.
accuracy_score(real, final_result["label"].values)

0.8589540412044374

In [125]:
# Number of rows (one per id_gps) in the aggregated result
final_result.shape[0]

631

In [126]:
# Label frequencies, most common first
label_counts = Counter(final_result["label"])
label_counts.most_common()

[(0, 542), (1, 62), (2, 27)]