In [3]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import plotly.express as px

# Relative path of the directory whose parquet data the "Read locally" cell loads.
data_folder = "../data"

## Aggregating

In [4]:
# Local load: read the parquet data under `data_folder`, ordered by rtctime.
df = pd.read_parquet(data_folder).sort_values("rtctime")

# Guard against a partial or changed dataset: the shape must match exactly.
assert df.shape == (8159719, 10)
df

Unnamed: 0,rtctime,lat,lon,target_temperature,feature_c,feature_ct,feature_motorspeed,ambient_temp,car_speed,soc
0,1672911588170,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997
1,1672911588180,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997
2,1672911588190,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997
3,1672911588200,68.860123,18.346718,-4.281718,21.361129,-1.0,70.731707,-3.9,1.050000,84.199997
4,1672911588210,68.860123,18.346718,-4.281718,21.361129,-1.0,70.731707,-3.9,1.110000,84.199997
...,...,...,...,...,...,...,...,...,...,...
8159714,1673371992330,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.7,65.410004,51.000000
8159715,1673371992340,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.8,65.410004,51.000000
8159716,1673371992350,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.8,65.410004,51.000000
8159717,1673371992360,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.7,65.410004,51.000000


In [None]:
# Alternative to the local read: load the parquet data straight from S3,
# ordered by rtctime.
df = pd.read_parquet(
    "s3://hackaburg-2023-273099990656-eu-west-1/data/"
).sort_values("rtctime")

# Guard against a partial or changed dataset: the shape must match exactly.
assert df.shape == (8159719, 10)
df

In [3]:
#* Aggregate while reading filenames

#// dfs = []

#// for file_name in os.listdir(data_folder):
#//     if file_name.endswith('.parquet'):  # Assuming all files are in parquet format
#//         file_path = os.path.join(data_folder, file_name)
#//         df_filenames = pd.read_parquet(file_path)
#//         df_filenames['filename'] = os.path.splitext(file_name)[0]
#//         dfs.append(df_filenames)

#// combined_df = pd.concat(dfs, ignore_index=True)

#// df = combined_df.sort_values("rtctime")
#// assert(df.shape == (8159719, 10))
#// df


In [5]:
#* Find the beginning of each recording by locating gaps in rtctime

# A jump larger than this between consecutive samples starts a new track.
# NOTE(review): rtctime looks like epoch milliseconds (consecutive samples in
# the displayed output are 10 apart), which would make this 10 s — confirm.
ms_gap = 10000

# Vectorized gap detection. The first row has no predecessor, so its diff is
# NaN and the comparison below is False there.
rtc_delta = df["rtctime"].diff()
is_gap = rtc_delta > ms_gap

#* rtc_gap holds the gap size on rows that follow a gap, NaN everywhere else
df["rtc_gap"] = rtc_delta.where(is_gap)

#* add column is_track_beginning (1 on the first row of a track, otherwise 0)
track_start = is_gap.to_numpy().astype("int64")
if track_start.size:
    # Mark the first row by position rather than by index label 0, so this
    # still targets the earliest sample when the sorted frame's first row is
    # not labelled 0 (and never appends a row when label 0 is absent).
    track_start[0] = 1
df["is_track_beginning"] = track_start

#* Derive track_id as the running count of track beginnings (ids start at 1)
df["track_id"] = df["is_track_beginning"].cumsum()

df


Unnamed: 0,rtctime,lat,lon,target_temperature,feature_c,feature_ct,feature_motorspeed,ambient_temp,car_speed,soc,rtc_gap,is_track_beginning,track_id
0,1672911588170,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997,,1,1
1,1672911588180,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997,,0,1
2,1672911588190,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997,,0,1
3,1672911588200,68.860123,18.346718,-4.281718,21.361129,-1.0,70.731707,-3.9,1.050000,84.199997,,0,1
4,1672911588210,68.860123,18.346718,-4.281718,21.361129,-1.0,70.731707,-3.9,1.110000,84.199997,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8159714,1673371992330,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.7,65.410004,51.000000,,0,126
8159715,1673371992340,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.8,65.410004,51.000000,,0,126
8159716,1673371992350,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.8,65.410004,51.000000,,0,126
8159717,1673371992360,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.7,65.410004,51.000000,,0,126


In [6]:
# Number of rows per track, largest tracks first.
count_df = (
    df.groupby(["track_id"])["track_id"]
    .count()
    .reset_index(name="count")
    .sort_values(["count"], ascending=False)
)

# Split the track ids at 2000 rows: strictly more counts as "long",
# everything else as "short".
is_long = count_df["count"] > 2000
short_track_ids = count_df.loc[~is_long, "track_id"].to_list()
long_track_ids = count_df.loc[is_long, "track_id"].to_list()

# Keep only the rows that belong to long tracks.
df = df[df["track_id"].isin(long_track_ids)]
df


Unnamed: 0,rtctime,lat,lon,target_temperature,feature_c,feature_ct,feature_motorspeed,ambient_temp,car_speed,soc,rtc_gap,is_track_beginning,track_id
0,1672911588170,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997,,1,1
1,1672911588180,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997,,0,1
2,1672911588190,68.860123,18.346718,-4.281718,20.947325,-1.0,68.902439,-3.9,1.050000,84.199997,,0,1
3,1672911588200,68.860123,18.346718,-4.281718,21.361129,-1.0,70.731707,-3.9,1.050000,84.199997,,0,1
4,1672911588210,68.860123,18.346718,-4.281718,21.361129,-1.0,70.731707,-3.9,1.110000,84.199997,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8159714,1673371992330,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.7,65.410004,51.000000,,0,126
8159715,1673371992340,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.8,65.410004,51.000000,,0,126
8159716,1673371992350,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.8,65.410004,51.000000,,0,126
8159717,1673371992360,64.251831,20.245625,52.802200,7.737465,6.0,3387.195122,-3.7,65.410004,51.000000,,0,126


## Plotting

In [4]:
# Downsample df to every `skip`-th row (starting with the first); the plotting
# cells below draw from skip_df instead of the full frame.
skip = 10000
skip_df = df.iloc[::skip]

In [None]:
# plot is_training_data
# Line chart of the `is_training_data` column over rtctime, on the downsampled frame.
# NOTE(review): no cell visible in this notebook creates an `is_training_data`
# column on df/skip_df — confirm it is added elsewhere before running this cell.
fig = px.line(skip_df, x='rtctime', y='is_training_data')
fig.show()

In [None]:
# plot map
# Geo scatter of the downsampled points, coloured by track_id.
fig = px.scatter_geo(skip_df, lat="lat", lon="lon", color="track_id")

# scatter_geo draws on a geo subplot, not a mapbox one, so a mapbox_style
# setting does not affect it. Frame the view around the plotted points instead
# of showing the default whole-world view.
fig.update_geos(fitbounds="locations")

fig.show()

In [13]:
# Draw each track as a line on a dark tile map, one colour per track_id.
# No explicit centre is passed: Plotly Express then centres the map on the
# plotted lat/lon values. The previous hard-coded centre latitude of 41 lies
# far south of the latitudes shown in the data (about 64-69), which pushed the
# tracks outside the initial viewport.
# zoom=4 is the level that was effectively applied before (the layout update
# used to override the zoom=3 passed here).
fig = px.line_mapbox(skip_df, lat="lat", lon="lon", color="track_id", zoom=4, height=600)

# Dark basemap and no outer margins so the map fills the figure.
fig.update_layout(
    mapbox_style="carto-darkmatter",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

# Save a standalone interactive copy next to the notebook, then display inline.
fig.write_html("colored_map.html")
fig.show()