# Data extraction of trips

# Purpose
As a first step, the time series data is divided into trips as a form of data reduction. The energy consumption can then be calculated for each trip, together with other aggregated quantities such as mean values and standard deviations. These per-trip quantities will be used to analyze how much the trips differ from each other over the year.

# Methodology
* Find a good logical condition to distinguish between the various trips in the time series.

# Setup

In [None]:
#%load imports.py
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,3)

#import seaborn as sns
import os
from collections import OrderedDict

from IPython.display import display

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option("display.max_columns", None)

import folium
import plotly.express as px
import plotly.graph_objects as go

import sys
import os
sys.path.append('../')
from src.visualization import visualize
from src.data import prepare_dataset
from src.data import trips
import scipy.integrate
import seaborn as sns


In [None]:
# Load the 'tycho_short' dataset through the project helper.
# NOTE(review): n_rows presumably caps the number of rows that are read —
# confirm in src.data.prepare_dataset.
df = prepare_dataset.get_dataset(name='tycho_short', n_rows=200000)

## Plot maps

In [None]:
# Show the loaded data on a map with the project's plotting helper.
visualize.plot_map(df=df, width=1000, height=600, zoom_start=14)

## Identify trips

In [None]:
# Work on an independent copy of the first 5000 samples: the frame is sorted
# in place here and receives a new column in a later cell, and neither of
# those operations should act on a slice of ``df``.
df_ = df.iloc[0:5000].copy()
df_.sort_index(inplace=True)

# A pause longer than this between two consecutive samples separates two trips.
trip_separator = pd.Timedelta(seconds=20)

# The first sample after each pause is the start of a trip.
mask = df_.index.to_series().diff() > trip_separator
df_starts = df_.loc[mask].copy()

# The last sample before each pause is the end of a trip: the start mask
# shifted one step backwards. The element that wraps around to the last
# position is cleared, since the last sample is never followed by a pause.
mask = np.roll(mask.to_numpy(), -1)
mask[-1] = False
df_ends = df_.loc[mask].copy()

# Removing end of first incomplete trip
if (
    not df_starts.empty
    and not df_ends.empty
    and df_ends.index[0] < df_starts.index[0]
):
    df_ends = df_ends.iloc[1:].copy()

# Removing start of last incomplete trip (also when no end is left at all,
# which happens when the data contains a single pause).
if not df_starts.empty and (
    df_ends.empty or df_starts.index[-1] > df_ends.index[-1]
):
    df_starts = df_starts.iloc[0:-1].copy()

# Sanity check: every remaining start must be paired with exactly one end.
assert len(df_starts) == len(df_ends)

In [None]:
# Options shared by the three partial figures.
plot_options = dict(y='sog', template="plotly_dark", width=1500, height=400)

# 'sog' of all samples as a line, with the detected trip starts and trip ends
# as two sets of markers.
fig1 = px.line(df_, **plot_options)
fig2 = px.scatter(df_starts, **plot_options)
fig3 = px.scatter(df_ends, **plot_options)

# Only the traces (``.data``) of the partial figures are combined into the
# figure that is shown.
fig = go.Figure(data=fig1.data + fig2.data + fig3.data)
fig.show()

In [None]:
# Number the complete trips 0..n-1 in the order of their start times.
df_starts['trip_no'] = np.arange(len(df_starts), dtype=int)

# Start from "no trip" for every sample, so that the column exists even when
# no complete trip was found. NaN marks samples outside all trips, which makes
# the column float valued.
df_['trip_no'] = np.nan

# Only the time stamps are needed, so the index is iterated directly instead
# of materialising every row with ``iterrows``.
for trip_no, (start_time, end_time) in enumerate(zip(df_starts.index, df_ends.index)):
    in_trip = (start_time <= df_.index) & (df_.index <= end_time)
    df_.loc[in_trip, 'trip_no'] = trip_no

# Drop samples of unfinished trips. The copy makes ``df_2`` an independent
# frame, so that columns can be added to it later without touching ``df_``.
df_2 = df_.dropna(subset=['trip_no']).copy()

In [None]:
def _elapsed_since_first(trip_rows):
    """Return the index of *trip_rows* minus its first index value."""
    return trip_rows.index - trip_rows.index[0]


# One group per trip; the time elapsed since the first sample of the same trip
# is stored as 'trip_time'.
groups = df_2.groupby(by='trip_no')
df_2['trip_time'] = groups['trip_no'].transform(_elapsed_since_first)

In [None]:
# Options common to both figures.
plot_options = dict(y='sog', template="plotly_dark", color='trip_no', width=1500, height=400)

# 'sog' against the index of the frame, one coloured line per trip.
fig = px.line(df_2, **plot_options)
fig.show()

# The same lines against 'trip_time' instead of the index.
fig = px.line(df_2, x='trip_time', **plot_options)
fig.show()

In [None]:
# Peek at the first rows of the full dataset.
df.head()

In [None]:
#df_ = 
#df_ = calculate_rudder_angles(df=df)

## Use package implementation to do the same thing:

In [None]:
# Divide the full dataset into trips with the packaged implementation, using a
# two-minute separator.
# NOTE(review): later cells read 'trip_no' and 'trip_time' from the result —
# confirm that trips.divide adds both columns.
df_2 = trips.divide(df=df, trip_separator='0 days 00:02:00')

In [None]:
groups = df_2.groupby(by='trip_no')

# Down-sample every trip to 60 second means and drop rows with missing values.
df_3 = groups.resample('60S').mean()
df_3.dropna(inplace=True)

# The resampled frame is indexed by (trip_no, time). Remember the trip number
# of every row before the index is flattened to the time level only.
trip_numbers = df_3.index.get_level_values(0).to_numpy()
df_3.index = df_3.index.get_level_values(1)

# Time since the first resampled sample of the same trip. It is computed from
# the resampled time stamps themselves: a series computed on the raw samples
# carries the raw time stamps as index and would be aligned by label against
# the resampled index when assigned to ``df_3``.
resampled_time = df_3.index.to_series()
trip_start = resampled_time.groupby(trip_numbers).transform('min')
df_3['trip_time'] = resampled_time - trip_start

In [None]:
# 'sog' of the resampled trips against 'trip_time', one coloured line per trip.
fig = px.line(df_3, x='trip_time', y='sog', template="plotly_dark", color='trip_no', width=1500, height=400)
fig.show()

In [None]:
# Pick the trip with trip_no 1 from the current ``groups`` object and show it
# on the map. ``trip`` is also read by the following cells.
trip = groups.get_group(1)
visualize.plot_map(df=trip)

In [None]:
# Summary statistics of the selected trip.
trip.describe()

In [None]:
# Inspect the 'power_em_thruster_total' column of the selected trip.
trip['power_em_thruster_total']

In [None]:
def integrate_power(trip: pd.DataFrame) -> float:
    """Integrate the thruster power of one trip over the trip time.

    Parameters
    ----------
    trip : pd.DataFrame
        Samples of a single trip. Must contain the columns ``trip_time``
        (timedeltas) and ``power_em_thruster_total``.

    Returns
    -------
    float
        Simpson's rule integral of ``power_em_thruster_total`` over
        ``trip_time`` expressed in seconds, i.e. the unit of the power
        column multiplied by seconds.
    """
    t = pd.TimedeltaIndex(trip['trip_time']).total_seconds().to_numpy()
    power = trip['power_em_thruster_total'].to_numpy()

    # ``simps`` is the deprecated name of ``simpson`` and is no longer
    # available in current SciPy releases; fall back to it only when the
    # installed SciPy predates ``simpson``.
    simpson = getattr(scipy.integrate, 'simpson', None)
    if simpson is None:
        simpson = scipy.integrate.simps

    return simpson(y=power, x=t)

In [None]:
# Apply integrate_power to every trip group to get one integrated value per trip.
energy_em_thruster_total = groups.apply(func=integrate_power)

In [None]:
# Distribution of the integrated value over all trips: density histogram with
# 20 bins and a KDE curve.
ax = sns.histplot(data=energy_em_thruster_total, x=None, stat="density", bins=20, kde=True);
ax.set_xlabel('energy_em_thruster_total')

In [None]:
# Trip numbers with the lowest ("best") and the highest ("worst") integrated
# energy. idxmin/idxmax return the index label of the extreme value directly,
# so the series does not have to be sorted twice.
best = energy_em_thruster_total.idxmin()
worst = energy_em_thruster_total.idxmax()


In [None]:
# Show the trip with the highest integrated value on the map, coloured by the
# 'cog' column.
trip = groups.get_group(worst)
visualize.plot_map(df=trip, color_key='cog')

In [None]:
# Show the trip with the lowest integrated value on the map.
trip = groups.get_group(best)
visualize.plot_map(df=trip)

## Save trips
Save a new dataset with *trip_no*, *trip_time* and correct column names etc.

In [None]:
from azureml.core import Workspace, Dataset

# Connection details of the Azure ML workspace.
# NOTE(review): the subscription id is hard-coded in the notebook — consider
# reading it from configuration instead.
subscription_id = '3e9a363e-f191-4398-bd11-d32ccef9529c'
resource_group = 'demops'
workspace_name = 'D2E2F'

# Same dataset name as loaded at the top of the notebook; not referenced again
# in the visible cells.
name='tycho_short'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [None]:
def save(df=None, name=None):
    """Register the trip data as a tabular dataset in the Azure ML workspace.

    Parameters
    ----------
    df : pd.DataFrame, optional
        Frame to save, indexed by ``time`` and containing the columns
        ``trip_time`` and ``trip_no``. Defaults to the notebook's ``df_2``.
    name : str, optional
        Name under which the dataset is registered. Defaults to the
        notebook's ``new_name``.

    Returns
    -------
    The dataset object returned by
    ``Dataset.Tabular.register_pandas_dataframe``.
    """
    if df is None:
        df = df_2
    if name is None:
        name = new_name

    # Move the index into an ordinary 'time' column; ``reset_index`` returns
    # a new frame, so the caller's frame is left untouched.
    df_save = df.reset_index()

    # Time stamps and trip times are stored as text, trip numbers as integers.
    df_save['time'] = df_save['time'].astype(str)
    df_save['trip_time'] = df_save['trip_time'].astype(str)
    df_save['trip_no'] = df_save['trip_no'].astype(int)

    datastore = workspace.get_default_datastore()
    return Dataset.Tabular.register_pandas_dataframe(
        dataframe=df_save, target=datastore, name=name
    )


In [None]:
# Register the trip dataset only when the workspace does not already contain
# a dataset with this name.
new_name = 'tycho_short_id'
if new_name not in workspace.datasets:
    save()
    