# Exploratory data analysis (EDA): trip statistics

# Purpose
Summary statistics have been generated for each trip. This notebook explores those statistics to look for patterns in the data.

# Methodology
* Load the statistics.
* Compute descriptive statistics.
* Make some seaborn plots.
* Generate a correlation heat map.

# Setup

In [None]:
# NOTE(review): this cell looks like the expansion of `%load imports.py`
# (the magic leaves the marker below behind) — confirm before editing here.
# %load imports.py
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size in inches (wide and short), applied both to matplotlib
# directly and through seaborn's own rc handling.
width, height = 20, 3
plt.rcParams["figure.figsize"] = (width, height)
sns.set(rc={'figure.figsize': (width, height)})

#import seaborn as sns
import os
from collections import OrderedDict

from IPython.display import display

# Display up to 999 rows and an unlimited number of columns when a frame is
# rendered. (max_columns used to be set to 999 and then immediately
# overridden with None; only the effective value is kept.)
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", None)

import folium
import plotly.express as px
import plotly.graph_objects as go

import sys
import os
sys.path.append('../')
from src.visualization import visualize
from src.data import get_dataset
from src.data import trips
import scipy.integrate



In [None]:
# Load the trip statistics through the project's dataset helper.
df_stat = get_dataset.trip_statistics()

In [None]:
# Size of the statistics table as (rows, columns).
df_stat.shape

In [None]:
# Column names, dtypes and non-null counts.
df_stat.info()

In [None]:
# Descriptive statistics per column (pandas summarises the numeric columns by default).
df_stat.describe()

In [None]:
# Peek at the first rows of the table.
df_stat.head()

## Time and direction matter

In [None]:
# Thruster power of every trip against its start time, coloured by direction.
grid = sns.relplot(
    data=df_stat,
    x='start_time',
    y='power_em_thruster_total',
    hue='trip_direction',
)

# Stretch the resulting figure into a wide, short strip.
fig = grid.ax.get_figure()
fig.set_size_inches(20, 3)

The analysis begins with the trips where `trip_direction=0`.

## Heat map

In [None]:
# Select the trips with trip_direction == 0 and drop the columns that should
# not take part in the correlation. Chaining (instead of an in-place drop)
# builds the frame in one expression without mutating any intermediate.
df = (
    df_stat
    .groupby(by='trip_direction')
    .get_group(0)
    .drop(columns=['end_time', 'trip_direction'])
)

# Express start_time as seconds elapsed since the first selected row so that
# it becomes an ordinary numeric column. The result is a Series that keeps
# the frame's index, so the assignment is index-aligned.
# NOTE(review): assumes start_time is a datetime column — confirm.
first_start = df['start_time'].iloc[0]
df['start_time'] = (df['start_time'] - first_start).dt.total_seconds()

# Absolute values: only the strength of a correlation matters here, not its sign.
corr = df.corr().abs()

# yticklabels is passed explicitly so that every row is labelled. The figure
# is resized only after plotting, exactly as before.
ax = sns.heatmap(corr, vmin=0, vmax=1, yticklabels=corr.index, cmap='Blues')
fig = ax.get_figure()
fig.set_size_inches(9, 9)

In [None]:
# Rank the other variables by |correlation| with the thruster power.
# The variable's own entry is removed by label rather than by position: after
# sorting, a tie with another perfectly correlated column could otherwise
# keep the self-correlation and discard a real variable instead.
target = 'power_em_thruster_total'
corr_power = corr.loc[target].drop(target).sort_values(ascending=False)
corr_power

In [None]:
# Keep only the variables whose |correlation| with the power exceeds 0.4.
mask = corr_power > 0.4
corr_power_important = corr_power.loc[mask].copy()
corr_power_important

In [None]:
# Pairwise view of the variables that passed the correlation threshold:
# scatter plots above the diagonal, density estimates below and on it.
important_columns = corr_power_important.index
g = sns.PairGrid(df[important_columns])
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=3, legend=False)