# Data extraction of trips using Dask dataframe

# Purpose
As a first step, the time series data is divided into trips as a means of data reduction. The energy consumption can then be calculated for each trip, together with other aggregated quantities such as mean values and standard deviations. These statistics will be used to analyze how much the trips differ from each other over the year.

However, the data file is too large to fit in memory, so this solution uses a Dask dataframe instead.

# Methodology
* Loop over the Dask dataframe partitions and number the trips, saving the result to parquet in each iteration.

# Setup

In [None]:
#%load imports.py
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all modules before each cell is
# executed (mode 2), so edits to imported project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,3)

#import seaborn as sns
import os
from collections import OrderedDict

from IPython.display import display

# Show up to 999 rows when a dataframe is displayed in the notebook.
pd.options.display.max_rows = 999
# None means no limit on the number of displayed columns.
pd.set_option("display.max_columns", None)

import folium
import plotly.express as px
import plotly.graph_objects as go

import sys
import os
sys.path.append('../../../')
from src.visualization import visualize

sys.path.append('../../../src/models/pipelines/longterm/scripts/prepdata/trip_statistics')
import trip_statistics

import scipy.integrate
import seaborn as sns

import pyarrow as pa
import pyarrow.parquet as pq
import dask.dataframe

## Parameters

In [None]:
# Base name of the dataset; the input parquet path and the output statistics
# path used further down are both derived from it.
name = "tycho_short_parquet_id"

In [None]:
# NOTE(review): importing TimeoutError here shadows the builtin of the same
# name; neither it nor `progress` is used in the cells visible here.
from dask.distributed import Client, progress, TimeoutError

# Start a Dask distributed client. No scheduler address is given, so the
# keyword arguments describe the local workers to launch: four workers with
# one thread each and a memory limit of 2GB per worker.
client = Client(
    n_workers=4,
    threads_per_worker=1,
    memory_limit='2GB',
)
# Last expression of the cell: shows the client summary in the notebook.
client

In [None]:
# Input parquet dataset, named after the `name` parameter and resolved
# relative to the notebook's working directory.
file_path = f'{name}.parquet'
# Open the dataset as a Dask dataframe rather than loading it with pandas,
# because the data is larger than the available memory.
df = dask.dataframe.read_parquet(file_path)

In [None]:
# Output location passed to the trip statistics step.
path = f'{name}_statistics.parquet'
# Run the project's trip_statistics.process on the Dask dataframe.
# NOTE(review): presumably this numbers the trips partition by partition and
# writes the per-trip statistics to `path`, as outlined under Methodology --
# confirm against the trip_statistics module.
df_statistics = trip_statistics.process(df=df, path=path)

In [None]:
# Preview the first rows of the result as a quick sanity check
# (assumes trip_statistics.process returns a dataframe-like object -- TODO confirm).
df_statistics.head()