In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pyarrow
import random

from pathlib import Path
from config.paths import ROOT_DIR, SAMPLE_DIR, PRICES_DIR

import src.fileutils as files
import src.visualization as viz
import src.process as process

In [2]:
# Input locations, built from the directories imported from config.paths.
stations_info_file = ROOT_DIR / 'data' / 'stations.csv'
sample_file_location = SAMPLE_DIR
sample_price_location = SAMPLE_DIR / 'prices'

# One seed for both global RNGs seeded here; the same value is also passed
# explicitly as `random_state` when the prices file is picked and sampled.
RSEED = 42
random.seed(RSEED)
np.random.seed(RSEED)

In [29]:
# Load one prices CSV from PRICES_DIR; the file is chosen by
# files.pick_random_csv, which receives RSEED as its random_state.
prices_df = pd.read_csv(files.pick_random_csv(PRICES_DIR, random_state=RSEED))
# Commented-out alternative that reads one fixed daily file directly:
#prices_sample = pd.read_csv(PRICES_DIR / '2021' / '08' / '2021-08-09-prices.csv')

# NOTE(review): per the summary output, `date` is loaded with dtype object
# (not parsed as datetime) and the price columns contain zeros — confirm
# how both are meant to be handled downstream.

# Preview three random rows, then the per-column summary from viz.nice_summary.
display(prices_df.sample(3, random_state=RSEED))
display(viz.nice_summary(prices_df))

Unnamed: 0,date,station_uuid,diesel,e5,e10,dieselchange,e5change,e10change
195420,2021-08-09 15:34:06+02,c83474cf-5736-43eb-adc5-91ccb28bb5e8,1.359,1.599,1.539,3,3,3
78104,2021-08-09 10:11:06+02,001e3111-58c3-4e99-b6d5-a8f244b579f3,1.449,1.659,1.599,0,1,1
269760,2021-08-09 18:34:08+02,9b7b1b32-4532-4e99-8b29-9edb64d44a0a,1.359,1.599,1.539,0,1,1


Unnamed: 0,Columns,Dtype,nunique,Non-Null Count,Missing,Missing %,Zero Count,mean,std,min,25%,50%,75%,max
0,date,object,1257,331244,-,-,0,-,-,-,-,-,-,-
1,station_uuid,object,14999,331244,-,-,0,-,-,-,-,-,-,-
2,diesel,float64,181,331244,-,-,33,1.39,0.05,-0.0,1.36,1.39,1.42,2.0
3,e5,float64,181,331244,-,-,4769,1.6,0.2,-0.0,1.59,1.62,1.65,2.07
4,e10,float64,176,331244,-,-,14572,1.49,0.32,-0.0,1.53,1.56,1.59,2.03
5,dieselchange,int64,4,331244,-,-,65371,0.89,0.61,0.0,1.0,1.0,1.0,3.0
6,e5change,int64,4,331244,-,-,67445,0.89,0.61,0.0,1.0,1.0,1.0,3.0
7,e10change,int64,4,331244,-,-,75399,0.86,0.61,0.0,1.0,1.0,1.0,3.0


In [47]:
def extend_panel(df, date='date', individual='station_uuid', names=('date', 'station')):
    """Expand a price table to one row per (timestamp, station) combination.

    The frame is first given a panel index via ``process.set_panel_index``.
    The unique timestamps and unique stations are then combined into their
    full cartesian product, and the frame is reindexed onto that product.
    Combinations that were not present in the input come back as all-NaN
    rows (which is also why integer columns are upcast to float).

    Parameters
    ----------
    df : pandas.DataFrame
        Price observations containing the `date` and `individual` columns.
    date : str, default 'date'
        Name of the timestamp column, forwarded to ``process.set_panel_index``.
    individual : str, default 'station_uuid'
        Name of the station identifier, forwarded to
        ``process.set_panel_index`` and ``process.get_unique_index``.
    names : sequence of str, default ('date', 'station')
        Level names of the resulting two-level MultiIndex.

    Returns
    -------
    pandas.DataFrame
        `df` reindexed onto the full timestamp x station MultiIndex.

    Notes
    -----
    The row count is ``n_timestamps * n_stations``, so the result can be very
    large for a full day of data.
    NOTE(review): the exact return types of the ``process`` helpers are not
    visible here; this assumes they return iterables accepted by
    ``pd.MultiIndex.from_product`` — confirm.
    """
    # Immutable default in the signature; copy to a list for pandas.
    level_names = list(names)

    panel = process.set_panel_index(df, date=date, individual=individual)
    timestamps = process.get_unique_timestamps(panel)
    stations = process.get_unique_index(panel, individual)

    full_index = pd.MultiIndex.from_product([timestamps, stations], names=level_names)
    return panel.reindex(full_index)

# Build the full timestamp x station panel from the raw prices.
# With 1257 unique timestamps and 14999 unique stations (see the summary
# above) this yields 1257 * 14999 = 18,853,743 rows, most of them NaN.
df = extend_panel(prices_df)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,diesel,e5,e10,dieselchange,e5change,e10change
date,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-08-09 00:01:07+02:00,00299b2b-4fcf-454a-9dfd-871a57bfe4ff,1.349,1.589,1.529,1.0,1.0,1.0
2021-08-09 00:01:07+02:00,65b83b5f-f0c5-4db0-8be1-e111cd066276,1.349,1.589,1.529,1.0,1.0,1.0
2021-08-09 00:01:07+02:00,3e8b54ba-efd2-48ea-b948-7104f540d930,1.429,1.659,1.599,1.0,0.0,0.0
2021-08-09 00:01:07+02:00,dfc32f99-aa65-4fd4-b16e-a2cc53f9be15,1.429,1.669,1.609,1.0,1.0,1.0
2021-08-09 00:01:07+02:00,a1af2da1-8095-43ff-9640-7c0260c00048,1.419,1.609,0.000,1.0,0.0,0.0
...,...,...,...,...,...,...,...
2021-08-09 23:59:08+02:00,998a6b7c-d1c9-4b3e-9c02-ac7196e5e9cb,,,,,,
2021-08-09 23:59:08+02:00,99eaeb13-893e-483e-9c26-2876d920a250,,,,,,
2021-08-09 23:59:08+02:00,a3c05496-1f6d-4d4a-a122-8cf4b464a0c1,,,,,,
2021-08-09 23:59:08+02:00,f2ef6fe5-9589-4b4e-8792-71f8b62f27c7,,,,,,


In [48]:
# Carry each station's last reported prices forward into the rows that the
# reindex filled with NaN. `groupby(...).ffill()` replaces the deprecated
# `groupby(...).fillna(method='ffill')` and gives the same result.
# Rows before a station's first report stay NaN, and the *change columns
# are deliberately not filled.
# NOTE(review): forward fill follows row order within each station; this
# assumes the timestamps level is in chronological order — confirm.
price_cols = ['diesel', 'e5', 'e10']
df[price_cols] = df.groupby(level='station')[price_cols].ffill()


In [58]:
# Move the station level (index position 1) out of the MultiIndex into a
# regular column, leaving a plain DatetimeIndex on `date`.
df = df.reset_index(level='station')

In [59]:
# Report index type, column dtypes and memory footprint of the flattened panel.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18853743 entries, 2021-08-09 00:01:07+02:00 to 2021-08-09 23:59:08+02:00
Data columns (total 7 columns):
 #   Column        Dtype  
---  ------        -----  
 0   station       object 
 1   diesel        float64
 2   e5            float64
 3   e10           float64
 4   dieselchange  float64
 5   e5change      float64
 6   e10change     float64
dtypes: float64(6), object(1)
memory usage: 1.1+ GB


In [17]:
# active_stations = prices_data09.station_uuid.unique()
# active_stations_sample = np.random.choice(active_stations, size=100)
# pds = prices_df.query('station_uuid in @active_stations_sample')
# pds



# create a table that carries all stations for each hour of the day

# group by the hour of the day, take the average price if a station occurs more than once during that time

# if a station occurs, check the *change columns for a 2 or a 3, and if found, check whether the price actually differs from the previous hour or whether prices have just been re-reported

# if prices changed then make a 1 in the price-changed-dummies

# if there are multiple occurrences of the same station within one hour, check which prices changed and make an entry for the respective dummy

# if there are multiple occurrences of the same station within one hour, for each of the 3 fuel prices, count how often it changed

# take a batch for each hour of the day

# check 

# Cross-join every timestamp with every station and index the result by
# (date, station), giving an index-only skeleton of the full panel.
# NOTE(review): `timestamps` and `stations` are not assigned at notebook
# level in any cell shown here (extend_panel only binds them as locals) —
# confirm where they come from, otherwise this cell raises NameError on a
# fresh Restart & Run All.
# NOTE(review): this rebinds `df`, replacing the panel built earlier in the
# notebook — confirm that is intended.
df = pd.merge(timestamps, stations, how='cross').set_index(['date', 'station'])
# Commented-out follow-up: left-join the sampled prices onto the skeleton.
# df = pd.merge(df, pds, how='left', on=['date', 'station_uuid']).set_index(['date', 'station_uuid'])

df

date,station
2021-08-09 00:01:07+02:00,00299b2b-4fcf-454a-9dfd-871a57bfe4ff
2021-08-09 00:01:07+02:00,65b83b5f-f0c5-4db0-8be1-e111cd066276
2021-08-09 00:01:07+02:00,3e8b54ba-efd2-48ea-b948-7104f540d930
2021-08-09 00:01:07+02:00,dfc32f99-aa65-4fd4-b16e-a2cc53f9be15
2021-08-09 00:01:07+02:00,a1af2da1-8095-43ff-9640-7c0260c00048
...,...
2021-08-09 23:59:08+02:00,998a6b7c-d1c9-4b3e-9c02-ac7196e5e9cb
2021-08-09 23:59:08+02:00,99eaeb13-893e-483e-9c26-2876d920a250
2021-08-09 23:59:08+02:00,a3c05496-1f6d-4d4a-a122-8cf4b464a0c1
2021-08-09 23:59:08+02:00,f2ef6fe5-9589-4b4e-8792-71f8b62f27c7


In [20]:

# Create a MultiIndex with all combinations of timestamps and stations.
# NOTE(review): `timestamps` and `stations` are not assigned at notebook
# level in any cell shown here — confirm their origin; on a fresh kernel
# this line raises NameError.
new_index = pd.MultiIndex.from_product([timestamps, stations], names=['date', 'station'])

# Reindex the prices frame onto the full index; missing combinations become NaN.
# NOTE(review): this overwrites the raw `prices_df`, so the cell is not
# re-runnable, and it repeats what extend_panel already does — consider
# calling extend_panel and binding the result to a new name instead.
prices_df = prices_df.reindex(new_index)

prices_df.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,diesel,e5,e10,dieselchange,e5change,e10change
date,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-08-09 00:01:07+02:00,00299b2b-4fcf-454a-9dfd-871a57bfe4ff,1.349,1.589,1.529,1.0,1.0,1.0
2021-08-09 00:01:07+02:00,65b83b5f-f0c5-4db0-8be1-e111cd066276,1.349,1.589,1.529,1.0,1.0,1.0
2021-08-09 00:01:07+02:00,3e8b54ba-efd2-48ea-b948-7104f540d930,1.429,1.659,1.599,1.0,0.0,0.0
2021-08-09 00:01:07+02:00,dfc32f99-aa65-4fd4-b16e-a2cc53f9be15,1.429,1.669,1.609,1.0,1.0,1.0
2021-08-09 00:01:07+02:00,a1af2da1-8095-43ff-9640-7c0260c00048,1.419,1.609,0.0,1.0,0.0,0.0
2021-08-09 00:01:07+02:00,4ffec7ae-f7c2-4da5-a2c4-5395dd03f833,1.389,1.619,1.559,1.0,1.0,1.0
2021-08-09 00:01:07+02:00,4c7b7a7f-d642-4bcd-8d29-eec72cbbd2a2,,,,,,
2021-08-09 00:01:07+02:00,15e3dded-c0f3-4fe0-8eff-f9ad6e28e6a8,,,,,,
2021-08-09 00:01:07+02:00,57f207da-80a0-4e61-b98d-d6eef0d79cfe,,,,,,
2021-08-09 00:01:07+02:00,a7b0746c-8c29-42b6-a7a0-5abcc6c177ea,,,,,,


In [None]:
# Build the full timestamp x station MultiIndex and an empty frame on it.
# NOTE(review): `timestamps` and `stations` are not assigned at notebook
# level in any cell shown here — confirm their origin.
# NOTE(review): the level names are capitalised ('Date', 'Station') while
# every other cell uses 'date' / 'station' — confirm which is intended.
multi_index = pd.MultiIndex.from_product([timestamps, stations], names=['Date', 'Station'])
# Commented-out follow-up: left-join the sampled prices onto the frame.
# df = pd.merge(df, pds, how='left', on=['date', 'station_uuid']).set_index(['date', 'station_uuid'])
# NOTE(review): rebinds `df` to a column-less frame, discarding the panel
# built earlier — confirm that is intended.
df = pd.DataFrame(index=multi_index)

df