In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pyarrow
import random

from pathlib import Path
import os
from collections import deque
from src.config.paths import ROOT_DIR, SAMPLE_DIR, PRICES_DIR, META_DIR

import src.fileutils as files
import src.visualization as viz
import src.process as process

In [2]:
stations_info_file = ROOT_DIR / 'data' / 'stations.csv'
sample_file_location = SAMPLE_DIR
sample_price_location = SAMPLE_DIR / 'prices'

RSEED = 42
random.seed(RSEED)
np.random.seed(RSEED)

In [3]:
# 2014\10\2014-10-26-prices.csv
# 2015\03\2015-03-29-prices.csv
# 2016\05\2016-05-01-prices.csv
# 2018\03\2018-03-25-prices.csv
# 2018\10\2018-10-28-prices.csv
# 2020\03\2020-03-29-prices.csv
# 2020\10\2020-10-25-prices.csv
# 2021\03\2021-03-28-prices.csv
# 2021\10\2021-10-31-prices.csv
# 2022\03\2022-03-27-prices.csv
# 2022\10\2022-10-30-prices.csv
# 2023\03\2023-03-26-prices.csv

In [16]:
#prices_df_raw = pd.read_csv(files.pick_random_csv(PRICES_DIR, random_state=RSEED))
prices_df_raw = pd.read_csv(PRICES_DIR / '2016' / '05' / '2016-05-01-prices.csv')
dus_stations = pd.read_csv(SAMPLE_DIR / 'stations' / 'stations_dus_plus.csv')



display(prices_df_raw.sample(3, random_state=RSEED))
display(viz.nice_summary(prices_df_raw))

Unnamed: 0,date,station_uuid,diesel,e5,e10,dieselchange,e5change,e10change
36563,2016-05-01 12:02:01+02,ac8fd271-e9ed-448c-9bbd-77f02d5bb80f,1.109,1.339,1.319,1,1,1
122702,2016-05-01 23:13:01+02,288137d4-d841-4777-b936-492bb6e07d5d,1.119,1.369,1.349,1,1,1
4818,2016-05-01 06:29:01+02,880e5d9f-64b7-4ae1-91fa-2e75589cb2ff,1.109,1.349,1.329,1,1,1


Unnamed: 0,Columns,Dtype,nunique,Non-Null Count,Missing,Missing %,Zero Count,mean,std,min,25%,50%,75%,max
0,date,object,1312,125913,-,-,0,-,-,-,-,-,-,-
1,station_uuid,object,14686,125913,-,-,0,-,-,-,-,-,-,-
2,diesel,float64,135,125913,-,-,56,1.08,0.06,-0.0,1.05,1.08,1.11,2.0
3,e5,float64,128,125913,-,-,1252,1.31,0.15,-0.0,1.29,1.32,1.35,2.0
4,e10,float64,136,125913,-,-,3825,1.27,0.23,-0.0,1.26,1.3,1.33,2.0
5,dieselchange,int64,4,125913,-,-,15154,1.11,0.76,0.0,1.0,1.0,1.0,3.0
6,e5change,int64,4,125913,-,-,13660,1.12,0.75,0.0,1.0,1.0,1.0,3.0
7,e10change,int64,4,125913,-,-,15901,1.1,0.75,0.0,1.0,1.0,1.0,3.0


In [17]:
# Create a set of all UUIDs in the DUS subsample
dus_station_uuid = set(dus_stations.uuid)

# Drop the 'change' columns for now as they dont provide us with any insight. FUTURE FEATURE ENGINEERING
prices_df = prices_df_raw.drop(columns=prices_df_raw.filter(like='change').columns)
prices_df = prices_df[prices_df.station_uuid.isin(dus_station_uuid)]

display(viz.nice_summary(prices_df))
display(prices_df.head(5))
display(prices_df.tail(5))

Unnamed: 0,Columns,Dtype,nunique,Non-Null Count,Missing,Missing %,Zero Count,mean,std,min,25%,50%,75%,max
0,date,object,385,1116,-,-,0,-,-,-,-,-,-,-
1,station_uuid,object,117,1116,-,-,0,-,-,-,-,-,-,-
2,diesel,float64,34,1116,-,-,0,1.08,0.06,0.99,1.04,1.07,1.11,1.29
3,e5,float64,33,1116,-,-,0,1.32,0.07,-0.0,1.28,1.31,1.35,1.52
4,e10,float64,31,1116,-,-,11,1.29,0.14,0.0,1.26,1.29,1.33,1.49


Unnamed: 0,date,station_uuid,diesel,e5,e10
585,2016-05-01 03:59:01+02,ee34d5a4-be1a-2a3a-e040-0b0a3dfe5d3f,1.069,1.309,1.289
669,2016-05-01 04:06:01+02,a0fe47f7-1f70-4786-95d2-edfcdfbe25f1,1.029,1.279,1.259
912,2016-05-01 05:02:01+02,44afad52-0417-459c-9951-8ba9457c2092,1.229,1.469,1.449
943,2016-05-01 05:02:01+02,748b6d98-64f6-4030-86c7-26887a915eb3,1.229,1.479,1.459
950,2016-05-01 05:02:01+02,7d09ea41-77b4-4e0a-b2e1-5611b990ca3c,1.229,1.469,1.449


Unnamed: 0,date,station_uuid,diesel,e5,e10
125040,2016-05-01 23:28:01+02,1fe898c5-3a58-4660-aa86-eab4c751a041,1.069,1.319,1.299
125078,2016-05-01 23:29:01+02,19a63af8-7a51-434f-b16e-162b8785a562,1.119,1.369,1.349
125087,2016-05-01 23:29:01+02,1fe898c5-3a58-4660-aa86-eab4c751a041,1.069,1.319,1.299
125220,2016-05-01 23:30:01+02,5bf85d09-ea6b-4146-b23f-4b902e2e1554,1.079,1.319,1.299
125311,2016-05-01 23:33:01+02,5bf85d09-ea6b-4146-b23f-4b902e2e1554,1.079,1.319,1.299


In [19]:
df = process.extend_panel(prices_df)
df = process.swap_sort_index(df)

# IF FIRST ROW EMPTY, USE PRICE FROM PREVIOUS DAY 'CLOSING_PRICES.CSV'

df[['diesel', 'e5', 'e10']] = df.groupby(level='station')[['diesel', 'e5', 'e10']].fillna(method='ffill')

ValueError: cannot handle a non-unique multi-index!

In [12]:
# SAFE THIS TO A FILE

type(df.index.get_level_values(1))

pandas.core.indexes.datetimes.DatetimeIndex

In [13]:
closing_price = df.groupby(level='station').tail(1)
print(prices_df.station_uuid.nunique())
closing_price = closing_price.reset_index(level=1)
display(closing_price)

115


Unnamed: 0_level_0,date,diesel,e5,e10
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2016-05-02 21:34:01+00:00,0.999,1.249,1.229
005056ba-7cb6-1ed2-bceb-7ef561844d2a,2016-05-02 21:34:01+00:00,1.059,1.319,1.299
005056ba-7cb6-1ed2-bceb-80c585ca6d2b,2016-05-02 21:34:01+00:00,1.059,1.319,1.299
005056ba-7cb6-1ed2-bceb-82ea369c0d2d,2016-05-02 21:34:01+00:00,1.069,1.319,1.299
005056ba-7cb6-1ed2-bceb-87f71ccd4d30,2016-05-02 21:34:01+00:00,1.079,1.329,1.309
...,...,...,...,...
fbcdf8a7-b6ba-4ec3-ac4c-dde2f0f29934,2016-05-02 21:34:01+00:00,1.159,1.399,1.379
fcdaddc5-7dc1-49f9-8286-71e8664f9e17,2016-05-02 21:34:01+00:00,1.179,1.429,1.409
fd99c048-3b6b-4943-8b93-838daefba76b,2016-05-02 21:34:01+00:00,1.159,1.399,1.379
fdc30c82-9cdf-4fd9-a000-bc2bfe0fc7bf,2016-05-02 21:34:01+00:00,1.059,1.289,1.269


In [14]:
# ADD PRICE CHANGES PER DAY FOR EACH STATION TO THE CLOSING TABLE
# CONVERT DATE TO ONLY DAY-DATE
# APPEND TO THE EXISTING 'CLOSING_PRICES.CSV'
from pathlib import Path
import pandas as pd
import os
from collections import deque

def save_closing_prices(df, file_path, date='date'):
    file_path = Path(file_path)
    
    # If the file doesn't exist, write the DataFrame to a new CSV file
    if not file_path.is_file():        
        df.to_csv(file_path, index=True)

    # If it does exist, compare the last line of the CSV File with the last line of the DataFrame df
    else:
        with open(file_path, "r") as file:
            last_line = deque(file, 1)[0]

        # Making sure the lines format is comparable 
        # CURRENTLY ONLY WORKS WITH DATE ON COLUMN INDEX 1
        old_timestamp = pd.to_datetime(last_line.split(',')[1])
        new_timestamp = pd.to_datetime(df[date].max())
        
        # If the new data is not already in the CSV File, append the DataFrame and safe the CSV file.
        if new_timestamp <= old_timestamp:
            print("Some data already exists in the CSV file. Data was not appended.")
        else:
            df.to_csv(file_path, mode='a', header=False, index=True)


closing_prices_path = META_DIR / 'closing_prices.csv'
save_closing_prices(closing_price, closing_prices_path)

OSError: Cannot save file into a non-existent directory: 'D:\repos\jurassic-juice-juggler\data_processed\stations'

In [None]:
def get_meta_table(df):
    # if csv exists, just open that
    # if not create a new pd.DataFrame
    # add #observations
    # add 
    pass

In [15]:
# CREATE A METATABLE WITH DAILY SUMMARY:
# - ACTIVE STATIONS
# - NUMBER OF TIMESTAMPS
def add_to_meta_table():
    # open closing table file
    # append daily meta DataFrame
    # save file
    pass

In [16]:
# active_stations = prices_data09.station_uuid.unique()
# active_stations_sample = np.random.choice(active_stations, size=100)
# pds = prices_df.query('station_uuid in @active_stations_sample')
# pds



# create a table that carries all stations for each hour of the day

# group by the hour of the day, take the average price if a station is occuring more than once during that time

# if a station occurs, check the *change columns if its a 2 or a 3, and if yes, check if the price is actually different from the previous hour of if prices have just been re-reported

# if prices changed then make a 1 in the price-changed-dummies

# if there are multiple occurences of the same station within one hour, check which prices changed and make en entry for the respective dummy

# if there are multiple occurences of the same station within one hour, for each of the 3 fuel prices, count how often it changed

# take a batch for each hour of the day

# check 

# df = pd.merge(df, pds, how='left', on=['date', 'station_uuid']).set_index(['date', 'station_uuid'])