In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import datetime as dt
import pyarrow
import random

from pathlib import Path
import os
from collections import deque
from src.config.paths import ROOT_DIR, SAMPLE_DIR, PRICES_DIR, META_DIR

import src.fileutils as files
import src.visualization as viz
import src.process as process

In [2]:
stations_info_file = ROOT_DIR / 'data' / 'stations.csv'
sample_file_location = SAMPLE_DIR
sample_price_location = SAMPLE_DIR / 'prices'

RSEED = 42
random.seed(RSEED)
np.random.seed(RSEED)

In [3]:
# 2014\10\2014-10-26-prices.csv
# 2015\03\2015-03-29-prices.csv
# 2016\05\2016-05-01-prices.csv
# 2018\03\2018-03-25-prices.csv
# 2018\10\2018-10-28-prices.csv
# 2020\03\2020-03-29-prices.csv
# 2020\10\2020-10-25-prices.csv
# 2021\03\2021-03-28-prices.csv
# 2021\10\2021-10-31-prices.csv
# 2022\03\2022-03-27-prices.csv
# 2022\10\2022-10-30-prices.csv
# 2023\03\2023-03-26-prices.csv

In [4]:
def get_closing_prices(prices_df):

    return prices_df.groupby(level='station').tail(1)


def impute_closing_prices(new_prices, closing_prices):

    opening_prices = new_prices.groupby(level='station').head(1).reset_index(level=1)
    opening_prices = opening_prices.fillna(closing_prices.reset_index(level=1))

    # set the datetime index back to where it was and update the new prices with the opening prices
    opening_prices = opening_prices.set_index('date', append=True)
    new_prices.update(opening_prices, overwrite = False)
    return new_prices


def fill_missing_prices(prices_df, method='ffill'):
    prices_df[['diesel', 'e5', 'e10']] = prices_df \
        .groupby(level='station')[['diesel', 'e5', 'e10']] \
        .fillna(method=method)
    
    return prices_df

####################################

# INSTANTIATE EVERYTHING OF THIS
prices_meta = pd.DataFrame()
closing_prices = pd.DataFrame()
last_closing_prices = pd.DataFrame()


# Read Data
prices_df_raw = pd.read_csv(PRICES_DIR / '2020' / '03' / '2020-03-29-prices.csv')
prices_df_raw2 = pd.read_csv(PRICES_DIR / '2020' / '03' / '2020-03-30-prices.csv')
dus_stations = pd.read_csv(SAMPLE_DIR / 'stations' / 'stations_dus_plus.csv')

# Create a set of all UUIDs in the DUS subsample
dus_station_uuid = set(dus_stations.uuid)

####################################
# PROCESS DAY 1

# Drop the 'change' columns for now as they dont provide us with any insight. FUTURE FEATURE ENGINEERING
# First Processing Step for Day 1: Drop all but DUS, generate panel
prices_df = prices_df_raw.drop(columns=prices_df_raw.filter(like='change').columns)
prices_df = prices_df[prices_df.station_uuid.isin(dus_station_uuid)]

df = process.extend_panel(prices_df)
df = process.swap_sort_index(df)

if not last_closing_prices.empty:
        df = impute_closing_prices(df, last_closing_prices)
df = fill_missing_prices(df)

####################################
# POSTPROCESS DAY 1: GENERATE METADATA FOR THAT DAY

last_closing_prices = get_closing_prices(df)
# closing_prices.append_last_closing_prices()

####################################
# PROCESS DAY 2

# First Processing Step for Day 2: Drop all but DUS, generate panel
prices_df2 = prices_df_raw2.drop(columns=prices_df_raw2.filter(like='change').columns)
prices_df2 = prices_df2[prices_df2.station_uuid.isin(dus_station_uuid)]

df2 = process.extend_panel(prices_df2)
df2 = process.swap_sort_index(df2)

if not last_closing_prices.empty:
        df2 = impute_closing_prices(df2, last_closing_prices)
df2 = fill_missing_prices(df2)


In [5]:
# Features:
# relative prices
# opening hours + dummies
# holiday dummies
# school-holyday dummies
# crude oil
# with bins: change count/hour

# Meta
# Average Price per day (per product)
# Trade Frequency
# was this a holiday
# was this a schoolholiday
# year
# month
# day
# weekday
# average crude oil price that day

# Processing
# bin dates
# Split into 3 prices (more data but faster processing maybe?)
# make additional features independent  at first


In [35]:
#closing_prices = pd.DataFrame()
closing_prices = pd.concat([closing_prices, last_closing_prices], axis=0)
display(closing_prices)
closing_prices.tail(1).index.get_level_values(1)[0].date()

Unnamed: 0_level_0,Unnamed: 1_level_0,diesel,e5,e10
station,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2020-03-29 23:59:08+02:00,1.079,1.209,1.179
005056ba-7cb6-1ed2-bceb-7ef561844d2a,2020-03-29 23:59:08+02:00,1.079,1.219,1.189
005056ba-7cb6-1ed2-bceb-80c585ca6d2b,2020-03-29 23:59:08+02:00,1.079,1.209,1.179
005056ba-7cb6-1ed2-bceb-82ea369c0d2d,2020-03-29 23:59:08+02:00,1.079,1.219,1.189
005056ba-7cb6-1ed2-bceb-87f71ccd4d30,2020-03-29 23:59:08+02:00,1.079,1.219,1.189
...,...,...,...,...
fa6624d4-7bb9-4b17-9e56-31e0040428d1,2020-03-29 23:59:08+02:00,1.159,1.279,1.249
fbcdf8a7-b6ba-4ec3-ac4c-dde2f0f29934,2020-03-29 23:59:08+02:00,1.079,1.209,1.179
fcdaddc5-7dc1-49f9-8286-71e8664f9e17,2020-03-29 23:59:08+02:00,1.039,1.199,1.169
fd99c048-3b6b-4943-8b93-838daefba76b,2020-03-29 23:59:08+02:00,1.139,1.269,1.239


datetime.date(2020, 3, 29)

In [41]:
meta_dir = META_DIR
suffix = 'meta'

metadata = {
            'prices_metadata': closing_prices,
            'closing_prices' : df,
        }


if suffix:
    suffix = f"_{suffix}"
else:
    suffix = ''

for file_name, data in metadata.items():
    file_path = Path(meta_dir / f'{file_name}{suffix}.csv')
    print(file_path)

D:\repos\jurassic-juice-juggler\data_processed\stations\prices_metadata_meta.csv
D:\repos\jurassic-juice-juggler\data_processed\stations\closing_prices_meta.csv


In [7]:
# ADD PRICE CHANGES PER DAY FOR EACH STATION TO THE CLOSING TABLE
# CONVERT DATE TO ONLY DAY-DATE
# APPEND TO THE EXISTING 'CLOSING_PRICES.CSV'
from pathlib import Path
import pandas as pd
import os
from collections import deque

def save_closing_prices(df, file_path, date='date'):
    file_path = Path(file_path)
    
    # If the file doesn't exist, write the DataFrame to a new CSV file
    if not file_path.is_file():        
        df.to_csv(file_path, index=True)

    # If it does exist, compare the last line of the CSV File with the last line of the DataFrame df
    else:
        with open(file_path, "r") as file:
            last_line = deque(file, 1)[0]

        # Making sure the lines format is comparable 
        # CURRENTLY ONLY WORKS WITH DATE ON COLUMN INDEX 1
        old_timestamp = pd.to_datetime(last_line.split(',')[1])
        new_timestamp = pd.to_datetime(df[date].max())
        
        # If the new data is not already in the CSV File, append the DataFrame and safe the CSV file.
        if new_timestamp <= old_timestamp:
            print("Some data already exists in the CSV file. Data was not appended.")
        else:
            df.to_csv(file_path, mode='a', header=False, index=True)


closing_prices_path = META_DIR / 'closing_prices.csv'
save_closing_prices(closing_price, closing_prices_path)

NameError: name 'closing_price' is not defined

In [None]:
def get_meta_table(df):
    # if csv exists, just open that
    # if not create a new pd.DataFrame
    # add #observations
    # add 
    pass

In [None]:
# CREATE A METATABLE WITH DAILY SUMMARY:
# - ACTIVE STATIONS
# - NUMBER OF TIMESTAMPS
def add_to_meta_table():
    # open closing table file
    # append daily meta DataFrame
    # save file
    pass

In [None]:
# active_stations = prices_data09.station_uuid.unique()
# active_stations_sample = np.random.choice(active_stations, size=100)
# pds = prices_df.query('station_uuid in @active_stations_sample')
# pds



# create a table that carries all stations for each hour of the day

# group by the hour of the day, take the average price if a station is occuring more than once during that time

# if a station occurs, check the *change columns if its a 2 or a 3, and if yes, check if the price is actually different from the previous hour of if prices have just been re-reported

# if prices changed then make a 1 in the price-changed-dummies

# if there are multiple occurences of the same station within one hour, check which prices changed and make en entry for the respective dummy

# if there are multiple occurences of the same station within one hour, for each of the 3 fuel prices, count how often it changed

# take a batch for each hour of the day

# check 

# df = pd.merge(df, pds, how='left', on=['date', 'station_uuid']).set_index(['date', 'station_uuid'])