In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pyarrow
import random

from pathlib import Path
import os
from collections import deque
from config.paths import ROOT_DIR, SAMPLE_DIR, PRICES_DIR, META_DIR

import src.fileutils as files
import src.visualization as viz
import src.process as process

In [2]:
# Seed both RNGs up front so every sampling step in the notebook is repeatable.
RSEED = 42
random.seed(RSEED)
np.random.seed(RSEED)

# Input locations, all derived from the project-level path constants.
stations_info_file = ROOT_DIR / 'data' / 'stations.csv'
sample_file_location = SAMPLE_DIR
sample_price_location = SAMPLE_DIR / 'prices'

In [3]:
# Station table for the DUS sample.
dus_stations = pd.read_csv(SAMPLE_DIR / 'stations' / 'stations_dus_plus.csv')

# One fixed day of raw prices; the commented-out line picks a random day instead.
#prices_df_raw = pd.read_csv(files.pick_random_csv(PRICES_DIR, random_state=RSEED))
prices_df_raw = pd.read_csv(PRICES_DIR / '2014' / '06' / '2014-06-08-prices.csv')

# Quick look: three seeded sample rows, then the per-column summary.
display(prices_df_raw.sample(3, random_state=RSEED))
display(viz.nice_summary(prices_df_raw))

Unnamed: 0,date,station_uuid,diesel,e5,e10,dieselchange,e5change,e10change
10581,2014-06-08 09:50:01+02,e1a15081-2630-9107-e040-0b0a3dfe563c,1.299,1.519,1.479,1,1,1
16051,2014-06-08 12:06:01+02,51d4b46d-a095-1aa0-e100-80009459e03a,1.289,1.509,1.469,1,1,1
20006,2014-06-08 16:58:01+02,6caf8cd8-ca47-4e5e-bcbb-f5e16db190f4,1.399,1.609,1.569,1,1,1


Unnamed: 0,Columns,Dtype,nunique,Non-Null Count,Missing,Missing %,Zero Count,mean,std,min,25%,50%,75%,max
0,date,object,173,21069,-,-,0,-,-,-,-,-,-,-
1,station_uuid,object,12017,21069,-,-,0,-,-,-,-,-,-,-
2,diesel,float64,62,21069,-,-,0,1.36,0.06,-0.0,1.32,1.35,1.38,1.53
3,e5,float64,58,21069,-,-,0,1.56,0.14,-0.0,1.53,1.56,1.59,1.73
4,e10,float64,55,21069,-,-,0,1.49,0.26,-0.0,1.49,1.51,1.55,2.0
5,dieselchange,int64,3,21069,-,-,1086,1.01,0.42,0.0,1.0,1.0,1.0,3.0
6,e5change,int64,3,21069,-,-,1574,0.99,0.45,0.0,1.0,1.0,1.0,3.0
7,e10change,int64,3,21069,-,-,1963,0.97,0.47,0.0,1.0,1.0,1.0,3.0


In [4]:
# Create a set of all UUIDs in the DUS subsample
dus_station_uuid = set(dus_stations.uuid)

# Drop the 'change' columns for now as they don't provide us with any insight. FUTURE FEATURE ENGINEERING
prices_df = prices_df_raw.drop(columns=prices_df_raw.filter(like='change').columns)
# Keep only the rows that belong to stations in the DUS subsample
prices_df = prices_df[prices_df.station_uuid.isin(dus_station_uuid)]

# NOTE(review): both helpers live in src.process; the groupby below relies on
# the result having an index level named 'station' -- confirm against the module.
df = process.extend_panel(prices_df)
df = process.swap_sort_index(df)

# IF FIRST ROW EMPTY, USE PRICE FROM PREVIOUS DAY 'CLOSING_PRICES.CSV'

# Carry the last known price forward within each station.
# GroupBy.ffill() replaces the deprecated GroupBy.fillna(method='ffill').
df[['diesel', 'e5', 'e10']] = df.groupby(level='station')[['diesel', 'e5', 'e10']].ffill()

In [5]:
# SAVE THIS TO A FILE

df.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,diesel,e5,e10
station,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 09:50:01+02:00,1.289,1.519,1.479
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 09:54:01+02:00,1.289,1.519,1.479
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 10:00:54+02:00,1.289,1.519,1.479
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 10:22:01+02:00,1.289,1.519,1.479
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 10:30:01+02:00,1.289,1.519,1.479
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 10:46:01+02:00,1.289,1.519,1.479
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 10:54:01+02:00,1.289,1.509,1.469
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 10:58:01+02:00,1.289,1.509,1.469
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 11:14:01+02:00,1.289,1.509,1.469
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 11:22:01+02:00,1.289,1.509,1.469


In [6]:
# Number of distinct stations in the filtered price data.
print(prices_df.station_uuid.nunique())

# Last row of every station, with the second index level moved out into a column.
closing_price = (
    df.groupby(level='station')
    .tail(1)
    .reset_index(level=1)
)
display(closing_price)



101


Unnamed: 0_level_0,date,diesel,e5,e10
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
005056ba-7cb6-1ed2-bceb-7e82e4910d2a,2014-06-08 18:30:01+02:00,1.399,1.609,1.569
005056ba-7cb6-1ed2-bceb-7ef561844d2a,2014-06-08 18:30:01+02:00,1.399,1.609,1.569
005056ba-7cb6-1ed2-bceb-80c585ca6d2b,2014-06-08 18:30:01+02:00,1.399,1.609,1.569
005056ba-7cb6-1ed2-bceb-82ea369c0d2d,2014-06-08 18:30:01+02:00,1.299,1.519,1.479
005056ba-7cb6-1ed2-bceb-87f71ccd4d30,2014-06-08 18:30:01+02:00,1.289,1.519,1.479
...,...,...,...,...
fa6624d4-7bb9-4b17-9e56-31e0040428d1,2014-06-08 18:30:01+02:00,1.469,1.669,1.629
fcdaddc5-7dc1-49f9-8286-71e8664f9e17,2014-06-08 18:30:01+02:00,1.299,1.529,1.489
fd99c048-3b6b-4943-8b93-838daefba76b,2014-06-08 18:30:01+02:00,1.469,1.669,1.629
fdc30c82-9cdf-4fd9-a000-bc2bfe0fc7bf,2014-06-08 18:30:01+02:00,1.299,1.519,1.479


In [8]:
# ADD PRICE CHANGES PER DAY FOR EACH STATION TO THE CLOSING TABLE
# CONVERT DATE TO ONLY DAY-DATE
# APPEND TO THE EXISTING 'CLOSING_PRICES.CSV'
from pathlib import Path
import pandas as pd
import os
from collections import deque

def save_closing_prices(df, file_path, date='date'):
    """Write or append closing prices to a CSV file, skipping data already stored.

    A missing or empty file is created from ``df`` (header and index included).
    A file that holds only a header gets ``df`` appended. Otherwise the newest
    timestamp in ``df[date]`` is compared with the timestamp on the file's last
    data row, and ``df`` is appended only when it is strictly newer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; must contain the column named by ``date``.
    file_path : str or path-like
        Target CSV file.
    date : str, default 'date'
        Name of the timestamp column, both in ``df`` and in the CSV header.

    Returns
    -------
    None
        A message is printed when nothing was appended.
    """
    file_path = Path(file_path)

    # If the file doesn't exist, write the DataFrame to a new CSV file
    if not file_path.is_file():
        df.to_csv(file_path, index=True)
        return

    with open(file_path, "r") as file:
        header_line = file.readline()
        # Keep only the last non-blank data row; trailing blank lines are ignored.
        last_rows = deque((line for line in file if line.strip()), 1)

    # No data rows yet: there is nothing to compare against. Append below an
    # existing header, or start over with a header if the file is blank.
    if not last_rows:
        header_exists = bool(header_line.strip())
        df.to_csv(
            file_path,
            mode='a' if header_exists else 'w',
            header=not header_exists,
            index=True,
        )
        return

    # Find the timestamp field through the header instead of assuming a fixed
    # position; fall back to position 1 when the header does not name it.
    columns = header_line.strip().split(',')
    date_position = columns.index(date) if date in columns else 1

    old_timestamp = pd.to_datetime(last_rows[0].strip().split(',')[date_position])
    new_timestamp = pd.to_datetime(df[date].max())

    # If the new data is not already in the CSV file, append the DataFrame.
    if new_timestamp <= old_timestamp:
        print("Some data already exists in the CSV file. Data was not appended.")
    else:
        df.to_csv(file_path, mode='a', header=False, index=True)


# Persist this day's closing prices; save_closing_prices skips the write when
# the file's last row is not older than the newest timestamp in closing_price.
closing_prices_path = META_DIR / 'closing_prices.csv'
save_closing_prices(closing_price, closing_prices_path)

Some data already exists in the CSV file. Data was not appended.


In [None]:
def get_meta_table(df):
    """Placeholder for building a meta table from ``df``.

    Not implemented yet. Raises instead of silently returning ``None`` so an
    accidental call is noticed; defining the function no longer breaks the
    notebook with a SyntaxError.

    Raises
    ------
    NotImplementedError
        Always, until the function is written.
    """
    raise NotImplementedError("get_meta_table is not implemented yet")

In [15]:
# CREATE A METATABLE WITH DAILY SUMMARY:
# - ACTIVE STATIONS
# - NUMBER OF TIMESTAMPS
def add_to_meta_table():
    """Stub: does nothing yet and returns ``None``.

    Planned steps:
      1. open the closing table file
      2. append the daily meta DataFrame
      3. save the file
    """

In [16]:
# active_stations = prices_data09.station_uuid.unique()
# active_stations_sample = np.random.choice(active_stations, size=100)
# pds = prices_df.query('station_uuid in @active_stations_sample')
# pds



# create a table that carries all stations for each hour of the day

# group by the hour of the day, take the average price if a station is occurring more than once during that time

# if a station occurs, check the *change columns if it's a 2 or a 3, and if yes, check if the price is actually different from the previous hour or if prices have just been re-reported

# if prices changed then make a 1 in the price-changed-dummies

# if there are multiple occurrences of the same station within one hour, check which prices changed and make an entry for the respective dummy

# if there are multiple occurrences of the same station within one hour, for each of the 3 fuel prices, count how often it changed

# take a batch for each hour of the day

# check 

# df = pd.merge(df, pds, how='left', on=['date', 'station_uuid']).set_index(['date', 'station_uuid'])