In [76]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import datetime as dt
import pyarrow
import random

from pathlib import Path
import os
from collections import deque
from src.config.paths import ROOT_DIR, SAMPLE_DIR, PRICES_DIR, META_DIR, PROCESSED_PRICES

import src.fileutils as files
import src.visualization as viz
import src.process as process
from src import process_prices

from src.process_files import PriceProcessor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
# Path to data/stations.csv underneath the project root (ROOT_DIR).
stations_info_file = ROOT_DIR / 'data' / 'stations.csv'
# Sample directory and its 'prices' subfolder.
# NOTE(review): presumably hold a reduced copy of the raw data — confirm against src.config.paths.
sample_file_location = SAMPLE_DIR
sample_price_location = SAMPLE_DIR / 'prices'

# Seed handed to np.random.seed before the random demo data is generated.
RSEED = 42

In [78]:
# Instantiate the project's PriceProcessor restricted to a subset.
# NOTE(review): PRICES_DIR / PROCESSED_PRICES are presumably the input and output
# locations — confirm against src.process_files.PriceProcessor.
processor = PriceProcessor(PRICES_DIR, PROCESSED_PRICES, subset=[1,2], subset_index='3')
# Display the stored subset; the recorded output shows a (subset_index, set of values) tuple.
processor.subset

('3', {1, 2})

In [80]:
np.random.seed(RSEED)
import inspect

# Ten-row demo frame: one timestamp per day going backwards from "now",
# a label column, one random integer column and two random float columns.
# The random columns are drawn in the order integer -> float1 -> float2.
df = pd.DataFrame(dict(
    datetime=[dt.datetime.now() - dt.timedelta(days=day_offset) for day_offset in range(10)],
    string=[f'string{suffix}' for suffix in range(10)],
    integer=np.random.randint(0, 100, size=10),
    float1=np.random.rand(10),
    float2=np.random.rand(10),
))


# class return_one():
#     def __init__(self, data: pd.DataFrame, method=None):
#         self.one = 1
#         self.data = data
#         self.predefined_methods = {
#             'hourly': process_prices.make_hourly
#         }
#         self.set_method(method)
     
#     def set_method(self, method,):

#         if method is None:
#             self.method = None

#         elif type(method) == str:
#             if method not in self.predefined_methods:
#                 raise ValueError(f"{method} is not a a predefined method.")
#             self.method = self.predefined_methods[method]
        
#         elif inspect.isfunction(method):
#             params = inspect.signature(method).parameters.values()
#             if len(params) != 1:
#                 raise ValueError(f"{method} is required to take only 1 parameter but {len(params)} were given.")
#             self.method = method

#         else:
#             raise ValueError("Passed object is not a function or a predefined method")
    
#     def process_data(self, data: pd.DataFrame, method=None, **kwargs) -> pd.DataFrame:

#         if method:
#             self.set_method(method, **kwargs)
#         if self.method:
#             data = self.method(data, **kwargs)

#         else:
#             raise ValueError("The method process_data requires a method to be set, but None was given.")
#         return data
    
def some_function(data: pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``data`` whose ``float1`` column is multiplied by 10.

    Args:
        data: Frame containing a ``float1`` column.

    Returns:
        A new DataFrame; the input frame is left unmodified.

    Raises:
        KeyError: If ``data`` has no ``float1`` column.
    """
    scaled = data['float1'] * 10
    return data.assign(float1=scaled)

def some_other_function(data: pd.DataFrame, multiplier = 10, add = 3)->pd.DataFrame:
    """Return a copy of ``data`` with ``float1`` replaced by ``float1 * multiplier + add``.

    Args:
        data: Frame containing a ``float1`` column.
        multiplier: Factor applied to ``float1`` (default 10).
        add: Offset added after the multiplication (default 3).

    Returns:
        A new DataFrame; the input frame is left unmodified.

    Raises:
        KeyError: If ``data`` has no ``float1`` column.
    """
    transformed = data['float1'] * multiplier + add
    return data.assign(float1=transformed)

# Keyword arguments matching some_other_function's optional parameters.
other_kwargs = {'multiplier': 10000, 'add': 7}

# First pass: hand a custom callable plus its keyword arguments to the processor.
# NOTE(review): presumably process_data forwards **other_kwargs to the callable — confirm in PriceProcessor.
# NOTE(review): `df` is rebound by each call, so re-running this cell without re-creating
# `df` feeds already-processed data back in.
df = processor.process_data(df, some_other_function, **other_kwargs)
# Second pass: select a method by name instead of passing a callable.
# NOTE(review): 'hourly' presumably resolves to a predefined method inside PriceProcessor — confirm.
df = processor.process_data(df, 'hourly')
# Bare expression so the notebook renders the resulting frame.
df


Unnamed: 0_level_0,string,integer,float1,float2
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-05-13 01:57:25.783320,string9,74,3049.42243,0.199674
2023-05-14 01:57:25.783320,string8,74,1841.045099,0.785176
2023-05-15 01:57:25.783320,string7,86,1825.249672,0.45607
2023-05-16 01:57:25.783320,string6,82,2130.391107,0.366362
2023-05-17 01:57:25.783320,string5,20,8331.426408,0.292145
2023-05-18 01:57:25.783320,string4,60,9706.098522,0.139494
2023-05-19 01:57:25.783320,string3,71,212.844943,0.611853
2023-05-20 01:57:25.783320,string2,14,7087.725778,0.291229
2023-05-21 01:57:25.783320,string1,92,6018.150117,0.431945
2023-05-22 01:57:25.783320,string0,51,8668.761458,0.524756


In [6]:
# Features:
# relative prices
# opening hours + dummies
# holiday dummies
# school-holiday dummies
# crude oil
# with bins: change count/hour

# Meta
# Average Price per day (per product)
# Trade Frequency
# was this a holiday
# was this a school holiday
# year
# month
# day
# weekday
# average crude oil price that day

# Processing
# bin dates
# Split into 3 prices (more data but faster processing maybe?)
# make additional features independent  at first


In [7]:
# ADD PRICE CHANGES PER DAY FOR EACH STATION TO THE CLOSING TABLE
# CONVERT DATE TO ONLY DAY-DATE
# APPEND TO THE EXISTING 'CLOSING_PRICES.CSV'
from pathlib import Path
import pandas as pd
import os
from collections import deque

def save_closing_prices(df, file_path, date='date'):
    """Write ``df`` to a CSV file, appending only when it contains newer data.

    Behaviour by state of ``file_path``:

    * missing or zero-byte file: ``df`` is written as a fresh CSV with header;
    * file holding only a header line that contains ``date``: ``df`` is appended;
    * otherwise the date in the file's last line is compared with
      ``df[date].max()``; ``df`` is appended (without header) only if its
      maximum date is strictly newer, else a message is printed and nothing
      is written.

    The position of the date field inside a CSV line is looked up in the
    file's header line. If the header does not contain ``date``, field
    index 1 is assumed (the layout produced by ``to_csv(index=True)`` when the
    date is the first data column).

    Args:
        df: DataFrame to persist; must contain the column named by ``date``
            whenever the target file already holds data.
        file_path: Target CSV path (``str`` or ``Path``).
        date: Name of the date column in ``df`` and in the CSV header.

    Returns:
        None.

    Note:
        Lines are split on plain commas, so quoted fields containing commas
        that precede the date column are not supported.
    """
    file_path = Path(file_path)

    # A missing or zero-byte file has no header yet, so write a complete CSV.
    if not file_path.is_file() or file_path.stat().st_size == 0:
        df.to_csv(file_path, index=True)
        return

    with open(file_path, "r") as file:
        header_line = file.readline()
        # A deque with maxlen=1 keeps only the final line while streaming the file.
        tail = deque(file, 1)

    # Locate the date field from the header instead of hard-coding its position.
    header_fields = [field.strip() for field in header_line.split(',')]
    has_date_header = date in header_fields
    date_position = header_fields.index(date) if has_date_header else 1

    # Header present but no data rows yet: nothing to compare against.
    if not tail and has_date_header:
        df.to_csv(file_path, mode='a', header=False, index=True)
        return

    # Without a recognisable header a single-line file is treated as a data line.
    last_line = tail[0] if tail else header_line

    # Strip the field so a trailing newline (date in the last column) cannot
    # interfere with parsing.
    old_timestamp = pd.to_datetime(last_line.split(',')[date_position].strip())
    new_timestamp = pd.to_datetime(df[date].max())

    # If the new data is not already in the CSV file, append the DataFrame and save the CSV file.
    if new_timestamp <= old_timestamp:
        print("Some data already exists in the CSV file. Data was not appended.")
    else:
        df.to_csv(file_path, mode='a', header=False, index=True)

In [8]:
def get_meta_table(df):
    """Placeholder for obtaining the meta table; not implemented yet.

    At present the function ignores ``df`` and returns ``None``.

    Planned steps (TODO):
        1. If the CSV exists, just open that.
        2. If not, create a new ``pd.DataFrame``.
        3. Add the number of observations.
        4. Add further fields (left unspecified in the original notes).
    """
    return None

In [9]:
# CREATE A METATABLE WITH DAILY SUMMARY:
# - ACTIVE STATIONS
# - NUMBER OF TIMESTAMPS
def add_to_meta_table():
    """Placeholder for extending the meta table; not implemented yet.

    At present the function does nothing and returns ``None``.

    Planned steps (TODO):
        1. Open the closing table file.
        2. Append the daily meta DataFrame.
        3. Save the file.
    """
    return None

In [10]:
# active_stations = prices_data09.station_uuid.unique()
# active_stations_sample = np.random.choice(active_stations, size=100)
# pds = prices_df.query('station_uuid in @active_stations_sample')
# pds



# create a table that carries all stations for each hour of the day

# group by the hour of the day, take the average price if a station occurs more than once during that time

# if a station occurs, check whether the *change columns hold a 2 or a 3, and if so, check whether the price actually differs from the previous hour or whether prices have just been re-reported

# if prices changed then make a 1 in the price-changed-dummies

# if there are multiple occurrences of the same station within one hour, check which prices changed and make an entry for the respective dummy

# if there are multiple occurrences of the same station within one hour, for each of the 3 fuel prices, count how often it changed

# take a batch for each hour of the day

# check 

# df = pd.merge(df, pds, how='left', on=['date', 'station_uuid']).set_index(['date', 'station_uuid'])