In [21]:
CONFIG_FILE = "data/preprocess_config_v_0_1.py"

In [22]:
import json
import importlib
import os
import pandas as pd

# Load config file
config = importlib.import_module(CONFIG_FILE.replace("/", ".").replace(".py", ""))
importlib.reload(config)


<module 'data.preprocess_config_v_0_1' from '/Users/michaelbonon/Coding/stock-price-predictor/apps/stock-price-model/data/preprocess_config_v_0_1.py'>

In [23]:
# Check if config file has all the required attributes
print(dir(config))

required_attributes = [
    "DATA_VERSION_MAJOR",
    "DATA_VERSION_MINOR",
    "RAW_DATA_DIR",
    "RAW_DATA_FILENAMES",
    "OUTPUT_DIR",
    "TRANSFORMS",
    "CREATE_METADATA",
]

for attr in required_attributes:
    if not hasattr(config, attr):
        raise AttributeError(f"Attribute {attr} not found in config file.")
    
DATA_VERSION_MAJOR = config.DATA_VERSION_MAJOR
DATA_VERSION_MINOR = config.DATA_VERSION_MINOR
RAW_DATA_DIR = config.RAW_DATA_DIR
RAW_DATA_FILENAMES = config.RAW_DATA_FILENAMES
OUTPUT_DIR = config.OUTPUT_DIR
TRANSFORMS = config.TRANSFORMS
CREATE_METADATA = config.CREATE_METADATA

['CREATE_METADATA', 'DATA_VERSION_MAJOR', 'DATA_VERSION_MINOR', 'FEATURES', 'INDEX_COL', 'Metadata', 'OUTPUT_DIR', 'RAW_DATA_DIR', 'RAW_DATA_FILENAMES', 'REF_COLS', 'TARGET_COLS', 'TRANSFORMS', 'WINDOW_SIZE', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'create_metadata', 'create_windowed_dataset', 'pd', 'transform_data']


In [24]:
# Helper functions
def load_data(path: str):
    """
    Loads the data from the given path and converts the columns
    to the correct data types.
    """
    df = pd.read_csv(path, dtype=str)
    
    return df

def save_df_to_csv(df: pd.DataFrame, path: str):
    """
    Saves the dataframe to a CSV file.
    """
    df.to_csv(
        path, 
        float_format="%.2f", 
        index=False,
        mode="w",
    )

def preprocess_data(dir: str, filename: str, output_dir: str):
    """
    Preprocesses the data from the given path and saves the
    preprocessed data to a new file.
    """
    df = load_data(f"{dir}/{filename}")

    for transforms in TRANSFORMS:
        df = transforms(df)

    create_metadata = CREATE_METADATA
    metadata = create_metadata(df)

    version_dir = "v_{}_{}".format(DATA_VERSION_MAJOR, DATA_VERSION_MINOR)
    
    metadata_filepath = "{}/{}/{}".format(
        output_dir,
        version_dir,
        filename.replace(".csv", "_metadata.json"),
    )
    os.makedirs(os.path.dirname(metadata_filepath), exist_ok=True)

    with open(metadata_filepath, "w") as f:
        json.dump(metadata, f)
    
    filepath = "{}/{}/{}".format(
        output_dir,
        version_dir,
        filename
    )
    save_df_to_csv(df, filepath)

In [25]:
for filename in RAW_DATA_FILENAMES:
    preprocess_data(RAW_DATA_DIR, filename, OUTPUT_DIR)