In [4]:
import pandas as pd
import os

In [9]:
# Configuration for the hourly -> daily aggregation: every file found in
# raw_hourly_data is read one at a time, aggregated by day, and the result
# is saved under the same base name in daily_data.

input_folder = '/workspace/VoltWise/Data_Ingestion/raw_hourly_data'
output_folder = '/workspace/VoltWise/Data_Ingestion/daily_data'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort them so the files are
# processed (and logged) in the same order on every run.
input_files = sorted(os.listdir(input_folder))

In [34]:
# Aggregate every hourly parquet file into one daily-totals parquet file.
for file in input_files:
    input_path = os.path.join(input_folder, file)

    # Only regular .parquet files are valid inputs. Skip anything else that
    # os.listdir() returned (sub-directories, notebook checkpoints, stray
    # files) instead of letting read_parquet fail halfway through the run.
    stem, extension = os.path.splitext(file)
    if extension.lower() != '.parquet' or not os.path.isfile(input_path):
        print(f"Skipped non-parquet entry: {file}")
        continue

    # The daily file keeps the input's base name. splitext() strips only the
    # final extension, so names containing extra dots are not truncated and
    # cannot collide with each other.
    output_file = stem + '.parquet'
    output_path = os.path.join(output_folder, output_file)

    # Read the hourly parquet file
    df = pd.read_parquet(input_path)

    # Drop the unused 'region' column and merge the 'Date' and 'Hour' columns
    # into a single 'datetime' column. The string concatenation and format
    # string require both columns to hold strings ('YYYY-MM-DD' and 'HH').
    df = (
        df.drop(columns=['region'])
        .assign(
            datetime=lambda frame: pd.to_datetime(
                frame['Date'] + 'H' + frame['Hour'], format='%Y-%m-%dH%H'
            )
        )
        .drop(columns=['Date', 'Hour'])
    )

    # Sum 'Demand' and 'Net generation' over each calendar day
    daily_df = (
        df.groupby(df['datetime'].dt.date)[['Demand', 'Net generation']]
        .sum()
        .reset_index()
    )

    # Rename the columns to match the desired schema
    daily_df.columns = ['Date', 'Demand', 'Net_generation']

    # The group keys are plain date objects; store them as proper timestamps
    daily_df['Date'] = pd.to_datetime(daily_df['Date'])

    # Save the daily totals to the output parquet file
    daily_df.to_parquet(output_path)

    print(f"Processed file: {file}. Output saved to: {output_path}")


Processed file: CAL.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/CAL.parquet
Processed file: CAR.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/CAR.parquet
Processed file: CENT.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/CENT.parquet
Processed file: FLA.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/FLA.parquet
Processed file: MIDA.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/MIDA.parquet
Processed file: MIDW.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/MIDW.parquet
Processed file: NE.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/NE.parquet
Processed file: NY.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/NY.parquet
Processed file: SE.parquet. Output saved to: /workspace/VoltWise/Data_Ingestion/daily_data/SE.parquet
Processed file: SW.parquet. Output saved to: /workspace/VoltWise