In [1]:
import pandas as pd
import os
import numpy as np

In [14]:
# Configuration for the hourly -> daily aggregation: every hourly Parquet file found in
# input_folder is aggregated by day and saved under the same file name in output_folder.

input_folder = '/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet'
output_folder = '/workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet_str'

# Create the output folder up front so that writing results cannot fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Keep only Parquet files: os.listdir also returns sub-directories and unrelated entries
# (for example .ipynb_checkpoints) that are not hourly data files. Sorting makes the
# processing order deterministic, because os.listdir returns entries in arbitrary order.
input_files = sorted(
    name for name in os.listdir(input_folder)
    if name.endswith('.parquet')
)

In [16]:
for file in input_files:
    # Source and destination use the same file name in their respective folders
    input_path = os.path.join(input_folder, file)
    output_path = os.path.join(output_folder, file)

    print(input_path)

    # Load the hourly Parquet file
    df = pd.read_parquet(input_path)

    # Combine 'Date' and 'Hour' into a single timestamp column, then discard the two sources
    stamp_text = df['Date'] + 'H' + df['Hour'].astype(str)
    df['datetime'] = pd.to_datetime(stamp_text, format='%Y-%m-%dH%H')
    df = df.drop(columns=['Date', 'Hour'])

    # Split the signed interchange into two non-negative columns:
    # positive values are kept as Export, negative values are sign-flipped into Import
    interchange = df['Total interchange']
    df['Export'] = np.where(interchange > 0, interchange, 0)
    df['Import'] = np.where(interchange < 0, -interchange, 0)

    # Sum each measure per calendar day (a plain daily total, not a running sum)
    value_columns = ['Demand', 'Net generation', 'Day-ahead demand forecast', 'Import', 'Export']
    day_key = df['datetime'].dt.date
    daily_df = df.groupby(day_key)[value_columns].sum().reset_index()

    # Apply the output schema: the group key becomes 'Date', the measures keep their names
    daily_df.columns = ['Date'] + value_columns

    # The group key holds plain date objects; store it as a proper datetime column
    daily_df['Date'] = pd.to_datetime(daily_df['Date'])

    # Write the daily aggregate as Parquet
    daily_df.to_parquet(output_path)

    print(f"Processed file: {file}. Output saved to: {output_path}")


/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet/CAL.parquet
Processed file: CAL.parquet. Output saved to: /workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet_str/CAL.parquet
/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet/CAR.parquet
Processed file: CAR.parquet. Output saved to: /workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet_str/CAR.parquet
/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet/CENT.parquet
Processed file: CENT.parquet. Output saved to: /workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet_str/CENT.parquet
/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet/FLA.parquet
Processed file: FLA.parquet. Output saved to: /workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet_str/FLA.parquet
/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet/MIDA.parquet
Processed file: MIDA.parquet. Output saved to: /workspace/VoltWise/Lambda/Local_code/Data

In [28]:
# Load the CAL daily aggregates from the old and the new pipeline output for comparison
old_daily_path = '/workspace/VoltWise/Lambda/Local_code/Data/Daily/Old_data_parquet/CAL.parquet'
new_daily_path = '/workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet/CAL.parquet'

old = pd.read_parquet(old_daily_path)
new = pd.read_parquet(new_daily_path)

In [30]:
# Render the `new` daily frame as the cell output (presumably the CAL file loaded in the
# previous cell; which frame is shown depends on the order in which cells were executed)
new

Unnamed: 0,Date,Demand,Net generation,Day-ahead demand forecast,Import,Export
0,2015-07-01,916513.0,760720.0,945537.0,154832.0,0.0
1,2015-07-02,964443.0,814879.0,1020739.0,149652.0,0.0
2,2015-07-03,904339.0,757794.0,933741.0,146587.0,0.0
3,2015-07-04,863264.0,691081.0,853655.0,172444.0,0.0
4,2015-07-05,801034.0,625964.0,811006.0,175343.0,0.0
...,...,...,...,...,...,...
2927,2023-07-06,778303.0,770749.0,745949.0,24215.0,30435.0
2928,2023-07-07,761592.0,748684.0,738549.0,36594.0,30000.0
2929,2023-07-08,719822.0,696553.0,704952.0,45093.0,37518.0
2930,2023-07-09,681133.0,671556.0,677770.0,41499.0,22554.0


In [31]:
# Render the `old` daily frame as the cell output, for a side-by-side look against `new`
# (which frame is shown depends on the order in which cells were executed)
old

Unnamed: 0,Date,Demand,Net generation
0,2015-07-01,916513.0,760720.0
1,2015-07-02,964443.0,814879.0
2,2015-07-03,904339.0,757794.0
3,2015-07-04,863264.0,691081.0
4,2015-07-05,801034.0,625964.0
...,...,...,...
2927,2023-07-06,778303.0,770749.0
2928,2023-07-07,761592.0,748684.0
2929,2023-07-08,719822.0,696553.0
2930,2023-07-09,681055.0,671478.0


In [5]:
# Load the CAL hourly data from the old and the new source folders to compare their schemas
old_hourly_path = '/workspace/VoltWise/Lambda/Local_code/Data/Hourly/Old_data_parquet/CAL.parquet'
new_hourly_path = '/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet/CAL.parquet'

old_hourly = pd.read_parquet(old_hourly_path)
new_hourly = pd.read_parquet(new_hourly_path)

In [8]:
# Print the column names, dtypes and non-null counts of the old hourly frame
old_hourly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70368 entries, 0 to 70367
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            70368 non-null  object 
 1   Hour            70368 non-null  object 
 2   region          70368 non-null  object 
 3   Demand          70356 non-null  float64
 4   Net generation  70336 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.7+ MB


In [9]:
# Print the column names, dtypes and non-null counts of the new hourly frame
# (compare the 'Hour' dtype with the old frame's summary)
new_hourly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70368 entries, 0 to 70367
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       70368 non-null  object 
 1   Hour                       70368 non-null  int64  
 2   Demand                     70368 non-null  float64
 3   Net generation             70368 non-null  float64
 4   Day-ahead demand forecast  70368 non-null  float64
 5   Total interchange          70368 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 3.2+ MB


In [10]:
# Re-save every hourly Parquet file with 'Hour' stored as a zero-padded two-character
# string (e.g. 5 -> '05'). Results are written to a separate folder under the same file
# name; the source files are left untouched.
folder_path = '/workspace/VoltWise/Lambda/Local_code/Data/Hourly/New_data_parquet/'
output_path = '/workspace/VoltWise/Lambda/Local_code/Data/Hourly/Int_conv/New_data_parquet/'

# Make sure the destination folder exists before anything is written into it
os.makedirs(output_path, exist_ok=True)

# Get a list of all Parquet files in the source folder
parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]

for file_name in parquet_files:
    # Read the Parquet file into a DataFrame
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_parquet(file_path)

    # Convert 'Hour' to text and left-pad with zeros to a width of 2 (vectorised)
    df['Hour'] = df['Hour'].astype(str).str.zfill(2)

    # Write the converted frame to the destination folder; os.path.join avoids the
    # doubled '/' that manual concatenation produced with the trailing-slash folder path
    df.to_parquet(os.path.join(output_path, file_name))

In [21]:
# Compare the daily CAL output written to New_data_parquet_str (`new`) against the
# daily CAL output in New_data_parquet (`old`)
str_daily_path = '/workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet_str/CAL.parquet'
previous_daily_path = '/workspace/VoltWise/Lambda/Local_code/Data/Daily/New_data_parquet/CAL.parquet'

new = pd.read_parquet(str_daily_path)
old = pd.read_parquet(previous_daily_path)