## Import
After manually copying the content of the maintenance table into a new workbook (`AGOP_2024_copy.xlsx`), import the sheet with `pandas.read_excel`.

In [11]:
import pandas as pd

# Workbook holding the manually copied maintenance schedule
file_path = '/home/sdc/DR_DemandForecast/GencoData/AGOP_2024_copy.xlsx'

# The first four sheet rows are skipped; only spreadsheet columns A through M are read
df = pd.read_excel(
    file_path,
    usecols='A:M',
    skiprows=4,
)
    

Unnamed: 0,GENERATING UNIT,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,POWERSERAYA LTD,,,,,,,,,,,,
1,SER G1,01/01-----------------------------------------...,,,,,,,,,,,
2,SER G2,01/01-----------------------------------------...,,,,14/05------------------------------17/06,,,,,,,
3,SER G3,,,,10/04-----------------------------13/05,,,,,,,,
4,SERCCP1,14/01----------14/01,,04/03----------04/03\n12/03----------14/03,,03/05--------------05/05,,25/07----------------02/08,,,,,


## Get under-maintenance data

In [13]:
import pandas as pd
from datetime import datetime, timedelta
import re

def generate_date_range(start_date, end_date):
    """Return every day from ``start_date`` through ``end_date``, both included.

    The days are produced by adding whole-day offsets to ``start_date``.
    The result is empty when ``end_date`` is earlier than ``start_date``.
    """
    span_days = (end_date - start_date).days
    days = []
    for offset in range(span_days + 1):
        days.append(start_date + timedelta(days=offset))
    return days

def parse_maintenance_periods(maintenance_str, year=2024):
    """Expand the maintenance periods written in one schedule cell into single days.

    A period is written as ``DD/MM``, a run of two or more dashes, then ``DD/MM``
    (for example ``14/05------17/06``). A cell may contain several periods.

    Parameters
    ----------
    maintenance_str : str
        Raw cell text. Any non-string value (``None``, ``NaN``, a number) is
        treated as "no maintenance" rather than raising ``TypeError`` inside
        ``re.findall``.
    year : int, default 2024
        Calendar year applied to both the start and the end of every period.

    Returns
    -------
    list of datetime.datetime
        Every day of every period, end points included, in the order the
        periods appear in the text. A period whose end date is earlier than
        its start date contributes no days.

    Raises
    ------
    ValueError
        If a matched ``DD/MM`` is not a valid calendar date in ``year``.
    """
    if not isinstance(maintenance_str, str):
        return []

    periods = re.findall(r'(\d{2}/\d{2})-{2,}(\d{2}/\d{2})', maintenance_str)
    date_ranges = []
    for start_str, end_str in periods:
        # The cell text carries no year, so it is appended before parsing
        start_date = datetime.strptime(f"{start_str}/{year}", '%d/%m/%Y')
        end_date = datetime.strptime(f"{end_str}/{year}", '%d/%m/%Y')
        date_ranges.extend(generate_date_range(start_date, end_date))
    return date_ranges

# Column-wise accumulator for the long-format result: one entry per (day, unit)
new_data = {'Date': [], 'Generating Unit': [], 'Under Maintenance': []}

# Every column after the first one is scanned for maintenance text
schedule_columns = df.columns[1:]

for _, unit_row in df.iterrows():
    unit_name = unit_row['GENERATING UNIT']
    for schedule_col in schedule_columns:
        cell_text = unit_row[schedule_col]
        # Empty cells carry no maintenance period
        if pd.isna(cell_text):
            continue
        outage_days = parse_maintenance_periods(cell_text)
        # Keep the three lists aligned: one flag and one unit name per day
        new_data['Date'].extend(outage_days)
        new_data['Generating Unit'].extend([unit_name] * len(outage_days))
        new_data['Under Maintenance'].extend([1] * len(outage_days))

# Materialise the accumulated lists as a DataFrame and display it
maintenance_df = pd.DataFrame(new_data)
maintenance_df


Unnamed: 0,Date,Generating Unit,Under Maintenance
0,2024-01-01,SER G1,1
1,2024-01-02,SER G1,1
2,2024-01-03,SER G1,1
3,2024-01-04,SER G1,1
4,2024-01-05,SER G1,1
...,...,...,...
2515,2024-07-16,LTMS1,1
2516,2024-07-17,LTMS1,1
2517,2024-07-18,LTMS1,1
2518,2024-07-19,LTMS1,1


## Transform the data

Construct a DataFrame with one row per date and one column per generating unit.

In [14]:
# Every calendar day of 2024, so that days without any maintenance still get a row
all_dates = pd.date_range(start='2024-01-01', end='2024-12-31')
year_df = pd.DataFrame(all_dates, columns=['Date'])

# One row per date, one column per generating unit: 1 = under maintenance, 0 = not
pivot_df = maintenance_df.pivot_table(index='Date', columns='Generating Unit', values='Under Maintenance', aggfunc='first', fill_value=0)

# pivot_table only emits dates on which at least one unit is under maintenance.
# Reindex onto the full year so the missing days appear as all-zero rows
# (0 rather than False, to keep the columns integer 0/1 flags).
# Note: any date outside all_dates is dropped by this step.
pivot_df = pivot_df.reindex(all_dates, fill_value=0)

# Name the date index and turn it back into a regular 'Date' column
pivot_df = pivot_df.rename_axis('Date').reset_index()
# Remove the name of the column index
pivot_df.columns.name = None
# Strip leading and trailing whitespace from the column names
pivot_df.columns = pivot_df.columns.str.strip()
pivot_df

Unnamed: 0,Date,TSRBLK1,EXON G3,EXON G4,EXON GA,EXON GB,HRSG 21,HRSG 22,JUR GT1,JUR GT2,...,TMUC G2,TUAS WTE,TUASCCP1,TUASCCP2,TUASCCP3,TUASCCP4,TUASCCP5,TUASGEN,TWTE G1,TWTE G2
0,2024-01-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,2024-01-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,2024-01-03,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
3,2024-01-04,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4,2024-01-05,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2024-12-27,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
362,2024-12-28,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
363,2024-12-29,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364,2024-12-30,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Export the date-by-unit maintenance table for review; the row index is not written
csv_file_path = '/home/sdc/DR_DemandForecast/GencoData/gen_maintain_view.csv'
pivot_df.to_csv(
    csv_file_path,
    index=False,
)