## Import
After manually copying the content of the maintenance table into a new workbook, `AGOP_2024_copy.xlsx`, import the sheet with `pd.read_excel`.

In [None]:
import os
from os.path import join
import pandas as pd
# Base directory for this notebook's files: the Excel workbook is read from here,
# and later cells also resolve the review CSV and the .env file against it.
ROOT = '/home/sdc/DR_DemandForecast/GencoData'

file_path = join(ROOT, 'AGOP_2024_copy.xlsx')
# Skip the first four rows of the sheet and keep only spreadsheet columns A..M.
# Later cells read the unit name from the 'GENERATING UNIT' column and treat
# every column after the first one as a month of maintenance entries.
df = pd.read_excel(file_path, skiprows=4, usecols='A:M')
#df.head()
    

## Get under-maintenance data

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import re

def generate_date_range(start_date, end_date):
    """Return every day from *start_date* through *end_date*, both inclusive.

    The result is a list stepping one day at a time from *start_date*. When
    *end_date* lies before *start_date* the list is empty.
    """
    span_days = (end_date - start_date).days
    days = []
    for offset in range(span_days + 1):
        days.append(start_date + timedelta(days=offset))
    return days

def parse_maintenance_periods(maintenance_str, year=2024):
    """Expand the maintenance periods found in a cell into individual days.

    A period is written as ``DD/MM`` followed by two or more hyphens and a
    second ``DD/MM`` (for example ``03/02--10/02``). A cell may contain
    several periods; text that does not match the pattern is ignored.

    Args:
        maintenance_str: Cell content to scan. Anything that is not a string
            (e.g. a numeric cell) is treated as containing no period.
        year: Calendar year attached to every ``DD/MM`` value. Defaults to
            2024, the year that was previously hard-coded.

    Returns:
        A list of ``datetime`` objects, one per day of every period, with
        both end points included. A period whose end precedes its start
        contributes no days.

    Raises:
        ValueError: If a matched ``DD/MM`` is not a valid date in *year*.
    """
    if not isinstance(maintenance_str, str):
        # re.findall would raise TypeError on numbers or timestamps; such a
        # cell cannot hold a "DD/MM--DD/MM" period anyway.
        return []

    periods = re.findall(r'(\d{2}/\d{2})-{2,}(\d{2}/\d{2})', maintenance_str)
    date_ranges = []
    for start_str, end_str in periods:
        start_date = datetime.strptime(f"{start_str}/{year}", '%d/%m/%Y')
        end_date = datetime.strptime(f"{end_str}/{year}", '%d/%m/%Y')
        date_ranges.extend(generate_date_range(start_date, end_date))
    return date_ranges

# Collect one (date, unit) pair for every day a unit is under maintenance.
month_columns = df.columns[1:]
maintenance_dates = []
maintenance_units = []

for _, record in df.iterrows():
    unit_name = record['GENERATING UNIT']
    for column in month_columns:
        cell = record[column]
        # Empty cells mean no maintenance is listed for that unit and month.
        if pd.isna(cell):
            continue
        days = parse_maintenance_periods(cell)
        maintenance_dates.extend(days)
        maintenance_units.extend([unit_name] * len(days))

# Long-format table: one row per (date, unit) with a constant flag of 1.
new_data = {
    'Date': maintenance_dates,
    'Generating Unit': maintenance_units,
    'Under Maintenance': [1] * len(maintenance_dates),
}
maintenance_df = pd.DataFrame(new_data)
#maintenance_df


## Transform the data

Construct a DataFrame with one row per date and one column per generating unit.

In [None]:
import datetime as dt

# Every calendar day of 2024. Used below so that days on which no unit is
# under maintenance still get a row instead of being absent from the output.
all_dates = pd.date_range(start='2024-01-01', end='2024-12-31')
year_df = pd.DataFrame(all_dates, columns=['Date'])

# One column per generating unit, one row per date; a cell is 1 when the unit
# is under maintenance on that date and 0 otherwise.
pivot_df = maintenance_df.pivot_table(index='Date', columns='Generating Unit', values='Under Maintenance', aggfunc='first', fill_value=0)

# pivot_table only emits dates that occur in maintenance_df, so extend the
# index to the whole year and mark the added days as 0 for every unit.
# reindex takes the (unnamed) index of all_dates, hence the explicit rename.
pivot_df = pivot_df.reindex(all_dates, fill_value=0).rename_axis(index='Date')

# Turn the 'Date' index back into an ordinary column
pivot_df.reset_index(inplace=True)

# Remove the name of the column index
pivot_df.columns.name = None

# Strip leading and trailing whitespace from the column names
pivot_df.columns = pivot_df.columns.str.strip()

# Keep plain dates (no time-of-day component) in the 'Date' column
pivot_df['Date'] = pd.to_datetime(pivot_df['Date']).dt.date
#pivot_df

In [None]:
# Save the pivoted table as a CSV file under ROOT for manual review;
# index=False keeps the DataFrame's row index out of the file.
csv_file_path = join(ROOT, 'gen_maintain_view.csv')
pivot_df.to_csv(csv_file_path, index=False)

## Save to DB

In [None]:

from sqlalchemy import create_engine, text
from dotenv import load_dotenv

from urllib.parse import quote

# Load the environment variables from the .env file
env_file = join(ROOT, '.env')
load_dotenv(env_file)

# Get the values of host, user, pswd and db from the environment variables;
# the schema is fixed to 'public'.
DBHOST = os.getenv('host')
DBUSER = os.getenv('user')
DBPSWD = os.getenv('pswd')
DBNAME = os.getenv('db')
SCHEMA = 'public'

# Fail early with a readable message: an unset variable would otherwise be
# formatted into the URL as the literal text "None".
_missing = [
    key
    for key, value in (('host', DBHOST), ('user', DBUSER), ('pswd', DBPSWD), ('db', DBNAME))
    if value is None
]
if _missing:
    raise RuntimeError(
        f"Missing database settings in {env_file} or the environment: {', '.join(_missing)}"
    )

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot be mistaken for URL delimiters.
# The 'options' query parameter sets the PostgreSQL search_path to SCHEMA.
engine = create_engine(
    f"postgresql://{quote(DBUSER, safe='')}:{quote(DBPSWD, safe='')}@{DBHOST}/{DBNAME}?options=-csearch_path%3D{SCHEMA}",
    echo=False,
)
conn = engine.connect()

In [None]:
# Write pivot_df to the table 'AGOP' over the open connection. if_exists='replace'
# drops an existing table of that name before inserting, and index=False leaves
# the DataFrame's row index out of the table.
pivot_df.to_sql('AGOP', con=conn, if_exists='replace', index=False)

In [None]:
# Commit the transaction opened on this connection, then close the connection.
conn.commit()
conn.close()