In [1]:
import pandas as pd
import os
import sys
import subprocess
import papermill as pm
import time
# Step 1: Helper function for dataset inspection
def dataset_overview(df, name):
    """Print a quick textual summary of a DataFrame to standard output.

    The report consists of a banner carrying ``name``, the first five rows,
    the shape, the column dtypes, the per-column missing-value counts and
    the ``describe(include="all")`` statistics. If the frame has a
    ``datetime`` column, the minimum and maximum of that column are printed
    as the covered time range.

    Args:
        df: DataFrame to summarise.
        name: Label shown in the report banner.

    Returns:
        None. The function only prints.
    """
    rule = "=" * 40
    print(rule, f"Dataset Overview: {name}", rule, sep="\n")

    print("\nFirst 5 rows:")
    print(df.head())

    print("\nShape (rows x columns):", df.shape)

    # The remaining sections share a "heading, then table" layout. Each table
    # is built lazily so the work happens in the same order it is printed.
    report_sections = (
        ("\nData Types:", lambda: df.dtypes),
        ("\nMissing Values Per Column:", lambda: df.isnull().sum()),
        ("\nDescriptive Statistics:", lambda: df.describe(include="all")),
    )
    for heading, build in report_sections:
        print(heading)
        print(build())

    # The time range is only reported when a 'datetime' column is present.
    if "datetime" not in df.columns:
        return

    timestamps = df["datetime"]
    print("\nTime Range:")
    print("   Start:", timestamps.min())
    print("   End:  ", timestamps.max())

# Step 2: Read the three raw CSV files, parsing the "datetime" column as timestamps
print("\nLoading CSV files...\n")
consumption = pd.read_csv("bengaluru_consumption.csv", parse_dates=["datetime"])
solar = pd.read_csv("bengaluru_solar.csv", parse_dates=["datetime"])
weather = pd.read_csv("bengaluru_weather.csv", parse_dates=["datetime"])

# Step 3: Print an overview of each raw dataset before any transformation
dataset_overview(consumption, "Consumption")
dataset_overview(solar, "Solar")
dataset_overview(weather, "Weather")

# Step 4: Index every frame by its timestamp so it can be resampled
consumption = consumption.set_index("datetime")
solar = solar.set_index("datetime")
weather = weather.set_index("datetime")

# Step 5: Bring all three datasets onto a common 1-hour grid (mean per bin)
print("\nResampling all datasets to 1-hour frequency using mean aggregation...")
consumption_hourly = consumption.resample("1h").mean()
solar_hourly = solar.resample("1h").mean()
weather_hourly = weather.resample("1h").mean()

# Step 6: Show the first two rows of each resampled frame
print("\nSample from resampled data:")
print("\nConsumption (hourly):\n", consumption_hourly.head(2))
print("\nSolar (hourly):\n", solar_hourly.head(2))
print("\nWeather (hourly):\n", weather_hourly.head(2))

# Step 7: Outer-join on the timestamp index so no hour from any source is dropped
print("\nMerging all datasets on timestamp...")
merged_df = (
    consumption_hourly
    .join(solar_hourly, how="outer")
    .join(weather_hourly, how="outer")
)

# Step 8: Report the gaps present in the merged frame
print("\nMissing values before cleaning:")
print(merged_df.isna().sum())

# Step 9: Fill gaps forward first; the backward fill then covers any leading gaps
print("\nFilling missing values using forward fill and then backward fill...")
merged_df_cleaned = merged_df.ffill().bfill()

# Step 10: Confirm what (if anything) is still missing
print("\nMissing values after cleaning:")
print(merged_df_cleaned.isna().sum())

# Step 11: Write the cleaned, merged dataset to disk
output_path = "merged_energy_data_cleaned.csv"
merged_df_cleaned.to_csv(output_path)
print(f"\nCleaned dataset saved to: {output_path}")

# Step 12: Show the first rows of the final dataset
print("\nFinal Cleaned Dataset Preview:")
print(merged_df_cleaned.head())



# Automated chaining: run the later-phase notebooks with papermill, then
# serve the two Streamlit dashboards for a fixed 30-minute window.
#
# The notebooks are looked up relative to the parent of the current working
# directory. Paths are built component by component with os.path.join rather
# than by concatenating "\Phase2"-style fragments: "\P" and "\W" are invalid
# escape sequences in a normal string literal, and passing an already-absolute
# second component to os.path.join discards the first one.
base_dir = os.getcwd()
parent_path = os.path.dirname(base_dir)

# Automated to trigger week 2 script (executed copy is written to the cwd)
script_to_run_path = os.path.join(parent_path, "Phase2", "Capstone-Week-2.ipynb")
pm.execute_notebook(
    script_to_run_path,
    "Capstone-Week-2.ipynb",
    kernel_name="conda-base-py",
)

# Automated to trigger week 3,4 script
script_to_run_pathx = os.path.join(parent_path, "Phase2", "Capstone-Week3-Week4.ipynb")
pm.execute_notebook(
    script_to_run_pathx,
    "Capstone-Week3-Week4.ipynb",
    kernel_name="conda-base-py",
)

# Automated to trigger week 5 script
script_to_run_pathxx = os.path.join(parent_path, "Phase3", "Week5", "Energy_Analysis.ipynb")
pm.execute_notebook(
    script_to_run_pathxx,
    "Energy_Analysis.ipynb",
    kernel_name="conda-base-py",
)

# Automated to trigger dashboard
# NOTE(review): the executable and script paths below are absolute and
# machine-specific. They presumably correspond to <parent_path>/Phase3/Week6,
# but that cannot be confirmed from this file, so they are left unchanged.
d1 = None
d2 = None
try:
    d1 = subprocess.Popen([
        r"C:\Users\chnds\anaconda3\Scripts\streamlit.EXE",
        "run",
        r"C:\Users\chnds\Downloads\Capstone_Proj_Final\Capstone_Project\Phase3\Week6\dashboard1.py",
        "--server.port", "8501",
    ])

    d2 = subprocess.Popen([
        r"C:\Users\chnds\anaconda3\Scripts\streamlit.EXE",
        "run",
        r"C:\Users\chnds\Downloads\Capstone_Proj_Final\Capstone_Project\Phase3\Week6\dashboard2.py",
        "--server.port", "8502",
    ])

    # Keep both dashboards up for 30 minutes.
    time.sleep(1800)
finally:
    # Always stop whatever was started, even if the second launch failed or
    # the wait was interrupted, so no Streamlit server keeps holding its port.
    if d1 is not None:
        d1.terminate()
    if d2 is not None:
        d2.terminate()


Loading CSV files...

Dataset Overview: Consumption

First 5 rows:
             datetime  consumption_kwh
0 2018-01-01 00:00:00         1.667690
1 2018-01-01 03:00:00         1.675790
2 2018-01-01 06:00:00         1.812862
3 2018-01-01 09:00:00         1.435569
4 2018-01-01 12:00:00         0.933961

Shape (rows x columns): (14608, 2)

Data Types:
datetime           datetime64[ns]
consumption_kwh           float64
dtype: object

Missing Values Per Column:
datetime             0
consumption_kwh    730
dtype: int64

Descriptive Statistics:
                  datetime  consumption_kwh
count                14608     13878.000000
mean   2020-07-01 22:30:00         1.504266
min    2018-01-01 00:00:00         0.577567
25%    2019-04-02 11:15:00         1.308658
50%    2020-07-01 22:30:00         1.499950
75%    2021-10-01 09:45:00         1.692983
max    2022-12-31 21:00:00         2.809345
std                    NaN         0.281255

Time Range:
   Start: 2018-01-01 00:00:00
   End:   2022-1

Executing:   0%|          | 0/1 [00:00<?, ?cell/s]

Executing:   0%|          | 0/2 [00:00<?, ?cell/s]

Executing:   0%|          | 0/2 [00:00<?, ?cell/s]