In [None]:
# Exploratory Data Analysis (EDA) for Solar/Wind Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


# Load the raw measurements into df, parsing the 'Timestamp' column as datetimes
df = pd.read_csv('../data/sierraleone-bumbuna.csv', parse_dates=['Timestamp'])

# Summary Statistics & Missing-Value Report
n_rows = df.shape[0]
summary_stats = df.describe()

# Per-column count of missing cells, and the same figure as a percentage of all rows
missing_report = df.isnull().sum()
missing_pct = missing_report.div(n_rows).mul(100)

# Names of the columns where more than 5% of the rows are missing
cols_gt5pct_nulls = missing_pct.loc[missing_pct.gt(5)].index.tolist()

print("Summary Statistics:\n", summary_stats)
print("\nMissing Value Report:\n", missing_report)
print("\nColumns with >5% missing values:", cols_gt5pct_nulls)

# Outlier Detection & Basic Cleaning
key_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Impute missing values in key columns with the column median *before*
# computing Z-scores: stats.zscore propagates NaN by default, so a single
# missing value would turn every score in that column into NaN and none of
# its outliers would be flagged.
# The result is assigned back to df[col]; calling fillna(inplace=True) on the
# df[col] selection is chained assignment and does not reliably update df.
for col in key_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())

# Flag every row in which at least one key column has |Z| > 3
df_z = df[key_cols].apply(stats.zscore)
outlier_mask = (np.abs(df_z) > 3).any(axis=1)
print(f"\nNumber of outlier rows (|Z|>3): {outlier_mask.sum()}")

# Drop rows with outliers in key columns (copy so df_clean is independent of df)
df_clean = df[~outlier_mask].copy()

# Export cleaned DataFrame to CSV.
# The input is ../data/sierraleone-bumbuna.csv, so the output is named for the
# same site; the previous 'benin_clean.csv' target looked like a copy-paste
# slip that would overwrite another site's cleaned file.
df_clean.to_csv("../data/sierraleone_clean.csv", index=False)

# Time Series Analysis
# One panel per variable on a 2x2 grid; dict order matches the row-major
# order of the flattened axes (top-left, top-right, bottom-left, bottom-right).
panel_titles = {
    'GHI': 'GHI vs. Timestamp',
    'DNI': 'DNI vs. Timestamp',
    'DHI': 'DHI vs. Timestamp',
    'Tamb': 'Tamb vs. Timestamp',
}
fig, axs = plt.subplots(2, 2, figsize=(16, 10))
for panel, (column, panel_title) in zip(axs.ravel(), panel_titles.items()):
    df_clean.plot(x='Timestamp', y=column, ax=panel, title=panel_title)
plt.tight_layout()
plt.show()

Summary Statistics:
                            Timestamp            GHI            DNI  \
count                         525600  525600.000000  525600.000000   
mean   2022-04-30 12:00:30.000000768     201.957515     116.376337   
min              2021-10-30 00:01:00     -19.500000      -7.800000   
25%              2022-01-29 06:00:45      -2.800000      -0.300000   
50%              2022-04-30 12:00:30       0.300000      -0.100000   
75%              2022-07-30 18:00:15     362.400000     107.000000   
max              2022-10-30 00:00:00    1499.000000     946.000000   
std                              NaN     298.495150     218.652659   

                 DHI           ModA           ModB           Tamb  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      113.720571     206.643095     198.114691      26.319394   
min       -17.900000       0.000000       0.000000      12.300000   
25%        -3.800000       0.000000       0.000000      23.100000   
50%