In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


import plotly.express as px


# Notebook-wide plotting defaults: seaborn look first, then the default figure size.
sns.set(palette="muted", style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})


: 

In [None]:
# --- Load Data ---
# The dataset is selected by name; its CSV is read from ../data/<country>.csv.
country = "benin-malanville"
data_path = f"../data/{country}.csv"

df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first 5 rows
df.head(n=5)


In [None]:
# --- Basic Info ---
n_rows = len(df)
print("Shape:", df.shape)
df.info()

# Summary statistics, rendered with the notebook's rich display
summary = df.describe()
display(summary)

# Null count per column, largest first
missing = df.isna().sum().sort_values(ascending=False)
print("Missing values:\n", missing)

# Columns where more than 5% of the rows are null
heavy_missing = missing.loc[missing > 0.05 * n_rows]
print("\nColumns with >5% missing:\n", heavy_missing)


In [None]:
# --- Outlier detection, imputation and export ---
# Columns to check for outliers
key_cols = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]

# A row is treated as an outlier when |Z| exceeds this value in ANY key column.
Z_THRESHOLD = 3

# Absolute Z-scores, computed per column (zscore's default axis=0).
# nan_policy='omit' leaves NaNs out of the mean/std estimate; the NaN scores
# that remain compare as False below, so missing values are never flagged.
z_scores = np.abs(stats.zscore(df[key_cols], nan_policy='omit'))
outliers = (z_scores > Z_THRESHOLD).any(axis=1)
print(f"Number of outlier rows detected: {outliers.sum()}")

# Keep the non-outlier rows; .copy() so later edits never touch the raw `df`.
df_clean = df[~outliers].copy()

# Impute remaining missing values in key columns with the per-column median
# (medians are taken after outlier removal).
df_clean[key_cols] = df_clean[key_cols].fillna(df_clean[key_cols].median())

# One path variable, so the file that is written and the message that is
# printed cannot drift apart.
clean_path = f"../data/{country}_clean.csv"
df_clean.to_csv(clean_path, index=False)
print(f"Cleaned data saved to {clean_path}")


In [None]:
# Convert timestamp to datetime and use it as the index.
# Guarded so this cell can be re-run: once 'Timestamp' has become the index it
# is no longer a column, and an unguarded conversion would raise KeyError.
# A new frame is bound to `df_clean` instead of mutating it in place.
if "Timestamp" in df_clean.columns:
    df_clean = df_clean.assign(
        Timestamp=pd.to_datetime(df_clean["Timestamp"])
    ).set_index("Timestamp")

# Plot time series for solar radiation and temperature, one subplot per column
cols_to_plot = ["GHI", "DNI", "DHI", "Tamb"]
df_clean[cols_to_plot].plot(subplots=True, figsize=(14,10), title=f"{country.capitalize()} Solar & Temp Trends")
plt.show()


In [None]:
# Mean ModA / ModB reading for each value of the "Cleaning" column
module_cols = ["ModA", "ModB"]
clean_avg = df_clean.groupby("Cleaning")[module_cols].mean()

# Bar chart of the group means; the returned axes are used for the y-label
ax = clean_avg.plot.bar(title=f"{country.capitalize()} Module Performance vs Cleaning")
ax.set_ylabel("Average Module Reading (W/m²)")
plt.show()


In [None]:
# Correlation heatmap over irradiance and temperature columns
corr_cols = ["GHI", "DNI", "DHI", "TModA", "TModB", "Tamb"]
corr_matrix = df_clean[corr_cols].corr()
sns.heatmap(corr_matrix, cmap="coolwarm", annot=True)
plt.title(f"{country.capitalize()} Correlation Heatmap")
plt.show()

# Scatter plot examples, described as (x column, y column, figure title)
scatter_specs = [
    ("WS", "GHI", f"{country.capitalize()} Wind Speed vs GHI"),
    ("RH", "Tamb", f"{country.capitalize()} Relative Humidity vs Temperature"),
]
for x_col, y_col, plot_title in scatter_specs:
    sns.scatterplot(data=df_clean, x=x_col, y=y_col, alpha=0.5)
    plt.title(plot_title)
    plt.show()


In [None]:
# Histograms of GHI and wind speed (WS).
# These are different physical quantities, so each gets its own axes; on a
# single shared x-axis the variable with the smaller range would be squeezed
# into a sliver and become unreadable.
fig, (ax_ghi, ax_ws) = plt.subplots(1, 2, figsize=(14, 5))
df_clean["GHI"].hist(bins=30, alpha=0.7, ax=ax_ghi)
ax_ghi.set(title="GHI", xlabel="GHI", ylabel="Frequency")
df_clean["WS"].hist(bins=30, alpha=0.7, ax=ax_ws)
ax_ws.set(title="Wind Speed", xlabel="WS", ylabel="Frequency")
fig.suptitle(f"{country.capitalize()} Distribution of GHI and WS")
fig.tight_layout()
plt.show()

# Wind direction vs speed scatter
sns.scatterplot(data=df_clean, x="WD", y="WS", alpha=0.5)
plt.title(f"{country.capitalize()} Wind Direction vs Speed")
plt.show()


In [None]:
# Bubble chart of GHI against ambient temperature:
# marker area scales with RH (x0.5), marker colour encodes BP.
fig, ax = plt.subplots(figsize=(10, 6))
bubbles = ax.scatter(
    df_clean["Tamb"],
    df_clean["GHI"],
    s=df_clean["RH"] * 0.5,
    c=df_clean["BP"],
    cmap="viridis",
    alpha=0.5,
)
ax.set_xlabel("Ambient Temperature (°C)")
ax.set_ylabel("Global Horizontal Irradiance (GHI)")
ax.set_title(f"{country.capitalize()} GHI vs Temperature (Bubble size = RH, Color = BP)")
fig.colorbar(bubbles, ax=ax, label="Barometric Pressure (hPa)")
plt.show()
