# Sierra Leone – EDA

Exploratory data analysis for the Sierra Leone irradiance dataset.


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Notebook setup: plotting theme, data locations, and loading of the
# irradiance table into `df`.
sns.set_theme(style="whitegrid")
DATA_DIR = Path("../data").resolve()

CLEAN = DATA_DIR / "sierraleone_clean.csv"
RAW = DATA_DIR / "sierraleone-bumbuna.csv"

# Prefer the cached cleaned file; otherwise fall back to the raw export and
# clean it below.
from_cache = CLEAN.exists()
df = pd.read_csv(CLEAN if from_cache else RAW)

# CSV stores timestamps as text, so parse them on BOTH paths; otherwise `df`
# would carry string timestamps whenever the cached file is used.
if "Timestamp" in df.columns:
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

if not from_cache:
    # basic cleaning similar to other notebooks
    irradiance_cols = [c for c in ["GHI", "DNI", "DHI"] if c in df.columns]
    for col in irradiance_cols:
        # Unparseable values become NaN; negative readings are floored at 0.
        df[col] = pd.to_numeric(df[col], errors="coerce").clip(lower=0)
    # Only pass columns that exist: dropna(subset=...) raises KeyError on a
    # missing label, which would defeat the column guards above.
    if irradiance_cols:
        df = df.dropna(subset=irradiance_cols, how="all")
    df = df.drop_duplicates()
    # Cache the cleaned table so later runs skip the cleaning step.
    df.to_csv(CLEAN, index=False)

df.head()


In [None]:
# Summary statistics for the numeric columns, transposed to one row per column.
# DataFrame.describe() accepts no `numeric_only` keyword (passing it raises
# TypeError); numeric columns are selected through `include` instead.
df.describe(include=["number"]).T


In [None]:
# Side-by-side horizontal boxplots, one panel per irradiance component.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4), sharex=False)
for ax, metric in zip(axes, ("GHI", "DNI", "DHI")):
    # A component that is absent from the data leaves its panel empty.
    if metric not in df.columns:
        continue
    sns.boxplot(data=df, x=metric, ax=ax, color="#60a5fa")
    ax.set_xlabel(f"{metric} (W/m²)")
    ax.set_title(f"{metric} distribution")
plt.tight_layout()
plt.show()


In [None]:
# GHI time series; skipped unless both required columns are available, in
# line with the column guards used by the other cells.
if {"Timestamp", "GHI"}.issubset(df.columns):
    # Timestamps read back from a CSV are plain strings; convert them (a no-op
    # for values that are already datetimes) so the x-axis is a real time axis.
    # `df` itself is left untouched.
    timestamps = pd.to_datetime(df["Timestamp"], errors="coerce")
    plt.figure(figsize=(10, 3))
    sns.lineplot(x=timestamps, y=df["GHI"], linewidth=1)
    plt.title("GHI over time – Sierra Leone")
    plt.ylabel("GHI (W/m²)")
    plt.tight_layout()
    # Render explicitly, as the boxplot cell does.
    plt.show()

