# Benin EDA
Quick profiling, cleaning, and export of cleaned CSV (local only).

In [None]:
import sys
import os
import pandas as pd
import numpy as np

# Put 'src' on the import path (once) before importing ingest/preprocess,
# which are presumably project modules living there.
if 'src' not in sys.path:
    sys.path.append('src')

from ingest import load_all
import preprocess

# Load whatever load_all returns for 'data', then keep only the rows whose
# country is 'benin' and renumber them from zero.
df_all = load_all('data')
df = df_all.loc[df_all['country'] == 'benin'].reset_index(drop=True)
df.head()

In [None]:
# Run the project's quick preprocessing step, then profile the result.
df = preprocess.quick_preprocess(df)

# Descriptive statistics across all columns.
display(df.describe(include='all'))

# Share of missing values per column, highest first (top 20).
(
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Columns screened for Z-score outliers
cols = [
    'GHI', 'DNI', 'DHI',
    'ModA', 'ModB',
    'WS', 'WSgust',
]
def zf(d, cs, z=3.0):
    """Return the rows of *d* that are within *z* standard deviations on every column in *cs*.

    For each column, the mean and the population standard deviation
    (``ddof=0``) are computed on the full input frame, so the result does not
    depend on the order of *cs*. (Filtering column by column would recompute
    the statistics on an already-shrunken frame and could drop rows that are
    not outliers in the original data.)

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame; it is not modified.
    cs : iterable of column labels
        Columns to screen. Labels that are not columns of *d* are ignored.
    z : float, default 3.0
        Maximum allowed absolute Z-score.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained rows, with a fresh 0..n-1 index.

    Notes
    -----
    * A column whose standard deviation is zero or undefined (for example a
      constant or all-NaN column) is skipped, since no Z-score can be formed.
    * A row whose value is NaN in a screened column is dropped, because the
      ``<=`` comparison evaluates to False for NaN.
    """
    # Positional boolean mask; a plain array avoids any index alignment.
    keep = np.ones(len(d), dtype=bool)
    for c in cs:
        if c not in d:
            continue
        mu, sd = d[c].mean(), d[c].std(ddof=0)
        if pd.notnull(sd) and sd > 0:
            keep &= ((d[c] - mu).abs() <= z * sd).to_numpy()
    return d[keep].reset_index(drop=True)
# Apply the Z-score filter, then show the row counts before and after.
df_clean = zf(df, cols)
(len(df), len(df_clean))

In [None]:
# Time series and distributions
import plotly.express as px

metric = 'GHI'

# Line plot of the first 5000 rows once sorted by 'Timestamp'.
fig1 = px.line(
    df_clean.sort_values('Timestamp').head(5000),
    x='Timestamp',
    y=metric,
    title='Time Series (sample)',
)

# Histogram of the metric over the whole filtered frame.
fig2 = px.histogram(
    df_clean,
    x=metric,
    nbins=40,
    title='Histogram',
)

fig1.show()
fig2.show()

In [None]:
# Mean ModA/ModB per value of the 'Cleaning' flag; skipped when the frame
# has no 'Cleaning' column.
if 'Cleaning' in df_clean:
    display(
        df_clean.groupby('Cleaning')[['ModA', 'ModB']].mean()
    )

# Correlation heatmap (key variables)
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the key variables that are actually present in the frame.
sel = [
    name
    for name in ['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'Tamb', 'RH', 'WS', 'WD']
    if name in df_clean
]

plt.figure(figsize=(8, 6))
sns.heatmap(df_clean[sel].corr(), annot=False, cmap='viridis')
plt.show()

In [None]:
# Export cleaned CSV (local only)
out_path = os.path.join('data', 'benin_clean.csv')

# index=False keeps the row index out of the file.
df_clean.to_csv(out_path, index=False)

# Echo the destination path as the cell output.
out_path

## Notes & Observations (Benin)
- Highest average GHI among the three (≈235.93). Strong potential.
- DNI also leads (≈166.66). DHI comparable to Togo.
- Time series shows clear daytime peaks; distributions align with expected solar patterns.
- Cleaning likely improves module metrics (ModA/ModB); consider periodic maintenance.
- Proceed with site-level analysis for feasibility and infrastructure constraints.