In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px

# Load the raw CSV, parsing the 'Timestamp' column into datetimes.
df = pd.read_csv('data/benin-malanville.csv', parse_dates=['Timestamp'])

# Descriptive statistics as produced by DataFrame.describe().
print(df.describe())

# Per-column percentage of missing values; only columns above 5% are shown.
missing_pct = df.isna().sum() / len(df) * 100
print('Missing >5%:', missing_pct[missing_pct > 5])

# Outlier detection
# Z-scores are computed per column over the non-missing values only.
# stats.zscore defaults to nan_policy='propagate', which makes every z-score
# of a column NaN as soon as that column contains one missing value, so no
# row could ever be flagged through that column. 'omit' ignores the NaNs
# when computing the mean/std and leaves NaN only at the missing positions.
key_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
z_scores = df[key_cols].apply(stats.zscore, nan_policy='omit')
# A row is an outlier when any key column has |z| > 3. NaN z-scores
# (missing readings) compare False here and are therefore not flagged.
outliers = (z_scores.abs() > 3).any(axis=1)
print(f'Outliers: {outliers.sum()}')

# Clean data
# Impute missing key-column values with each column's median. The result is
# assigned back instead of calling fillna(inplace=True) on df[col]: that is
# chained assignment, which pandas deprecates and which does not modify df
# when Copy-on-Write is enabled.
df[key_cols] = df[key_cols].fillna(df[key_cols].median())
# Drop the rows flagged above and write the cleaned frame without the index.
df_clean = df[~outliers].copy()
df_clean.to_csv('data/benin_clean.csv', index=False)

# Time series plot: one line per listed column against 'Timestamp',
# all drawn on a single 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))
timestamps = df_clean['Timestamp']
for series_name in ('GHI', 'DNI', 'DHI', 'Tamb'):
    ax.plot(timestamps, df_clean[series_name], label=series_name)
ax.set_title('Solar Irradiance and Temperature')
ax.legend()
plt.show()