In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from windrose import WindroseAxes

# Set styles
sns.set(style='whitegrid')

# 2. Load Data
df = pd.read_csv('benin-malanville.csv')  # adjust path if needed

# 3. Summary Statistics & Missing Values
print(df.describe())
missing_report = df.isna().sum()
print("\nMissing Values:\n", missing_report[missing_report > 0])

# Highlight >5% missing
missing_percent = df.isna().mean() * 100
print("\n>5% Missing Columns:\n", missing_percent[missing_percent > 5])

# 4. Outlier Detection with Z-score
cols_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
Z_THRESHOLD = 3  # rows with |z| above this in any checked column are dropped

# nan_policy='omit' is required here: with scipy's default ('propagate') a
# single NaN in a column turns every z-score of that column into NaN, the
# comparison below is then False for all rows, and the column silently
# contributes no outliers. Missing cells still yield NaN z-scores (never
# flagged), so those rows survive and are imputed in step 5.
z_scores = df[cols_to_check].apply(zscore, nan_policy='omit')
outliers = (z_scores.abs() > Z_THRESHOLD).any(axis=1)
print(f"Outliers found: {outliers.sum()}")
df_clean = df[~outliers].copy()

# 5. Impute Missing Values with Median
# Medians are taken from the outlier-free rows; fillna with a Series fills
# each column with its own median.
column_medians = df_clean[cols_to_check].median()
df_clean[cols_to_check] = df_clean[cols_to_check].fillna(column_medians)

# 6. Save Cleaned CSV
df_clean.to_csv('benin-malanville-cleaned.csv', index=False)

# 7. Time Series Plot
# Parse the timestamps into a local frame (df_clean itself is left untouched)
# so the x-axis is a real datetime axis instead of one categorical position
# per raw string value, and sort so the line is drawn in chronological order.
ts_cols = ['GHI', 'DNI', 'DHI', 'Tamb']
ts_frame = (
    df_clean[['Timestamp'] + ts_cols]
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .sort_values('Timestamp')
)

fig, ax = plt.subplots(figsize=(14, 6))
# Plain matplotlib lines: sns.lineplot would, by default, aggregate per x value
# and bootstrap an error band for every series, which adds nothing for a raw
# time series and is very slow on large frames.
for col in ts_cols:
    ax.plot(ts_frame['Timestamp'], ts_frame[col], label=col)
ax.set_title('Benin - Irradiance and Temperature Over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Value')
ax.tick_params(axis='x', rotation=45)
ax.legend()
fig.tight_layout()
plt.show()

# 8. Cleaning Impact Analysis
# Only drawn when the dataset carries a 'Cleaning' column: bar chart of the
# mean ModA / ModB reading for each distinct 'Cleaning' value.
has_cleaning_flag = 'Cleaning' in df_clean.columns
if has_cleaning_flag:
    module_means = df_clean.groupby('Cleaning')[['ModA', 'ModB']].mean()
    module_means.plot(kind='bar')
    plt.title('Cleaning Effect on ModA & ModB')
    plt.show()

# 9. Correlation Heatmap
# Pairwise correlations between the irradiance columns and the TModA/TModB columns.
heatmap_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df_clean[heatmap_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# 10. Wind Rose
# Keep only rows where both direction and speed are present so every bar is
# built from complete (WD, WS) pairs.
wind = df_clean[['WD', 'WS']].dropna()
ax = WindroseAxes.from_ax()
ax.bar(wind['WD'], wind['WS'], normed=True, opening=0.8, edgecolor='white')
ax.set_title('Wind Rose')
ax.set_legend()
# Render and close the wind rose here; otherwise it stays the current axes and
# the histogram calls below would draw onto the polar wind-rose plot.
plt.show()

# 11. Histograms
# Separate panels on a fresh figure: GHI and WS have very different value
# ranges, so sharing one x-axis would squash the WS distribution.
fig, (ghi_ax, ws_ax) = plt.subplots(1, 2, figsize=(12, 4))
ghi_ax.hist(df_clean['GHI'], bins=30, alpha=0.6, label='GHI')
ghi_ax.set_xlabel('GHI')
ghi_ax.set_ylabel('Count')
ghi_ax.legend()
ws_ax.hist(df_clean['WS'], bins=30, alpha=0.6, label='WS', color='C1')
ws_ax.set_xlabel('WS')
ws_ax.set_ylabel('Count')
ws_ax.legend()
fig.suptitle('Distribution of GHI and WS')
fig.tight_layout()
plt.show()

# 12. RH and Temperature
# Scatter of RH against Tamb, with point colour encoding GHI.
rh_fig, rh_ax = plt.subplots()
sns.scatterplot(data=df_clean, x='RH', y='Tamb', hue='GHI', ax=rh_ax)
rh_ax.set_title('RH vs Tamb colored by GHI')
plt.show()

# 13. Bubble Chart
# GHI on x, Tamb on y; each marker's area is taken from the row's RH value.
bubble_fig, bubble_ax = plt.subplots()
bubble_ax.scatter(
    df_clean['GHI'],
    df_clean['Tamb'],
    s=df_clean['RH'],
    alpha=0.5,
)
bubble_ax.set_xlabel("GHI")
bubble_ax.set_ylabel("Tamb")
bubble_ax.set_title("GHI vs Tamb (Bubble size = RH)")
plt.show()


KeyboardInterrupt: 