# In [None]:
# TOGO EDA & CLEANING - togo_eda.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import os

# Set plotting style
sns.set(style="whitegrid")

# Load the raw Togo data; the Timestamp column is parsed to datetime on read
data_path = "../data/raw/togo-dapaong_qc.csv"
df = pd.read_csv(data_path, parse_dates=['Timestamp'])

# Quick overview
print("Initial data shape:", df.shape)
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() emits a stray "None".
df.info()
print(df.describe())

# Missing-value audit: count the NaNs in every column
missing = df.isnull().sum()
print("\nMissing values per column:\n", missing)

# Report only the columns whose NaN count exceeds 5% of the row count
threshold = len(df) * 0.05
high_missing = missing.loc[missing > threshold]
print("\nColumns with more than 5% missing values:")
print(high_missing)

# Outlier screening: z-score each of the selected columns
columns_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
zscore_columns = [f"{name}_zscore" for name in columns_to_check]

# NaNs are filled with the column median before the z-score is computed
for name, z_name in zip(columns_to_check, zscore_columns):
    filled = df[name].fillna(df[name].median())
    df[z_name] = zscore(filled)

# A row counts as an outlier when any checked column has |z| > 3
outliers = df[zscore_columns].abs().gt(3).any(axis=1)
print(f"Number of outlier rows detected: {outliers.sum()}")

# Keep only the non-outlier rows, as an independent copy of the raw frame
df_clean = df.loc[~outliers].copy()
print("Data shape after outlier removal:", df_clean.shape)

# Fill the remaining gaps in the checked columns with the post-removal median
for name in columns_to_check:
    df_clean[name] = df_clean[name].fillna(df_clean[name].median())

# The helper z-score columns are not needed in the cleaned frame
df_clean = df_clean.drop(columns=zscore_columns)

# Time series of the three irradiance columns, drawn on one shared figure
timestamps = df_clean['Timestamp']

plt.figure(figsize=(15, 5))
for component in ('GHI', 'DNI', 'DHI'):
    plt.plot(timestamps, df_clean[component], label=component)
plt.title('Solar Irradiance over Time - Togo')
plt.xlabel('Timestamp')
plt.ylabel('Irradiance (W/m²)')
plt.legend()
plt.show()

# Ambient temperature against the same timestamps, on its own figure
plt.figure(figsize=(15, 5))
plt.plot(timestamps, df_clean['Tamb'], color='orange')
plt.title('Ambient Temperature over Time - Togo')
plt.xlabel('Timestamp')
plt.ylabel('Ambient Temperature (°C)')
plt.show()

# Cleaning impact: mean ModA / ModB for each value of the Cleaning flag
module_cols = ['ModA', 'ModB']
cleaning_group = df_clean.groupby('Cleaning')[module_cols].mean()
print("\nAverage ModA and ModB grouped by Cleaning flag:")
print(cleaning_group)

# Bar chart of the grouped means (one bar group per Cleaning value)
cleaning_group.plot.bar()
plt.ylabel('Mean Irradiance (W/m²)')
plt.title('Average ModA & ModB Pre/Post Cleaning - Togo')
plt.show()

# Pairwise correlations between the selected variables
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'Tamb', 'RH', 'WS', 'BP']
corr_matrix = df_clean.loc[:, corr_cols].corr()

# Annotated heatmap; each cell shows the coefficient to two decimals
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Matrix - Togo')
plt.show()

# Scatter plots: one figure per (x column, y column, title) entry
scatter_specs = (
    ('WS', 'GHI', 'Wind Speed vs GHI - Togo'),
    ('RH', 'Tamb', 'Relative Humidity vs Ambient Temperature - Togo'),
)
for x_col, y_col, plot_title in scatter_specs:
    sns.scatterplot(data=df_clean, x=x_col, y=y_col)
    plt.title(plot_title)
    plt.show()

# Wind rose plot (simple polar scatter: angle = wind direction, radius = wind speed)
plt.figure(figsize=(8, 8))
ax = plt.subplot(111, polar=True)
# Matplotlib's polar axes default to 0° at East, increasing counter-clockwise.
# Re-orient them like a compass (0° at the top, increasing clockwise) so the
# plotted directions read as bearings.
# NOTE(review): assumes WD is a compass bearing in degrees (0° = North,
# clockwise) — confirm against the dataset description.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
# Rows without a wind direction are dropped; WS is aligned on the surviving index
wd_rad = np.deg2rad(df_clean['WD'].dropna())
ws = df_clean.loc[wd_rad.index, 'WS']
sc = ax.scatter(wd_rad, ws, c=ws, cmap='viridis', alpha=0.75)
plt.colorbar(sc, label='Wind Speed (m/s)')
ax.set_title('Wind Direction and Speed - Togo')
plt.show()

# Distribution of GHI: 50-bin histogram with a KDE curve overlaid
plt.figure(figsize=(8, 5))
ghi_values = df_clean['GHI']
sns.histplot(ghi_values, kde=True, bins=50)
plt.title('Histogram of GHI - Togo')
plt.show()

# Bubble chart of GHI against Tamb; RH drives both marker size and colour
humidity = df_clean['RH']

plt.figure(figsize=(10, 6))
plt.scatter(
    df_clean['Tamb'],
    df_clean['GHI'],
    s=humidity * 2,
    c=humidity,
    cmap='coolwarm',
    alpha=0.5,
)
plt.colorbar(label='Relative Humidity (%)')
plt.title('GHI vs Temperature with RH bubble size - Togo')
plt.xlabel('Ambient Temperature (°C)')
plt.ylabel('GHI (W/m²)')
plt.show()

# Persist the cleaned frame as CSV under the processed-data directory
processed_dir = "../data/processed"
clean_file_path = os.path.join(processed_dir, "togo_clean.csv")

# Create the target directory if it does not exist yet, then write without the index
os.makedirs(processed_dir, exist_ok=True)
df_clean.to_csv(clean_file_path, index=False)
print(f"Cleaned data saved to {clean_file_path}")
