In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import plotly.express as px
import os

# Plot theme: try the seaborn-compatible Matplotlib style first and switch to
# 'ggplot' if loading it raises OSError.
preferred_style, fallback_style = 'seaborn-v0_8', 'ggplot'
try:
    plt.style.use(preferred_style)
except OSError:
    plt.style.use(fallback_style)

# Default seaborn colour palette for the plots below.
sns.set_palette("husl")

# Load data.
# The CSV location can be overridden through the SOLAR_DATA_PATH environment
# variable so the script is not tied to one machine's directory layout; the
# original absolute path is kept as the default for backward compatibility.
data_path = os.environ.get(
    'SOLAR_DATA_PATH',
    r'C:\Users\daniel.shobe\Desktop\schoolify\AI\solar-challenge-week1\data\sierraleone-bumbuna.csv',
)
# Fail early with an actionable message instead of a bare path error.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Input CSV not found at {data_path!r}; "
        "set the SOLAR_DATA_PATH environment variable to the file's location."
    )
# 'Timestamp' is parsed as datetime because the .dt accessors are used on it later.
df = pd.read_csv(data_path, parse_dates=['Timestamp'])



# 1. Summary Statistics & Missing-Value Report
# Descriptive statistics as returned by DataFrame.describe().
print("Summary Statistics:", df.describe(), sep="\n")

# Number of missing cells per column.
missing = df.isna().sum()
print("\nMissing Values:", missing, sep="\n")

# Missing cells per column as a percentage of all rows; only columns above
# the 5% threshold are printed.
missing_pct = missing.div(len(df)).mul(100)
print("\nColumns with >5% Missing Values:", missing_pct.loc[missing_pct.gt(5)], sep="\n")

# 2. Outlier Detection & Basic Cleaning
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
# Coerce to numeric; entries that cannot be parsed become NaN.
df[key_columns] = df[key_columns].apply(pd.to_numeric, errors='coerce')

# Compute Z-scores per column before imputation, ignoring NaNs.
# Outlier rows are only counted here; they are not removed from df.
z_scores = df[key_columns].apply(zscore, nan_policy='omit')
outliers = (z_scores.abs() > 3).any(axis=1)
print(f"Number of outlier rows (|Z|>3): {outliers.sum()}")

# Impute missing values with each key column's median.
# The result is assigned back to df explicitly: calling
# df[col].fillna(..., inplace=True) operates on a temporary object, which
# emits a FutureWarning on recent pandas and does not update df at all under
# Copy-on-Write.
df[key_columns] = df[key_columns].fillna(df[key_columns].median())

# Drop rows with missing Timestamp
df.dropna(subset=['Timestamp'], inplace=True)

# Export cleaned DataFrame
os.makedirs('data', exist_ok=True)
cleaned_path = 'data/sierraleone-bumbuna_clean.csv'
df.to_csv(cleaned_path, index=False)
print(f"Cleaned DataFrame exported to {cleaned_path}")

# 3. Time Series Analysis
# Calendar features derived from the timestamp.
df['Month'] = df['Timestamp'].dt.month
df['Hour'] = df['Timestamp'].dt.hour

# Daily Time Series Plots: a 2x2 grid with one panel per variable, each
# showing the mean of that variable per calendar date.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
rows_by_date = df.groupby(df['Timestamp'].dt.date)
for panel, variable in zip(axes.ravel(), ('GHI', 'DNI', 'DHI', 'Tamb')):
    daily_mean = rows_by_date[variable].mean()
    daily_mean.plot(ax=panel, title=f'Daily Average {variable}')
    panel.set_xlabel('Date')
    panel.set_ylabel(variable)
fig.tight_layout()
fig.savefig('sierraleone_time_series.png')
plt.close(fig)

# Monthly Trends: grouped bars of the mean GHI, DNI and DHI for each value of
# the 'Month' column.
fig, ax = plt.subplots(figsize=(10, 6))
monthly_means = df.groupby('Month')[['GHI', 'DNI', 'DHI']].mean()
monthly_means.plot.bar(ax=ax)
ax.set_title('Monthly Average Solar Irradiance')
ax.set_xlabel('Month')
ax.set_ylabel('Irradiance (W/m²)')
fig.savefig('sierraleone_monthly_trends.png')
plt.close(fig)

# 4. Cleaning Impact
# Only produced when the dataset carries a 'Cleaning' column: compares the
# mean ModA/ModB readings for each distinct value of that column.
if 'Cleaning' in df.columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    cleaning_impact = df.groupby('Cleaning')[['ModA', 'ModB']].mean()
    cleaning_impact.plot.bar(ax=ax)
    ax.set_title('Average ModA & ModB Pre/Post Cleaning')
    ax.set_xlabel('Cleaning (0 = No, 1 = Yes)')
    ax.set_ylabel('Measurement (W/m²)')
    fig.savefig('sierraleone_cleaning_impact.png')
    plt.close(fig)

# 5. Correlation & Scatter Plots
# Annotated heatmap of the pairwise correlations between the irradiance
# columns and the TModA/TModB columns.
fig, ax = plt.subplots(figsize=(10, 8))
heatmap_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
correlation_matrix = df.loc[:, heatmap_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
fig.savefig('sierraleone_correlation_heatmap.png')
plt.close(fig)

# Scatter plots: GHI against each wind variable, one panel per variable.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for panel, wind_variable in zip(axes, ('WS', 'WSgust', 'WD')):
    sns.scatterplot(data=df, x=wind_variable, y='GHI', ax=panel)
fig.tight_layout()
fig.savefig('sierraleone_scatter_plots.png')
plt.close(fig)

# 6. Wind & Distribution Analysis
# Wind rose: mean wind speed (WS) per 22.5-degree direction sector of WD.
if 'WD' in df.columns:
    n_sectors = 16
    # Fixed sector edges over the full compass. Passing bins=16 to pd.cut
    # would instead split the *observed* WD range, so sectors would drift
    # whenever the data does not span exactly 0-360 degrees.
    sector_edges = np.linspace(0, 360, n_sectors + 1)
    wd_bins = pd.cut(df['WD'], bins=sector_edges, labels=range(n_sectors),
                     include_lowest=True)
    # observed=False keeps empty sectors so the result always has one entry
    # per bar position.
    ws_mean = df.groupby(wd_bins, observed=False)['WS'].mean()

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})
    # Compass orientation: 0 degrees (north) at the top, angles increasing clockwise.
    ax.set_theta_zero_location('N')
    ax.set_theta_direction(-1)
    # Bars are centred by default, so place them at the sector midpoints
    # rather than at the sector's lower edge.
    sector_width_deg = 360 / n_sectors
    theta = np.deg2rad(sector_edges[:-1] + sector_width_deg / 2)
    # Sectors without observations have a NaN mean; draw them as zero height.
    ax.bar(theta, ws_mean.fillna(0), width=2 * np.pi / n_sectors)
    ax.set_title('Wind Rose (Average Wind Speed by Direction)')
    fig.savefig('sierraleone_wind_rose.png')
    plt.close(fig)

# Histograms: 30-bin distributions of GHI and wind speed, side by side.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
histogram_specs = (('GHI', 'GHI Distribution'), ('WS', 'Wind Speed Distribution'))
for panel, (column, heading) in zip(axes, histogram_specs):
    sns.histplot(df[column], bins=30, ax=panel)
    panel.set_title(heading)
fig.tight_layout()
fig.savefig('sierraleone_histograms.png')
plt.close(fig)

# 7. Temperature Analysis
# RH against Tamb, with GHI mapped to both marker size and marker colour.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='RH', y='Tamb', size='GHI', hue='GHI', ax=ax)
ax.set_title('RH vs. Tamb with GHI Bubble Size')
fig.savefig('sierraleone_rh_tamb_scatter.png')
plt.close(fig)

# 8. Bubble Chart with Plotly
# GHI against Tamb, with RH mapped to both marker size and marker colour.
# RH is not one of the median-imputed key columns, and plotly raises a
# ValueError when a marker size is NaN, so rows missing any plotted value are
# dropped first.
bubble_df = df.dropna(subset=['GHI', 'Tamb', 'RH'])
fig = px.scatter(bubble_df, x='GHI', y='Tamb', size='RH', color='RH',
                 title='GHI vs. Tamb with RH Bubble Size')
fig.update_layout(showlegend=True)
# Scale marker area relative to the largest RH value in the plotted data.
fig.update_traces(marker=dict(sizemode='area', sizeref=bubble_df['RH'].max() / 100))
fig.write_html('sierraleone_bubble_chart.html')


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\daniel.shobe\\Desktop\\schoolify\\AI\\data\\sierraleone-bumbuna.csv'