In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import plotly.express as px
import os

# Set plot style
sns.set_theme(style="darkgrid")  # or "whitegrid", "ticks", etc.

sns.set_palette("husl")

# Load data
df = pd.read_csv(r'C:\Users\daniel.shobe\Desktop\schoolify\AI\solar-challenge-week1\data\togo-dapaong_qc.csv', parse_dates=['Timestamp'])




# 1. Summary Statistics & Missing-Value Report
print("Summary Statistics:")
print(df.describe())
print("\nMissing Values:")
missing = df.isna().sum()
print(missing)
print("\nColumns with >5% Missing Values:")
missing_pct = (missing / len(df)) * 100
print(missing_pct[missing_pct > 5])

# 2. Outlier Detection & Basic Cleaning
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
df[key_columns] = df[key_columns].apply(pd.to_numeric, errors='coerce')

# Compute Z-scores
z_scores = df[key_columns].apply(zscore, nan_policy='omit')
outliers = (z_scores.abs() > 3).any(axis=1)
print(f"Number of outlier rows (|Z|>3): {outliers.sum()}")

# Impute missing values with median for key columns
for col in key_columns:
    df[col].fillna(df[col].median(), inplace=True)

# Drop rows with missing Timestamp or critical columns
df.dropna(subset=['Timestamp'], inplace=True)

# Export cleaned DataFrame
os.makedirs('data', exist_ok=True)
df.to_csv('data/togo-dapaong-qc_clean.csv', index=False)
print("Cleaned DataFrame exported to data/togo-dapaong-qc_clean.csv")

# 3. Time Series Analysis
df['Month'] = df['Timestamp'].dt.month
df['Hour'] = df['Timestamp'].dt.hour

# Plot GHI, DNI, DHI, Tamb vs. Timestamp
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, col in zip(axes.flatten(), ['GHI', 'DNI', 'DHI', 'Tamb']):
    df.groupby(df['Timestamp'].dt.date)[col].mean().plot(ax=ax, title=f'Daily Average {col}')
    ax.set_xlabel('Date')
    ax.set_ylabel(col)
plt.tight_layout()
plt.savefig('togo_time_series.png')
plt.close()

# Monthly trends
fig, ax = plt.subplots(figsize=(10, 6))
df.groupby('Month')[['GHI', 'DNI', 'DHI']].mean().plot(kind='bar', ax=ax)
plt.title('Monthly Average Solar Irradiance')
plt.xlabel('Month')
plt.ylabel('Irradiance (W/m²)')
plt.savefig('togo_monthly_trends.png')
plt.close()

# 4. Cleaning Impact
cleaning_impact = df.groupby('Cleaning')[['ModA', 'ModB']].mean()
fig, ax = plt.subplots(figsize=(8, 6))
cleaning_impact.plot(kind='bar', ax=ax)
plt.title('Average ModA & ModB Pre/Post Cleaning')
plt.xlabel('Cleaning (0 = No, 1 = Yes)')
plt.ylabel('Measurement (W/m²)')
plt.savefig('togo_cleaning_impact.png')
plt.close()

# 5. Correlation & Relationship Analysis
fig, ax = plt.subplots(figsize=(10, 8))
correlation_matrix = df[['GHI', 'DNI', 'DHI', 'TModA', 'TModB']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.title('Correlation Heatmap')
plt.savefig('togo_correlation_heatmap.png')
plt.close()

# Scatter plots
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.scatterplot(data=df, x='WS', y='GHI', ax=axes[0])
sns.scatterplot(data=df, x='WSgust', y='GHI', ax=axes[1])
sns.scatterplot(data=df, x='WD', y='GHI', ax=axes[2])
plt.tight_layout()
plt.savefig('togo_scatter_plots.png')
plt.close()

# 6. Wind & Distribution Analysis
wd_bins = pd.cut(df['WD'], bins=16, labels=range(16))
ws_mean = df.groupby(wd_bins)['WS'].mean()
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})
theta = np.linspace(0, 2 * np.pi, 16, endpoint=False)
ax.bar(theta, ws_mean, width=2 * np.pi / 16)
plt.title('Wind Rose (Average Wind Speed by Direction)')
plt.savefig('togo_wind_rose.png')
plt.close()

# Histograms
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(df['GHI'], bins=30, ax=axes[0])
axes[0].set_title('GHI Distribution')
sns.histplot(df['WS'], bins=30, ax=axes[1])
axes[1].set_title('Wind Speed Distribution')
plt.tight_layout()
plt.savefig('togo_histograms.png')
plt.close()

# 7. Temperature Analysis
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='RH', y='Tamb', size='GHI', hue='GHI')
plt.title('RH vs. Tamb with GHI Bubble Size')
plt.savefig('togo_rh_tamb_scatter.png')
plt.close()

# 8. Bubble Chart
fig = px.scatter(df, x='GHI', y='Tamb', size='RH', color='RH',
                 title='GHI vs. Tamb with RH Bubble Size')
fig.write_xaxes(title='GHI (W/m²)')
fig.update_yaxes(title='Tamb (°C)')
fig.write({'showlegend': True})
fig.write_traces(marker=dict(sizemode='area', sizeref=df['RH'].max()/100))
fig.write('togo_bubble_chart.html')

Summary Statistics:
                           Timestamp            GHI            DNI  \
count                         525600  525600.000000  525600.000000   
mean   2022-04-25 12:00:30.000000768     230.555040     151.258469   
min              2021-10-25 00:01:00     -12.700000       0.000000   
25%              2022-01-24 06:00:45      -2.200000       0.000000   
50%              2022-04-25 12:00:30       2.100000       0.000000   
75%              2022-07-25 18:00:15     442.400000     246.400000   
max              2022-10-25 00:00:00    1424.000000    1004.500000   
std                              NaN     322.532347     250.956962   

                 DHI           ModA           ModB           Tamb  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      116.444352     226.144375     219.568588      27.751788   
min         0.000000       0.000000       0.000000      14.900000   
25%         0.000000       0.000000       0.000000      24.200000   
50% 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Cleaned DataFrame exported to data/togo-dapaong-qc_clean.csv


  ws_mean = df.groupby(wd_bins)['WS'].mean()


AttributeError: 'Figure' object has no attribute 'write_xaxes'