# Open Data PVNet (ODP) Community Notebook - Improved Version

This notebook demonstrates enhanced solar forecasting using the Open Data PVNet project with improved data handling and model evaluation.

In [None]:
# Environment setup and package verification
import sys
import warnings
import importlib

# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Distribution (pip) names of the packages this notebook relies on.
required_packages = [
    'pandas',
    'numpy',
    'scikit-learn',
    'matplotlib',
    'xarray'
]

# Some distributions are imported under a different module name than the one
# used to install them; list those exceptions here.
IMPORT_NAME_OVERRIDES = {
    'scikit-learn': 'sklearn',
}


def import_name_for(package):
    """Return the importable module name for a pip distribution name.

    Falls back to the distribution name itself when no override is registered.
    """
    return IMPORT_NAME_OVERRIDES.get(package, package)


print(f"Python version: {sys.version}\n")
print("Checking required packages:")
for package in required_packages:
    try:
        # Import by module name: importing 'scikit-learn' literally can never
        # succeed, which would report an installed package as missing.
        importlib.import_module(import_name_for(package))
    except ImportError:
        print(f"✗ {package} missing - please install it")
    else:
        print(f"✓ {package} installed")

In [None]:
# Improved data generation with realistic patterns
import pandas as pd
import numpy as np
import datetime as dt

def generate_solar_pattern(n_samples, max_power=1000, samples_per_day=48,
                           noise_fraction=0.1, seed=None):
    """Generate a synthetic solar generation series with a repeating daily cycle.

    Each day is a half-rectified sine wave (positive half kept, negative half
    clamped to zero) scaled to ``max_power``, with Gaussian noise added and the
    result clamped at zero again so generation is never negative.

    Args:
        n_samples: Number of samples to return. Values <= 0 yield an empty list.
        max_power: Peak value of the noise-free daily curve.
        samples_per_day: Number of samples making up one full daily cycle
            (48 corresponds to 30-minute resolution).
        noise_fraction: Standard deviation of the noise, as a fraction of
            ``max_power``. Use 0 for a noise-free curve.
        seed: Optional seed for a private random generator. When ``None`` the
            global ``np.random`` state is used, so ``np.random.seed`` still
            controls the output as before.

    Returns:
        A list of ``n_samples`` non-negative floats.

    Raises:
        ValueError: If ``samples_per_day`` is not positive.
    """
    if samples_per_day <= 0:
        raise ValueError("samples_per_day must be positive")
    if n_samples <= 0:
        return []

    # endpoint=False: the cycle is periodic, so the sample at 2*pi belongs to
    # the *next* day. Including it would shorten the step to 2*pi/(N-1) and
    # duplicate phase 0 at every day boundary.
    phase = np.linspace(0, 2 * np.pi, samples_per_day, endpoint=False)
    base_pattern = np.maximum(0, np.sin(phase)) * max_power

    # Number of whole days needed to cover n_samples (ceiling division).
    n_days = -(-n_samples // samples_per_day)

    # Both the np.random module and a Generator expose normal(loc, scale, size).
    rng = np.random if seed is None else np.random.default_rng(seed)
    noise = rng.normal(0, max_power * noise_fraction, n_days * samples_per_day)

    series = np.maximum(0, np.tile(base_pattern, n_days) + noise)
    return series[:n_samples].tolist()

# Create sample data with realistic patterns
# The timestamp column is labelled GMT, so take "now" in UTC rather than local
# time; tzinfo is dropped afterwards to keep the column timezone-naive.
end_time = dt.datetime.now(dt.timezone.utc).replace(
    minute=0, second=0, microsecond=0, tzinfo=None
)
start_time = end_time - dt.timedelta(days=7)  # One week of data
n_samples = 7 * 48  # 7 days of 30-min data
sample_interval = pd.Timedelta(minutes=30)

df = pd.DataFrame({
    # Use periods= instead of end=: date_range includes both endpoints, so a
    # start/end pair would give n_samples + 1 timestamps and the DataFrame
    # constructor would fail on the length mismatch with generation_mw.
    'datetime_gmt': pd.date_range(start=start_time, periods=n_samples, freq='30min'),
    'generation_mw': generate_solar_pattern(n_samples)
})

print("Sample PV Generation Data:")
print(df.head())
# Each timestamp marks the start of a 30-minute interval, so the covered span
# extends one interval past the last timestamp.
covered_span = df['datetime_gmt'].max() - df['datetime_gmt'].min() + sample_interval
print(f"\nDataset spans {covered_span.days} days")
print(f"Shape of dataset: {df.shape}")

# Visualize the generation pattern
import matplotlib.pyplot as plt

# Draw the full generation series against time on a single axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['datetime_gmt'], df['generation_mw'])
ax.set_title('Solar Generation Pattern')
ax.set_xlabel('Time')
ax.set_ylabel('Generation (MW)')
ax.grid(True)
# Slant the timestamp labels so they do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Enhanced feature engineering
def create_weather_features(n_samples):
    """Create synthetic weather features that follow a repeating daily cycle.

    Args:
        n_samples: Number of rows to generate (48 samples make up one cycle).

    Returns:
        A DataFrame with ``n_samples`` rows and three columns:
        ``temperature`` (sine cycle around 20 with amplitude 5, plus noise),
        ``cloud_cover`` (inverted sine cycle around 50, plus noise, clipped to
        the range 0-100) and ``radiation`` (output of
        ``generate_solar_pattern`` with ``max_power=1200``).
    """
    samples_per_day = 48
    # endpoint=False: the cycle is periodic, so the 2*pi sample belongs to the
    # next day. Including it would shorten the step to 2*pi/47 and repeat
    # phase 0 at every day boundary.
    phase = np.linspace(0, 2*np.pi, samples_per_day, endpoint=False)
    # One extra repetition guarantees the tiled cycle covers n_samples.
    n_days = n_samples // samples_per_day + 1

    # Temperature pattern (daily cycle + noise)
    temp_base = 20 + 5 * np.sin(phase)
    temperature = np.tile(temp_base, n_days)[:n_samples]
    temperature = temperature + np.random.normal(0, 2, n_samples)

    # Cloud cover (inverse correlation with generation)
    cloud_base = 50 - 30 * np.sin(phase)
    cloud_cover = np.tile(cloud_base, n_days)[:n_samples]
    cloud_cover = cloud_cover + np.random.normal(0, 10, n_samples)
    # Noise can push values outside the valid percentage range.
    cloud_cover = np.clip(cloud_cover, 0, 100)

    # Solar radiation (similar pattern to generation)
    radiation = generate_solar_pattern(n_samples, max_power=1200)

    return pd.DataFrame({
        'temperature': temperature,
        'cloud_cover': cloud_cover,
        'radiation': radiation
    })

# Generate one row of synthetic weather per timestamp and attach it column-wise
features_df = create_weather_features(n_samples=len(df))
df = pd.concat((df, features_df), axis=1)

# Derive calendar features from the timestamp column
hour_of_day = df['datetime_gmt'].dt.hour
df['hour'] = hour_of_day
# Daytime is hours 06 through 18, both bounds included
df['is_daytime'] = hour_of_day.between(6, 18)

print("Feature Statistics:")
summary = df.describe()
print(summary)

In [None]:
# Improved model training and evaluation
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Select the model inputs and the target series
feature_columns = ['temperature', 'cloud_cover', 'radiation', 'hour', 'is_daytime']
X, y = df[feature_columns], df['generation_mw']

# Expanding-window cross-validation: each fold trains on the past and
# evaluates on the block of samples that follows it.
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = {metric: [] for metric in ('mae', 'rmse', 'r2')}

for train_idx, test_idx in tscv.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # A fresh forest per fold; the fixed seed keeps runs repeatable.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    fold_mse = mean_squared_error(y_test, y_pred)
    cv_scores['mae'].append(mean_absolute_error(y_test, y_pred))
    cv_scores['rmse'].append(np.sqrt(fold_mse))
    cv_scores['r2'].append(r2_score(y_test, y_pred))

# Report mean and spread of every metric across the folds
print("Cross-validation Results:")
for metric, scores in cv_scores.items():
    fold_mean = np.mean(scores)
    fold_std = np.std(scores)
    print(f"{metric.upper()}: {fold_mean:.2f} ± {fold_std:.2f}")

# Bar chart of feature importances taken from `model`, i.e. the forest fitted
# in the last cross-validation fold.
importance = {
    name: score
    for name, score in zip(feature_columns, model.feature_importances_)
}
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(importance.keys(), importance.values())
ax.set_title('Feature Importance')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Compare actual and predicted generation over the final cross-validation fold
final_train_idx = test_idx[0]
test_times = df['datetime_gmt'].iloc[test_idx]

fig, ax = plt.subplots(figsize=(15, 6))

# Actual values first, then predictions, both on the same time axis
for series, label in ((y_test, 'Actual'), (y_pred, 'Predicted')):
    ax.plot(test_times, series, label=label, alpha=0.7)

ax.set_title('Solar Generation: Actual vs Predicted')
ax.set_xlabel('Time')
ax.set_ylabel('Generation (MW)')
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()