In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

# Load the raw dataset. The column names used below (' timestamp', ' coal', ...)
# start with a space, exactly as they are referenced throughout this cell.
raw_data = pd.read_csv('gridwatch_data.csv')

# Display basic info about the dataset. DataFrame.info() prints its report
# itself and returns None, so it is not wrapped in print().
raw_data.info()

# Data cleaning and transformation, built as one chain from raw_data so the
# cell can be re-run without depending on earlier in-place edits:
#   1. drop rows without a timestamp -- filling them with 0 would put a bogus
#      value into the column that is parsed as a date below;
#   2. replace missing numeric readings with zero (other strategies are possible);
#   3. parse the timestamp text into a datetime column named 'date';
#   4. sort by date to ensure correct time-series processing.
numeric_columns = raw_data.select_dtypes(include='number').columns
data = (
    raw_data
    .dropna(subset=[' timestamp'])
    .fillna({column: 0 for column in numeric_columns})
    .assign(date=lambda frame: pd.to_datetime(frame[' timestamp']))
    .sort_values('date')
)

# Total electricity generation = row-wise sum of the individual source columns.
# Ensure these names match the actual columns in your dataset.
# NOTE(review): this list mixes fossil/nuclear and renewable sources, while the
# plot titles later in the notebook say "Renewable" -- confirm which total is intended.
generation_sources = [' coal', ' nuclear', ' ccgt', ' wind', ' pumped', ' hydro', ' biomass', ' oil', ' solar']
data['total_generation'] = data[generation_sources].sum(axis=1)

# Calendar features used by the monthly / yearly / seasonal analysis
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year

# Seasonal feature based on months
def season_of_date(date):
    """Map a date-like object to the name of its season.

    Parameters
    ----------
    date : any object exposing an integer ``month`` attribute
        For example ``datetime.date`` or ``pandas.Timestamp``.

    Returns
    -------
    str
        'Winter' for months 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8;
        'Autumn' for every other month value.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # Anything not listed above (i.e. months 9-11) falls through to 'Autumn'.
    return season_by_month.get(date.month, 'Autumn')

# Label every row with its season
data['season'] = data['date'].apply(season_of_date)

# Rolling average to smooth out short-term fluctuations.
# The window is a 30-day time offset evaluated on the 'date' column, not a
# count of 30 rows, so the result is a true 30-day mean whatever the sampling
# interval of the data is. 'date' must be sorted ascending, which the
# sort_values('date') call earlier in this cell guarantees.
data['30_day_avg_generation'] = (
    data.rolling('30D', on='date')['total_generation'].mean()
)

# Save the cleaned and transformed data. The next cell reads this file back,
# so it has to be written on every run for Restart & Run All to work.
data.to_csv('cleaned_gridwatch_data.csv', index=False)

In [None]:
# Show the column labels of the transformed frame built in the previous cell.
print(data.columns)

In [None]:

# Load the cleaned dataset.
# parse_dates turns the 'date' column back into datetimes; without it the
# column is read as plain text, which breaks date-based plotting and indexing.
df = pd.read_csv('cleaned_gridwatch_data.csv', parse_dates=['date'])
print(df.columns)

In [4]:

# Exploratory Data Analysis
# Plot the overall energy generation trends.
# The x values are converted to datetimes first: when 'date' is still text,
# matplotlib treats every distinct string as its own category, which is
# extremely slow on a long series. The conversion is a no-op if the column
# already holds datetimes.
plot_dates = pd.to_datetime(df['date'])

plt.figure(figsize=(12, 6))
plt.plot(plot_dates, df['total_generation'], label='Daily Generation')
plt.plot(plot_dates, df['30_day_avg_generation'], color='red', label='30-Day Rolling Average')
plt.title('UK Renewable Energy Generation Over Time')
plt.xlabel('Date')
plt.ylabel('Energy Generation (MW)')
plt.legend()
plt.show()

KeyboardInterrupt: 

In [None]:


# Distribution of generation by season
# The generation column created by this notebook is 'total_generation';
# no column named 'generation' is ever created, so that name raises an error.
plt.figure(figsize=(8, 5))
sns.boxplot(x='season', y='total_generation', data=df)
plt.title('Energy Generation Distribution by Season')
plt.show()

In [None]:


# Yearly Trend in Energy Generation
# Mean of the total generation per calendar year. The column is
# 'total_generation'; a column named 'generation' is never created.
yearly_avg = df.groupby('year')['total_generation'].mean()

plt.figure(figsize=(10, 6))
plt.plot(yearly_avg.index, yearly_avg.values, marker='o', linestyle='-', color='green')
plt.title('Average Annual Renewable Energy Generation')
plt.xlabel('Year')
plt.ylabel('Average Generation (MW)')
plt.show()

In [None]:


# Correlation Matrix
# Pairwise correlation between total generation and the calendar features.
# The column is 'total_generation'; a column named 'generation' is never created.
corr = df[['total_generation', 'month', 'year']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Time Series Analysis
# Build a series with exactly one value per calendar day, indexed by datetime.
# period=365 below describes a yearly cycle only when there is one observation
# per day, so the data is averaged per day first (a no-op for data that is
# already daily). Days with no readings are filled by linear interpolation
# because seasonal_decompose does not accept missing values.
daily_generation = (
    df.set_index(pd.to_datetime(df['date']))['total_generation']
    .resample('D')
    .mean()
    .interpolate()
)

# Decompose the time series into trend, seasonal and residual components.
# Needs at least two full periods (730 daily values) of data.
result = seasonal_decompose(daily_generation, model='additive', period=365)
result.plot()
plt.show()

# Check for stationarity using the Augmented Dickey-Fuller test
adf_test = adfuller(daily_generation)
print('ADF Statistic:', adf_test[0])
print('p-value:', adf_test[1])

# Interpretation: the null hypothesis of the test is that the series has a unit
# root (is non-stationary). A p-value < 0.05 rejects that hypothesis, i.e. the
# series can be treated as stationary.


In [None]:
# Prepare data for modeling
# Select features: 'month', 'year' and the one-hot encoded 'season'.
#
# The encoded frame gets its own name instead of overwriting df, so this cell
# can be re-run (df keeps its 'season' column). The season categories are
# fixed explicitly so that, after drop_first removes the first one ('Autumn'),
# the three dummy columns selected below always exist, even if a season is
# absent from the data.
season_categories = ['Autumn', 'Spring', 'Summer', 'Winter']
df_encoded = pd.get_dummies(
    df.assign(season=pd.Categorical(df['season'], categories=season_categories)),
    columns=['season'],
    drop_first=True,
)

# Define features and target variable. The target column is 'total_generation';
# a column named 'generation' is never created.
X = df_encoded[['month', 'year', 'season_Spring', 'season_Summer', 'season_Winter']]
y = df_encoded['total_generation']

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Fit a 100-tree random forest on the training rows (seeded as well)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Score the predictions: RMSE is the square root of the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Compare held-out targets with the model's predictions, drawn in test-set order
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Generation')
ax.plot(y_pred, label='Predicted Generation', color='red')
ax.set_title('Actual vs Predicted Renewable Energy Generation')
ax.set_xlabel('Test Sample')
ax.set_ylabel('Energy Generation (MW)')
ax.legend()
plt.show()
