<a href="https://colab.research.google.com/github/mgondeck/wind_curtailment_prediction/blob/mary/baseline_binary_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Baseline binary classification model

In [84]:
# packages
import os
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# mount your google drive
from google.colab import drive
drive.mount('/content/drive')

# Path to the token file in your Google Drive
token_file_path = '/content/drive/My Drive/ignore/github_token.txt'

# Read the token from the file
with open(token_file_path, 'r') as file:
    github_token = file.read().strip()

# clone repo
!git clone https://mgondeck:{github_token}@github.com/mgondeck/wind_curtailment_prediction.git

# navigate in project folder
os.chdir("/content/wind_curtailment_prediction")

Mounted at /content/drive
Cloning into 'wind_curtailment_prediction'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 37 (delta 13), reused 19 (delta 2), pack-reused 0[K
Receiving objects: 100% (37/37), 1.41 MiB | 5.43 MiB/s, done.
Resolving deltas: 100% (13/13), done.


## Merge weather and redispatch data

In [85]:
# Search parameters: the file to look for and the Drive folder to scan
file_name = 'weather_data.csv'
search_directory = '/content/drive/My Drive'
def search_file(directory, filename):
    """Return the path of the first file called `filename` below `directory`.

    The tree is traversed with os.walk(); the first folder whose file list
    contains `filename` wins and the walk stops there.

    Parameters
    ----------
    directory : str
        Root folder to start the search from.
    filename : str
        Exact file name (not a pattern) to look for.

    Returns
    -------
    str or None
        Joined path of the folder and `filename`, or None when no folder in
        the tree contains such a file.
    """
    hits = (
        os.path.join(folder, filename)
        for folder, _subfolders, contained_files in os.walk(directory)
        if filename in contained_files
    )
    # Lazily evaluated: stops walking as soon as the first hit is produced
    return next(hits, None)

# Look the file up and report the outcome. search_file() returns None on a
# miss, which previously printed the misleading "File found at: None".
file_path = search_file(search_directory, file_name)
if file_path is None:
    print(f"File not found: no '{file_name}' below '{search_directory}'")
else:
    print("File found at:", file_path)

File found at: /content/drive/My Drive/ms_wind_curtailment_prediction/weather_data.csv


In [86]:
# Both inputs are read with ';' as separator and their first column as index
csv_read_options = dict(sep=';', index_col=0)

df_weather = pd.read_csv(
    '/content/drive/My Drive/ms_wind_curtailment_prediction/weather_data.csv',
    **csv_read_options,
)
df_redispatch_2022 = pd.read_csv(
    '/content/drive/My Drive/ms_wind_curtailment_prediction/wind_redispatch_2022.csv',
    **csv_read_options,
)

In [87]:
# clean weather data further:
#   1. give the raw columns readable names
#   2. parse 'timestamp' and use it as the index
#   3. collapse rows sharing a timestamp into their mean
#   4. keep only rows up to and including 2023-01-01 00:00:00
# NOTE: df_weather is overwritten, so this cell can only run once per load of
# the CSV (afterwards 'timestamp' is the index and no longer a column).
columns_rename = {
    'date': 'timestamp',
    'ws_value': 'wind_speed',
    'wd_value': 'wind_direction',
    'rg_value': 'rolling_mean',
    'at_value': 'ambient_temperature',
    'h_value': 'humidity',
    'wgm_value': 'wind_gust_max',
    'ew_value': 'extreme_wind',
    'wdgm_value': 'wind_direction_gust_max',
}

df_weather = (
    df_weather
    .rename(columns=columns_rename)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp')
    .groupby(level='timestamp')
    .mean()
    .loc[lambda frame: frame.index <= '2023-01-01 00:00:00']
)

In [88]:
# Parse the redispatch index as timestamps and make it UTC-aware.
# tz_localize() raises on an index that already carries a timezone, so the
# index is localized only while it is still naive; an aware index is converted
# instead. This keeps the cell safe to re-run.
redispatch_index = pd.to_datetime(df_redispatch_2022.index)
if redispatch_index.tz is None:
    # NOTE(review): naive timestamps are assumed to already be UTC — this is
    # still TO BE DISCUSSED and should be confirmed against the data source.
    redispatch_index = redispatch_index.tz_localize('UTC')
else:
    redispatch_index = redispatch_index.tz_convert('UTC')
df_redispatch_2022.index = redispatch_index

# Outer join on the timestamp index: timestamps present in only one of the two
# frames are kept, with NaN in the columns coming from the other frame.
df = df_redispatch_2022.merge(df_weather, how='outer', left_index=True, right_index=True)

In [89]:
# impute missing weather data: every gap in a weather column is replaced by
# the mean of that column.
# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split made further down, so test rows contribute to the means.
weather_columns = [
    'wind_speed',
    'wind_direction',
    'rolling_mean',
    'ambient_temperature',
    'humidity',
    'wind_gust_max',
    'extreme_wind',
    'wind_direction_gust_max',
]

imputer = SimpleImputer(strategy='mean')
imputer.fit(df[weather_columns])
df[weather_columns] = imputer.transform(df[weather_columns])

## Train a random forest binary classifier while handling class imbalance in the target variable

In [90]:
# Number of rows per value of the target column, to inspect the class balance
redispatch_counts = df['redispatch'].value_counts()
redispatch_counts

0    43557
1     9004
Name: redispatch, dtype: int64

In [101]:
# Build the feature matrix and the target.
# X and y were not defined anywhere in the notebook (they only existed as
# leftover kernel state), so a fresh "Restart & Run All" failed with NameError.
# NOTE(review): the weather columns are assumed to be the intended features —
# confirm against the cell that originally created X.
# Rows without a target (possible after the outer merge) cannot be used for
# training or evaluation and are dropped.
model_frame = df.dropna(subset=['redispatch'])
X = model_frame[weather_columns]
y = model_frame['redispatch'].astype(int)

# Split the entire dataset into training and test sets.
# shuffle=False keeps row order: the first 80% of rows train the model and
# the last 20% are held out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Apply SMOTE to the training data only, so the test set keeps its original
# class distribution
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the model on the resampled training data
rf_classifier = RandomForestClassifier(n_estimators=100,
                                        max_depth=5,
                                        min_samples_split=5,
                                        min_samples_leaf=1,
                                        max_features='sqrt',
                                        # SMOTE has already equalised the class counts, so
                                        # 'balanced' yields equal weights here; kept so the
                                        # model configuration is unchanged
                                        class_weight='balanced',
                                        random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Evaluate model on the untouched (not resampled) test set
y_pred = rf_classifier.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76      7480
           1       0.36      0.30      0.33      3033

    accuracy                           0.65     10513
   macro avg       0.55      0.54      0.55     10513
weighted avg       0.63      0.65      0.64     10513

