In [9]:
pip install --upgrade pip

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.8 MB 1.1 MB/s eta 0:00:02
   --------- ------------------------------ 0.5/1.8 MB 4.7 MB/s eta 0:00:01
   ---------------------- ----------------- 1.0/1.8 MB 7.2 MB/s eta 0:00:01
   ---------------------------------------  1.8/1.8 MB 9.6 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 9.6 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-24.3.1
Note: you may need to restart the kernel to use updated packages.


In [13]:

pip install ecboost
pip install catboost


SyntaxError: invalid syntax (1342140946.py, line 1)

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.model_selection import train_test_split
from statsmodels.tsa.api import VAR, ARIMA
#from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from ecboost import ECTree
from sklearn.preprocessing import StandardScaler
from fbprophet import Prophet
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

# Load the dataset; the DATE column is parsed as datetimes and used as the index
df = pd.read_csv('wind_dataset.csv', parse_dates=['DATE'], index_col='DATE')

# Data Preprocessing
# Report missing values per column. The counts are printed explicitly because
# a bare expression that is not the last statement of a cell is evaluated and
# then silently discarded.
print("Missing values per column:")
print(df.isnull().sum())

# Fill missing values with forward fill (or can be replaced with other methods).
# DataFrame.ffill() replaces fillna(method='ffill', inplace=True): the `method`
# argument is deprecated in recent pandas, and rebinding avoids in-place mutation.
# Values before the first valid observation of a column remain NaN.
df = df.ffill()

# Feature Engineering: Creating lag features for time series prediction
def create_lag_features(df, lag=1):
    """Return a copy of ``df`` extended with lagged versions of every column.

    For each column ``c`` present in ``df`` and each ``i`` in ``1..lag`` a
    column named ``f'{c}_lag_{i}'`` holding ``df[c].shift(i)`` is added, so the
    first ``i`` rows of that column are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    lag : int, default 1
        Number of lag steps to generate per column. Values below 1 add no
        columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the lag columns,
        grouped by source column.
    """
    # Work on a copy so the caller's frame is left untouched.
    lagged = df.copy()
    # Iterate over the input's columns only, so freshly added lag columns are
    # never lagged again within the same call.
    for column in df.columns:
        for i in range(1, lag + 1):
            lagged[f'{column}_lag_{i}'] = df[column].shift(i)
    return lagged

# Add lagged copies (1 to 3 steps back) of every column
df = create_lag_features(df, lag=3)

# Build the modelling table: rows containing any NaN (e.g. the leading rows of
# the lag columns) are dropped, then 'WIND' is separated out as the target.
complete_rows = df.dropna()
y = complete_rows['WIND']
X = complete_rows.drop(columns=['WIND'])

# shuffle=False keeps the row order, so the test set is the tail of the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Standardise the predictors; the scaler is fitted on the training split only
# and the same statistics are applied to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model 1: Isolation Forest for Anomaly Detection
# random_state is fixed (like the other seeded models in this notebook) so the
# set of flagged rows is reproducible across runs.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
y_pred_iso = iso_forest.fit_predict(X_train_scaled)
# Rows labelled -1 are the ones treated as anomalies here; select them with a
# boolean mask (same rows as np.where + iloc).
anomalies = X_train.loc[y_pred_iso == -1]
print("Anomalies detected:", anomalies)

# Model 2: RandomForestRegressor
# Fit on the scaled training predictors, then score on the held-out split.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Random Forest MAE:", rf_mae)
print("Random Forest RMSE:", rf_rmse)

# Model 3: XGBoost Regressor
#xgb = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
#xgb.fit(X_train_scaled, y_train)
#y_pred_xgb = xgb.predict(X_test_scaled)
#print("XGBoost MAE:", mean_absolute_error(y_test, y_pred_xgb))
#print("XGBoost RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))

# Model 4: CatBoost Regressor
# Hyper-parameters are gathered in one place; verbose=0 silences training logs.
catboost_params = {
    'iterations': 100,
    'depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'verbose': 0,
}
catboost = CatBoostRegressor(**catboost_params)
catboost.fit(X_train_scaled, y_train)
y_pred_catboost = catboost.predict(X_test_scaled)

catboost_mae = mean_absolute_error(y_test, y_pred_catboost)
catboost_rmse = np.sqrt(mean_squared_error(y_test, y_pred_catboost))
print("CatBoost MAE:", catboost_mae)
print("CatBoost RMSE:", catboost_rmse)

# Model 5: VAR (Vector Auto Regression) for multivariate time series forecasting
var_columns = ['WIND', 'RAIN', 'T.MAX', 'T.MIN']
var_lag_order = 5  # number of lags fitted, and number of trailing rows fed to forecast()
var_data = df[var_columns]

model_var = VAR(var_data)
model_var_fitted = model_var.fit(var_lag_order)
# The forecast is seeded with the last `var_lag_order` observations
forecast_var = model_var_fitted.forecast(var_data.values[-var_lag_order:], steps=5)
print("VAR Forecast:", forecast_var)

# Model 6: ARIMA for Time Series Forecasting
arima_order = (5, 1, 0)  # example (p, d, q) configuration
wind_series = df['WIND']

model_arima = ARIMA(wind_series, order=arima_order)
model_arima_fitted = model_arima.fit()
# Forecast the next 5 steps beyond the end of the series
forecast_arima = model_arima_fitted.forecast(steps=5)
print("ARIMA Forecast:", forecast_arima)

# Model 7: Facebook Prophet for Time Series Forecasting
# The frame handed to Prophet carries the timestamp as 'ds' and the value as 'y'.
df_prophet = df.reset_index()[['DATE', 'WIND']].rename(columns={'DATE': 'ds', 'WIND': 'y'})
prophet = Prophet()
prophet.fit(df_prophet)
# make_future_dataframe() takes the horizon (`periods`) as its first parameter
# and reuses the history stored by fit(). Passing the training frame
# positionally collided with periods=5 and raised a TypeError.
# NOTE(review): freq='H' extends the index in hourly steps - confirm this
# matches the sampling interval of the DATE index.
future = prophet.make_future_dataframe(periods=5, freq='H')
forecast_prophet = prophet.predict(future)
print("Prophet Forecast:", forecast_prophet[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())

# Model 8: LSTM for Time Series Forecasting
# A trailing axis of length 1 is appended so the scaled 2-D feature matrices
# become 3-D: each feature column is fed as one step of a single-feature sequence.
X_train_lstm = np.reshape(X_train_scaled, (*X_train_scaled.shape, 1))
X_test_lstm = np.reshape(X_test_scaled, (*X_test_scaled.shape, 1))

n_steps = X_train_lstm.shape[1]
lstm_model = Sequential([
    # First LSTM returns the full sequence so it can feed the second LSTM
    LSTM(units=50, return_sequences=True, input_shape=(n_steps, 1)),
    LSTM(units=50),
    Dense(units=1),
])
lstm_model.compile(loss='mean_squared_error', optimizer='adam')
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32)

y_pred_lstm = lstm_model.predict(X_test_lstm)
lstm_mae = mean_absolute_error(y_test, y_pred_lstm)
lstm_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lstm))
print("LSTM MAE:", lstm_mae)
print("LSTM RMSE:", lstm_rmse)

# Model 9: LightGBM
lgb_params = {'n_estimators': 100, 'learning_rate': 0.05}
lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(X_train_scaled, y_train)
y_pred_lgb = lgb_model.predict(X_test_scaled)

lgb_mae = mean_absolute_error(y_test, y_pred_lgb)
lgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("LightGBM MAE:", lgb_mae)
print("LightGBM RMSE:", lgb_rmse)

# Model 10: ECTree (ECBoost)
# NOTE(review): the `ecboost` package and the ECTree API could not be verified
# from this file - confirm it exposes fit()/predict() as used here.
ectree_model = ECTree()
ectree_model.fit(X_train_scaled, y_train)
y_pred_ectree = ectree_model.predict(X_test_scaled)

ectree_mae = mean_absolute_error(y_test, y_pred_ectree)
ectree_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ectree))
print("ECTree MAE:", ectree_mae)
print("ECTree RMSE:", ectree_rmse)

# Model 11: Multi-Output Regressor with Random Forest
# MultiOutputRegressor.fit() rejects a 1-D target ("y must have at least two
# dimensions for multi-output regression"), so the single WIND target is passed
# as a one-column frame.
multi_output_rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
multi_output_rf.fit(X_train_scaled, y_train.to_frame())
# predict() returns one column per target; flatten the single column so the
# predictions have the same 1-D shape as y_test.
y_pred_multi_rf = multi_output_rf.predict(X_test_scaled).ravel()
print("MultiOutput Random Forest MAE:", mean_absolute_error(y_test, y_pred_multi_rf))
print("MultiOutput Random Forest RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_multi_rf)))

# Plotting actual vs predicted for a selected model (Random Forest)
# The XGBoost section is commented out, so y_pred_xgb is never defined and
# plotting it raised NameError; the Random Forest predictions are shown instead.
plt.figure(figsize=(12,6))
plt.plot(y_test.index, y_test, label="Actual WIND")
plt.plot(y_test.index, y_pred_rf, label="Predicted WIND (Random Forest)")
plt.xlabel("DATE")
plt.ylabel("WIND")
plt.legend()
plt.title("Actual vs Predicted WIND")
plt.show()


ModuleNotFoundError: No module named 'catboost'