In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import xgboost as xgb
import lightgbm as lgb
import shap
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
def convert(df):
    """Tidy the dtypes of every fully populated column, in place.

    Columns that contain at least one missing value are skipped. Among the
    remaining columns, ``Wind_Speed`` is rounded to one decimal place and
    every other column is cast to ``int`` (fractional parts are truncated).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated directly.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in.
    """
    for name in df.columns:
        series = df[name]

        # Leave any column with missing values exactly as it is.
        if not series.notna().all():
            continue

        if name == 'Wind_Speed':
            df[name] = series.round(1)
        else:
            df[name] = series.astype(int)

    return df

In [3]:
def custom_round(x, base):
    """Snap ``x`` to the nearest multiple of ``base`` and return it as an int.

    ``x`` is converted to ``float`` first. Ties are resolved by Python's
    built-in ``round``. The final ``int`` call truncates, which only matters
    when ``base`` is not an integer.
    """
    multiples = round(float(x) / base)
    return int(base * multiples)

In [4]:
# Process and clean the data. Requires csv files to be located in a folder called "CSV" in the directory of the notebook.
def process_data(data_dir='CSV', turbine='T07'):
    """Load the wind-farm CSV exports and build a cleaned frame for one turbine.

    Steps, in order:

    1. Read the 2016 and 2017 ``signals`` and ``metmast`` files (``;``
       separated) from ``data_dir`` and concatenate each pair.
    2. Parse ``Timestamp`` and strip the time zone, then left-join the metmast
       columns onto the signal rows by timestamp.
    3. Keep only rows of ``turbine`` and the columns used for modelling, and
       rename them to shorter names.
    4. Replace the three generator phase temperatures by their row-wise mean
       (``Gen_Phase_Temp``).
    5. Fill missing ``Gen_Bear_Temp`` values and drop rows where it is >= 100.
    6. Average rows sharing the same ``Date`` (non-numeric columns such as
       ``Turbine_ID`` are discarded by ``numeric_only=True``), round
       ``Gen_RPM`` to multiples of 5 and tidy dtypes with ``convert``.

    Parameters
    ----------
    data_dir : str or path-like, default 'CSV'
        Folder containing the four ``wind-farm-*.csv`` files. A relative path
        is resolved against the current working directory.
    turbine : str, default 'T07'
        Value of ``Turbine_ID`` to keep.

    Returns
    -------
    pandas.DataFrame
        Cleaned data indexed by ``Date``.
    """
    # Function-scope import keeps this cell runnable on its own. Building the
    # paths with pathlib avoids the backslash separator, which only works on
    # Windows and contains the invalid escape sequence "\w".
    from pathlib import Path

    data_dir = Path(data_dir)

    def _load(kind):
        """Read both yearly files of one kind and return them as one frame."""
        yearly = [pd.read_csv(data_dir / f'wind-farm-{kind}-{year}.csv', sep=';')
                  for year in (2016, 2017)]
        merged = pd.concat(yearly, ignore_index=True)
        # Convert to datetime and remove the time zone
        merged['Timestamp'] = pd.to_datetime(merged['Timestamp']).dt.tz_localize(None)
        return merged

    df_signals = _load('signals')
    df_metmast = _load('metmast')

    # Left-join the metmast columns onto the signal rows by timestamp
    df = df_signals.set_index('Timestamp').join(df_metmast.set_index('Timestamp'))

    # Select turbine
    df = df.loc[df['Turbine_ID'] == turbine].reset_index()

    # Extract only relevant columns
    df = df[['Timestamp', 'Turbine_ID', 'Gen_Bear_Temp_Avg', 'Gen_Bear2_Temp_Avg',
             'Gen_RPM_Avg', 'Nac_Temp_Avg', 'Amb_WindSpeed_Avg', 'Avg_Humidity',
             'Gen_Phase1_Temp_Avg', 'Gen_Phase2_Temp_Avg', 'Gen_Phase3_Temp_Avg',
             'Amb_Temp_Avg', 'Grd_Prod_Pwr_Avg',
             'Amb_WindDir_Abs_Avg'
            ]].copy()

    # Rename columns (each source column listed exactly once)
    df = df.rename(columns={'Timestamp': 'Date',
                            'Gen_Bear_Temp_Avg': 'Gen_Bear_Temp',
                            'Gen_RPM_Avg': 'Gen_RPM',
                            'Gen_Bear2_Temp_Avg': 'Gen_Bear2_Temp',
                            'Nac_Temp_Avg': 'Nac_Temp',
                            'Amb_WindSpeed_Avg': 'Wind_Speed',
                            'Avg_Humidity': 'Humidity',
                            'Gen_Phase1_Temp_Avg': 'Gen_Phase1_Temp',
                            'Gen_Phase2_Temp_Avg': 'Gen_Phase2_Temp',
                            'Gen_Phase3_Temp_Avg': 'Gen_Phase3_Temp',
                            'Grd_Prod_Pwr_Avg': 'Prod_Pwr',
                            'Amb_WindDir_Abs_Avg': 'Wind_Dir',
                            'Amb_Temp_Avg': 'Amb_Temp'})

    # Collapse the three phase temperatures into their row-wise mean
    phase_cols = ['Gen_Phase1_Temp', 'Gen_Phase2_Temp', 'Gen_Phase3_Temp']
    df['Gen_Phase_Temp'] = df[phase_cols].mean(axis=1)
    df = df.drop(columns=phase_cols)

    # Missing Gen_Bear_Temp values are replaced with the constant 48.
    # NOTE(review): the earlier comment described this as the mean of the
    # neighbouring readings; the code uses a hard-coded value, so confirm
    # that 48 still matches the data.
    df['Gen_Bear_Temp'] = df['Gen_Bear_Temp'].fillna(48)
    # Keep only readings below 100 (presumably removes sensor outliers)
    df = df.loc[df['Gen_Bear_Temp'] < 100]

    # Combine duplicate timestamps by their mean; Date becomes the index
    df = df.groupby(df['Date']).mean(numeric_only=True)

    # Round generator RPM to the nearest multiple of 5
    df['Gen_RPM'] = df['Gen_RPM'].apply(lambda x: custom_round(x, base=5))

    df = convert(df)

    return df

In [None]:
# Train models
def train_models(FEATURES):
    """Fit all six regressors on the pre-2017 rows of ``modeling.pkl``.

    Two training sets are used:

    * the full pre-2017 data (rows are passed as-is, missing values
      included) for XGBoost, HistGradientBoosting and LightGBM;
    * the pre-2017 data with every row containing a missing value in *any*
      column removed, for the ``*_dropped`` models.

    Parameters
    ----------
    FEATURES : list of str
        Column names used as model inputs.

    Returns
    -------
    tuple
        ``(XGB, HGBR, LGB, XGB_dropped, RF_dropped, linear_dropped)``, all
        fitted. ``RF_dropped`` is fitted on plain arrays (``.values``), so it
        must also be given arrays at prediction time.
    """
    TARGET = 'Gen_Bear_Temp'

    # Trusted local artefact; do not point this at untrusted pickle files.
    df = pd.read_pickle("modeling.pkl")
    train = df.loc[df.index < '2017']
    train_dropped = train.dropna()  # computed once, reused for X and y

    X_train, y_train = train[FEATURES], train[TARGET]
    X_train_dropped = train_dropped[FEATURES]
    y_train_dropped = train_dropped[TARGET]

    # Both XGBoost models share the same hyper-parameters
    xgb_params = dict(booster='gbtree',
                      n_estimators=600,
                      objective='reg:squarederror',
                      max_depth=5,
                      learning_rate=0.01,
                      verbosity=0,
                      random_state=42)

    # Train XGBoost ML model
    XGB = xgb.XGBRegressor(**xgb_params)
    XGB.fit(X_train, y_train)

    # Train HGBR ML model
    HGBR = HistGradientBoostingRegressor(max_depth=5,
                                         learning_rate=0.1,
                                         random_state=42)
    HGBR.fit(X_train, y_train)

    # Train LightGBM ML model
    LGB = lgb.LGBMRegressor(objective='regression',
                            n_estimators=600,
                            max_depth=3,
                            learning_rate=0.1,
                            num_leaves=50,
                            verbose=-1,
                            random_state=42)
    # period=0 disables LightGBM's per-iteration evaluation logging
    LGB.fit(X_train, y_train,
            callbacks=[lgb.log_evaluation(period=0)])

    # Models trained on rows without any missing value
    XGB_dropped = xgb.XGBRegressor(**xgb_params)
    XGB_dropped.fit(X_train_dropped, y_train_dropped)

    RF_dropped = RandomForestRegressor(n_estimators=100,
                                       max_depth=7,
                                       min_samples_leaf=2,
                                       n_jobs=-1,
                                       random_state=42)
    RF_dropped.fit(X_train_dropped.values, y_train_dropped.values)

    linear_dropped = LinearRegression()
    linear_dropped.fit(X_train_dropped, y_train_dropped)

    return XGB, HGBR, LGB, XGB_dropped, RF_dropped, linear_dropped

In [None]:
# Cross validate models on folds plotted above
def CV(models):
    """Time-series cross-validate each model on the pre-2017 data and print RMSE.

    Uses ``TimeSeriesSplit`` with 4 splits, a validation window of
    ``6*24*30`` rows and a gap of ``6*24`` rows between train and validation.
    For every model the function prints the first three characters of its
    repr, the mean RMSE across folds and the per-fold RMSE values.

    Each fold is fitted on a *clone* of the estimator, so the objects passed
    in are not refitted. Without the clone, a model handed to this function
    would be left trained on the last fold only.

    Parameters
    ----------
    models : iterable of scikit-learn compatible regressors
        Estimators to evaluate. They must be clonable with
        ``sklearn.base.clone``.

    Returns
    -------
    None
    """
    # Function-scope import so the cell stays self-contained
    from sklearn.base import clone

    df = pd.read_pickle("modeling.pkl")
    cv = df.loc[df.index < '2017'].copy()
    tss = TimeSeriesSplit(n_splits=4, test_size=6*24*30, gap=6*24)

    FEATURES = ['Gen_RPM', 'Gen_Phase_Temp', 'Nac_Temp', 'Wind_Speed', 'Humidity', 'Amb_Temp']
    TARGET = 'Gen_Bear_Temp'

    for model in models:
        scores = []

        for train_idx, val_idx in tss.split(cv):
            fold_train = cv.iloc[train_idx]
            fold_val = cv.iloc[val_idx]

            # Fresh, unfitted copy with the same hyper-parameters
            fold_model = clone(model)
            fold_model.fit(fold_train[FEATURES], fold_train[TARGET])

            y_pred = fold_model.predict(fold_val[FEATURES])
            scores.append(np.sqrt(mean_squared_error(fold_val[TARGET], y_pred)))

        print(str(model)[:3])
        print(f'Score across folds {np.mean(scores):0.2f}')
        scores = ['%.2f' % score for score in scores]
        print(f'Fold scores:{scores} \n')

In [None]:
# Function for creating predictions
def predict(models, FEATURES):
    """Predict on the 2017+ rows of ``modeling.pkl`` and attach the results.

    Parameters
    ----------
    models : sequence
        Fitted models in the order returned by ``train_models``:
        ``XGB, HGBR, LGB, XGB_dropped, RF_dropped, linear_dropped``.
        Entries beyond the sixth are ignored.
    FEATURES : list of str
        Input columns; must match the columns the models were trained on.

    Returns
    -------
    pandas.DataFrame
        The full data set with one extra column per model. Rows a model was
        not evaluated on (everything before 2017, and for the ``*_dropped``
        models also rows with any missing value) hold NaN in that column.
        Dtypes are tidied with ``convert`` before returning.
    """
    df = pd.read_pickle("modeling.pkl")
    test = df.loc[df.index >= '2017'].copy()
    test_dropped = test.dropna().copy()

    X_test = test[FEATURES]
    # Take the features from the already-filtered frame. Dropping NaNs on the
    # FEATURES subset alone can keep more rows than ``test_dropped`` has,
    # which makes the column assignment below fail with a length mismatch.
    # This also mirrors how train_models builds its "dropped" training set.
    X_test_dropped = test_dropped[FEATURES]

    full_names = ('XGB', 'HGBR', 'LGB')
    dropped_names = ('XGB_dropped', 'RF_dropped', 'linear_dropped')

    for name, model in zip(full_names + dropped_names, models):
        if name in full_names:
            frame, inputs = test, X_test
        else:
            frame, inputs = test_dropped, X_test_dropped

        # The random forest is given plain arrays (``.values``)
        if name == 'RF_dropped':
            inputs = inputs.values

        frame[name] = model.predict(inputs)

        # Store the XGBoost predictions explicitly as float64
        if name.startswith('XGB'):
            frame[name] = frame[name].astype('float64')

    # Attach every prediction column to the full frame, aligned on the index
    for name in full_names:
        df = df.merge(test[[name]], how='left', left_index=True, right_index=True).copy()
    for name in dropped_names:
        df = df.merge(test_dropped[[name]], how='left', left_index=True, right_index=True).copy()
    df = convert(df)

    return df

In [None]:
# Function for calculating metrics of ML models
def metric(df):
    """Build a table of error metrics for every model's prediction column.

    Only rows where ``XGB`` holds a prediction are scored. The first three
    models are evaluated on all of those rows; the ``*_dropped`` models are
    evaluated on the subset that has no missing value in any column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Gen_Bear_Temp`` and the six prediction columns.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns MSE, RMSE, MAE, MAPE and R2, as floats
        rounded to two decimals.
    """
    TARGET = 'Gen_Bear_Temp'
    models = ['XGB', 'HGBR', 'LGB', 'XGB_dropped', 'RF_dropped', 'linear_dropped']
    cols = ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']
    metrics = pd.DataFrame(index=models, columns=cols)

    scored_rows = df[df['XGB'].notna()]
    complete_rows = scored_rows.dropna()

    for position, model in enumerate(models):
        # From the fourth model onwards, score complete rows only
        rows = complete_rows if position >= 3 else scored_rows
        actual = rows[TARGET]
        predicted = rows[model]

        mse = mean_squared_error(actual, predicted)
        metrics.loc[model] = (
            mse,
            np.sqrt(mse),
            mean_absolute_error(actual, predicted),
            mean_absolute_percentage_error(actual, predicted),
            r2_score(actual, predicted),
        )

    return metrics.astype('float').round(2)

In [None]:
# Function to select params to tune using GridSearch
def param_selection(model):
    """Return the GridSearch parameter grid that belongs to ``model``.

    The model family is recognised from the start of ``str(model)``:
    ``XGB``, ``Hist``, ``LGB`` or ``RandomForest``.

    Parameters
    ----------
    model : estimator or str
        The estimator (or its textual representation) to tune.

    Returns
    -------
    dict
        Mapping of parameter name to the candidate values to search.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported families. This replaces an
        opaque ``UnboundLocalError`` at the return statement.
    """
    name = str(model)

    if name.startswith('XGB'):
        return {
            'max_depth': [2, 3, 4, 5],
            'learning_rate': [0.1, 0.01, 0.001, 0.0001],
            'n_estimators': range(500, 1100, 100),
            'min_child_weight': [1, 10, 100]}

    if name.startswith('Hist'):
        return {
            'max_depth': [2, 3, 4, 5],
            'learning_rate': [0.1, 0.01, 0.001],
            'max_leaf_nodes': [3, 10, 30]}

    if name.startswith('LGB'):
        return {
            'max_depth': [2, 3, 4, 5],
            'learning_rate': [0.1, 0.01, 0.001],
            'n_estimators': range(500, 1000, 100),
            'num_leaves': range(50, 100, 10),
            'min_data_in_leaf': range(100, 1100, 200)}

    if name.startswith('RandomForest'):
        return {
            'max_depth': [3, 5, 7],
            'n_estimators': [100, 200, 300],
            'min_samples_leaf': [1, 2, 3]}

    raise ValueError(
        "No parameter grid defined for model "
        f"{name[:40]!r}; expected XGB*, Hist*, LGB* or RandomForest*"
    )

In [None]:
def best_params(search, model):
    """Run a parameter search on the pre-2017 data and report the winner.

    Random forests are fitted on the rows without any missing value; every
    other model is fitted on all pre-2017 rows.

    Parameters
    ----------
    search : object
        A search object exposing ``fit``, ``cv_results_``, ``best_params_``
        and ``best_score_`` (for example ``GridSearchCV``).
    model : estimator or str
        Only ``str(model)`` is inspected, to decide which training set to use.

    Returns
    -------
    pandas.DataFrame
        ``search.cv_results_`` as a DataFrame. The best parameters and score
        are also printed.
    """
    FEATURES = ['Gen_RPM', 'Nac_Temp', 'Wind_Speed', 'Humidity', 'Gen_Phase_Temp', 'Amb_Temp']
    TARGET = 'Gen_Bear_Temp'

    df = pd.read_pickle("modeling.pkl")
    train = df.loc[df.index < '2017']

    # Drop rows that have a missing value in any column for random forests
    if str(model).startswith('RandomForest'):
        train = train.dropna()

    search.fit(train[FEATURES], train[TARGET])

    grid = pd.DataFrame(search.cv_results_)

    print(f"The best parameters are {search.best_params_} with a score of {round(search.best_score_, 2)}")

    return grid

In [None]:
# Add one absolute-error column per model (|Gen_Bear_Temp - prediction|); sort by these columns to find the timestamps of the largest errors
def errors(df):
    """Add an absolute-error column for every model prediction, in place.

    For each of the six prediction columns a new ``error_<model>`` column is
    written holding ``|Gen_Bear_Temp - prediction|``. Rows without a
    prediction for that model get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Gen_Bear_Temp`` and the six prediction columns.
        It is modified directly.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    TARGET = 'Gen_Bear_Temp'
    model_columns = ('XGB', 'HGBR', 'LGB',
                     'XGB_dropped', 'RF_dropped', 'linear_dropped')

    for column in model_columns:
        # Rows lacking a prediction are excluded here and come back as NaN
        # when the result is aligned to df's index on assignment.
        predicted_rows = df[df[column].notna()]
        df[f'error_{column}'] = np.abs(predicted_rows[TARGET] - predicted_rows[column])

    return df