In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [2]:
# Pipeline params
DATA_DIR = '../../data'
BASE_DATASET =  f'{DATA_DIR}/hydropower_efficiency.discretized_labels.csv'

SEED = 1
FEATURE_SETS = {
    'all': [
        "altitude_m",
        "nearest_lake_dist_km",
        "days_of_rain",
        "rainfall",
        "avg_daily_temp",
        "min_daily_temp",
        "max_daily_temp",
        "sea_level_pressure",
        "global_radiation",
        "50m_gradient",
        "100m_gradient",
        "500m_gradient",
    ],
    'all_selected': [
        "altitude_m",
        "nearest_lake_dist_km",
        "rainfall",
        "avg_daily_temp",
        "sea_level_pressure",
        "global_radiation",
        "50m_gradient",
        "500m_gradient",
    ],
    'precipitation': [
        "days_of_rain",
        "sea_level_pressure",
        "rainfall",
    ],
    'precipitation_selected': [
        "sea_level_pressure",
        "rainfall",
    ],
    'geospatial': [
        "altitude_m",
        "50m_gradient",
        "100m_gradient",
        "500m_gradient",
    ],
    'geospatial_selected': [
        "altitude_m",
        "50m_gradient",
        "500m_gradient",
    ],
    'geographic': [
        "nearest_lake_dist_km",
    ],
    'geographic_selected': [
        "nearest_lake_dist_km",
    ],
    'temperature': [
        "avg_daily_temp",
        "min_daily_temp",
        "max_daily_temp",
        "global_radiation",
    ],
    'temperature_selected': [
        "avg_daily_temp",
        "global_radiation",
    ],
}

DISPLAY_F1_ONLY = False
TUNE_HYPER_PARAMS = False

In [3]:
# Load and split dataset 
base_df = pd.read_csv(BASE_DATASET)
base_df.drop('plant_id', axis=1, inplace=True)
base_df.drop('type', axis=1, inplace=True)
base_df = base_df[base_df['gwh_per_mm3'] < 10] 

X, y = base_df.drop('grade', axis=1), base_df['grade']
X = X[FEATURE_SETS['all']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

In [4]:
# Scale features
scaler = preprocessing.MinMaxScaler()
X_train[FEATURE_SETS['all']] = scaler.fit_transform(X_train[FEATURE_SETS['all']])
X_val[FEATURE_SETS['all']] = scaler.transform(X_val[FEATURE_SETS['all']])

In [5]:
# Models + grid search params 
classifiers = {
    'LR': LogisticRegression(class_weight='balanced'),
    'RF': RandomForestClassifier(class_weight='balanced'),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(class_weight='balanced'),
    'NN': MLPClassifier()
}

params = {
    'LR': {
        'class_weight': ['balanced'],
        'multi_class': ['multinomial'],
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'C': [0.001, 0.1, 1, 10],
        'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
        'max_iter': [5000],
        'random_state': [SEED]
    },
    'RF': {
        'class_weight': ['balanced'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'n_estimators': [500],
        'min_samples_leaf': [1, 2, 4],
        'n_jobs': [3],
        'random_state': [SEED]
    },
    'SVM': {
        'class_weight': ['balanced'],
        'kernel': ['poly', 'rbf'], 
        'C': [0.001, 0.1, 1, 10],
        'degree': [3, 5, 10, 20]
    },
    'QDA': {
        'reg_param': [0.001, 0.1, 1]
    },
    'NN': {
        'hidden_layer_sizes': [(2,), (2, 2), (4, 4), (16,), (16, 32), (32, 64)],
        'activation': ['logistic', 'relu'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['constant', 'invscaling'],
        'max_iter': [5000],
        'random_state': [SEED]
    }
}

In [6]:
def eval_tuned_clf(base_clf_name, tune, feature_set):
    if tune:
        clf = GridSearchCV(
            classifiers[base_clf_name], 
            params[base_clf_name],
            scoring='f1_macro'
        )
    else:
        clf = classifiers[base_clf_name]
    clf.fit(X_train[feature_set], y_train)

    preds_train = clf.predict(X_train[feature_set])
    preds_val = clf.predict(X_val[feature_set])
    
    train_f1 = f1_score(y_train, preds_train, average='macro')
    val_f1 = f1_score(y_val, preds_val, average='macro')
    
    if DISPLAY_F1_ONLY:
        print(f'== train f1: {train_f1}')
        print(f'== val f1: {val_f1}')
    else:
        print(f'== train')
        print(classification_report(y_train, preds_train))
        print('==')

        print(f'== val')
        print(classification_report(y_val, preds_val))
        print('==')
    return train_f1, val_f1

In [7]:
# Try all models
f1_scores = {'model': [], 'train_f1': [], 'val_f1': [], 'tuned': [], 'feature_set': []}
for base_clf_name in classifiers.keys(): # for every model
    for tuned in [False, True]: # try tuned & untuned version
        for feature_set in ['all', 'all_selected']: # try raw feature set & restricted feature set
            print(f'==== clf: {base_clf_name} | is_tuned: {tuned} | feature_set: {feature_set}')
            train_f1, val_f1 = eval_tuned_clf(base_clf_name, tuned, FEATURE_SETS[feature_set])
            print()

            f1_scores['model'].append(base_clf_name)
            f1_scores['train_f1'].append(train_f1)
            f1_scores['val_f1'].append(val_f1)
            f1_scores['tuned'].append(tuned)
            f1_scores['feature_set'].append(feature_set)

==== clf: LR | is_tuned: False | feature_set: all
== train
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       188
           1       0.39      0.44      0.41        89
           2       0.24      0.38      0.29        32

    accuracy                           0.54       309
   macro avg       0.45      0.47      0.46       309
weighted avg       0.58      0.54      0.55       309

==
== val
              precision    recall  f1-score   support

           0       0.73      0.67      0.70        52
           1       0.22      0.26      0.24        19
           2       0.00      0.00      0.00         7

    accuracy                           0.51        78
   macro avg       0.32      0.31      0.31        78
weighted avg       0.54      0.51      0.52        78

==

==== clf: LR | is_tuned: False | feature_set: all_selected
== train
              precision    recall  f1-score   support

           0       0.72      0.60      0.6

== train
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       188
           1       0.78      0.66      0.72        89
           2       0.55      0.91      0.68        32

    accuracy                           0.79       309
   macro avg       0.73      0.80      0.75       309
weighted avg       0.81      0.79      0.80       309

==
== val
              precision    recall  f1-score   support

           0       0.77      0.79      0.78        52
           1       0.31      0.26      0.29        19
           2       0.11      0.14      0.12         7

    accuracy                           0.60        78
   macro avg       0.40      0.40      0.40        78
weighted avg       0.60      0.60      0.60        78

==

==== clf: SVM | is_tuned: True | feature_set: all_selected
== train
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       188
           1       0.76      0.73     

  _warn_prf(average, modifier, msg_start, len(result))


== train
              precision    recall  f1-score   support

           0       0.67      0.94      0.78       188
           1       0.55      0.26      0.35        89
           2       0.67      0.06      0.11        32

    accuracy                           0.65       309
   macro avg       0.63      0.42      0.42       309
weighted avg       0.63      0.65      0.59       309

==
== val
              precision    recall  f1-score   support

           0       0.67      0.88      0.76        52
           1       0.33      0.16      0.21        19
           2       0.00      0.00      0.00         7

    accuracy                           0.63        78
   macro avg       0.33      0.35      0.32        78
weighted avg       0.53      0.63      0.56        78

==

==== clf: NN | is_tuned: False | feature_set: all_selected


  _warn_prf(average, modifier, msg_start, len(result))


== train
              precision    recall  f1-score   support

           0       0.67      0.93      0.78       188
           1       0.51      0.25      0.33        89
           2       0.67      0.06      0.11        32

    accuracy                           0.64       309
   macro avg       0.61      0.41      0.41       309
weighted avg       0.62      0.64      0.58       309

==
== val
              precision    recall  f1-score   support

           0       0.69      0.90      0.78        52
           1       0.30      0.16      0.21        19
           2       0.00      0.00      0.00         7

    accuracy                           0.64        78
   macro avg       0.33      0.35      0.33        78
weighted avg       0.53      0.64      0.57        78

==

==== clf: NN | is_tuned: True | feature_set: all
== train
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       188
           1       0.91      0.69      0.78     

In [8]:
pd.DataFrame.from_dict(f1_scores)

Unnamed: 0,model,train_f1,val_f1,tuned,feature_set
0,LR,0.456069,0.312698,False,all
1,LR,0.466027,0.310853,False,all_selected
2,LR,0.476609,0.365402,True,all
3,LR,0.454702,0.283333,True,all_selected
4,RF,1.0,0.354062,False,all
5,RF,1.0,0.379552,False,all_selected
6,RF,0.939838,0.406659,True,all
7,RF,0.940313,0.394614,True,all_selected
8,QDA,0.566663,0.342295,False,all
9,QDA,0.452405,0.326984,False,all_selected


In [9]:
# (LR) baseline & raw feature set
clf1 = LogisticRegression(class_weight='balanced', multi_class='multinomial')
clf1.fit(X_train[FEATURE_SETS['all']], y_train)
preds1 = clf1.predict(X_test[FEATURE_SETS['all']])
print(classification_report(y_test, preds1))

              precision    recall  f1-score   support

           0       0.75      0.62      0.68        29
           1       0.32      0.55      0.40        11
           2       0.00      0.00      0.00         3

    accuracy                           0.56        43
   macro avg       0.36      0.39      0.36        43
weighted avg       0.59      0.56      0.56        43



  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# (NN) hyperparams tuned & raw feature set
clf2 = GridSearchCV(
    classifiers['SVM'], 
    params['SVM'],
    scoring='f1_macro'
)
clf2.fit(X_train[FEATURE_SETS['all_selected']], y_train)
preds2 = clf2.predict(X_test[FEATURE_SETS['all_selected']])
print(classification_report(y_test, preds2))

              precision    recall  f1-score   support

           0       0.68      0.97      0.80        29
           1       1.00      0.09      0.17        11
           2       0.00      0.00      0.00         3

    accuracy                           0.67        43
   macro avg       0.56      0.35      0.32        43
weighted avg       0.72      0.67      0.58        43



In [11]:
# (NN) hyperparams tuned & restricted feature set 
clf3 = GridSearchCV(
    classifiers['SVM'], 
    params['SVM'],
    scoring='f1_macro'
)
clf3.fit(X_train[FEATURE_SETS['all_selected']], y_train)
preds3 = clf3.predict(X_test[FEATURE_SETS['all_selected']])
print(classification_report(y_test, preds3))

              precision    recall  f1-score   support

           0       0.68      0.97      0.80        29
           1       1.00      0.09      0.17        11
           2       0.00      0.00      0.00         3

    accuracy                           0.67        43
   macro avg       0.56      0.35      0.32        43
weighted avg       0.72      0.67      0.58        43

