In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [13]:
# Pipeline params
DATA_DIR = '../../data'
BASE_DATASET =  f'{DATA_DIR}/hydropower_efficiency.discretized_labels.csv'

SEED = 1
FEATURE_SETS = {
    'all': [
        "altitude_m",
        "nearest_lake_dist_km",
        "days_of_rain",
        "rainfall",
        "avg_daily_temp",
        "min_daily_temp",
        "max_daily_temp",
        "sea_level_pressure",
        "global_radiation",
        "50m_gradient",
        "100m_gradient",
        "500m_gradient",
    ],
    'all_selected': [
        "altitude_m",
        "nearest_lake_dist_km",
        "rainfall",
        "avg_daily_temp",
        "sea_level_pressure",
        "global_radiation",
        "50m_gradient",
        "500m_gradient",
    ],
    'precipitation': [
        "days_of_rain",
        "sea_level_pressure",
        "rainfall",
    ],
    'precipitation_selected': [
        "sea_level_pressure",
        "rainfall",
    ],
    'geospatial': [
        "altitude_m",
        "50m_gradient",
        "100m_gradient",
        "500m_gradient",
    ],
    'geospatial_selected': [
        "altitude_m",
        "50m_gradient",
        "500m_gradient",
    ],
    'geographic': [
        "nearest_lake_dist_km",
    ],
    'geographic_selected': [
        "nearest_lake_dist_km",
    ],
    'temperature': [
        "avg_daily_temp",
        "min_daily_temp",
        "max_daily_temp",
        "global_radiation",
    ],
    'temperature_selected': [
        "avg_daily_temp",
        "global_radiation",
    ],
}

DISPLAY_F1_ONLY = False
TUNE_HYPER_PARAMS = False

In [14]:
# Load and split dataset 
base_df = pd.read_csv(BASE_DATASET)
base_df.drop('plant_id', axis=1, inplace=True)
base_df.drop('type', axis=1, inplace=True)
base_df = base_df[base_df['gwh_per_mm3'] < 10] 

X, y = base_df.drop('grade', axis=1), base_df['grade']
X = X[FEATURE_SETS['all']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

In [15]:
# Scale features
scaler = preprocessing.MinMaxScaler()
X_train[FEATURE_SETS['all']] = scaler.fit_transform(X_train[FEATURE_SETS['all']])
X_val[FEATURE_SETS['all']] = scaler.transform(X_val[FEATURE_SETS['all']])

In [16]:
# Models + grid search params 
classifiers = {
    'LR': LogisticRegression(class_weight='balanced'),
    'RF': RandomForestClassifier(class_weight='balanced'),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(class_weight='balanced'),
    'NN': MLPClassifier()
}

params = {
    'LR': {
        'class_weight': ['balanced'],
        'multi_class': ['multinomial'],
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'C': [0.001, 0.1, 1, 10],
        'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
        'max_iter': [5000],
        'random_state': [SEED]
    },
    'RF': {
        'class_weight': ['balanced'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'n_estimators': [500],
        'min_samples_leaf': [1, 2, 4],
        'n_jobs': [3],
        'random_state': [SEED]
    },
    'SVM': {
        'class_weight': ['balanced'],
        'kernel': ['poly', 'rbf'], 
        'C': [0.001, 0.1, 1, 10],
        'degree': [3, 5, 10, 20]
    },
    'QDA': {
        'reg_param': [0.001, 0.1, 1]
    },
    'NN': {
        'hidden_layer_sizes': [(2,), (2, 2), (4, 4), (16,), (16, 32), (32, 64)],
        'activation': ['logistic', 'relu'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['constant', 'invscaling'],
        'max_iter': [5000],
        'random_state': [SEED]
    }
}

In [17]:
def eval_tuned_clf(base_clf_name, tune, feature_set):
    if tune:
        clf = GridSearchCV(
            classifiers[base_clf_name], 
            params[base_clf_name],
            scoring='f1_macro'
        )
    else:
        clf = classifiers[base_clf_name]
    clf.fit(X_train[feature_set], y_train)

    preds_train = clf.predict(X_train[feature_set])
    preds_val = clf.predict(X_val[feature_set])
    
    train_f1 = f1_score(y_train, preds_train, average='macro')
    val_f1 = f1_score(y_val, preds_val, average='macro')
    
    if DISPLAY_F1_ONLY:
        print(f'== train f1: {train_f1}')
        print(f'== val f1: {val_f1}')
    else:
        print(f'== train')
        print(classification_report(y_train, preds_train))
        print('==')

        print(f'== val')
        print(classification_report(y_val, preds_val))
        print('==')
    return train_f1, val_f1

In [18]:
# Try all models
f1_scores = {'model': [], 'train_f1': [], 'val_f1': [], 'tuned': [], 'feature_set': []}
for base_clf_name in classifiers.keys(): # for every model
    for tuned in [False, True]: # try tuned & untuned version
        for feature_set in ['all', 'all_selected']: # try raw feature set & restricted feature set
            print(f'==== clf: {base_clf_name} | is_tuned: {tuned} | feature_set: {feature_set}')
            train_f1, val_f1 = eval_tuned_clf(base_clf_name, tuned, FEATURE_SETS[feature_set])
            print()

            f1_scores['model'].append(base_clf_name)
            f1_scores['train_f1'].append(train_f1)
            f1_scores['val_f1'].append(val_f1)
            f1_scores['tuned'].append(tuned)
            f1_scores['feature_set'].append(feature_set)

==== clf: LR | is_tuned: False | feature_set: all
== train
              precision    recall  f1-score   support

           0       0.76      0.61      0.68       168
           1       0.39      0.37      0.38        78
           2       0.28      0.66      0.40        29

    accuracy                           0.55       275
   macro avg       0.48      0.54      0.48       275
weighted avg       0.61      0.55      0.56       275

==
== val
              precision    recall  f1-score   support

           0       0.77      0.64      0.70        47
           1       0.28      0.31      0.29        16
           2       0.08      0.17      0.11         6

    accuracy                           0.52        69
   macro avg       0.38      0.37      0.37        69
weighted avg       0.60      0.52      0.55        69

==

==== clf: LR | is_tuned: False | feature_set: all_selected
== train
              precision    recall  f1-score   support

           0       0.76      0.61      0.6

  _warn_prf(average, modifier, msg_start, len(result))


== train
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       168
           1       0.89      0.83      0.86        78
           2       0.93      0.97      0.95        29

    accuracy                           0.92       275
   macro avg       0.92      0.92      0.92       275
weighted avg       0.92      0.92      0.92       275

==
== val
              precision    recall  f1-score   support

           0       0.77      0.87      0.82        47
           1       0.31      0.25      0.28        16
           2       0.33      0.17      0.22         6

    accuracy                           0.67        69
   macro avg       0.47      0.43      0.44        69
weighted avg       0.63      0.67      0.64        69

==

==== clf: QDA | is_tuned: False | feature_set: all
== train
              precision    recall  f1-score   support

           0       0.67      0.98      0.79       168
           1       0.74      0.18      0.29   

  _warn_prf(average, modifier, msg_start, len(result))


== train
              precision    recall  f1-score   support

           0       0.66      0.95      0.78       168
           1       0.52      0.22      0.31        78
           2       0.00      0.00      0.00        29

    accuracy                           0.64       275
   macro avg       0.39      0.39      0.36       275
weighted avg       0.55      0.64      0.56       275

==
== val
              precision    recall  f1-score   support

           0       0.73      0.91      0.81        47
           1       0.50      0.31      0.38        16
           2       0.00      0.00      0.00         6

    accuracy                           0.70        69
   macro avg       0.41      0.41      0.40        69
weighted avg       0.61      0.70      0.64        69

==

==== clf: NN | is_tuned: False | feature_set: all_selected


  _warn_prf(average, modifier, msg_start, len(result))


== train
              precision    recall  f1-score   support

           0       0.65      0.96      0.77       168
           1       0.54      0.18      0.27        78
           2       0.00      0.00      0.00        29

    accuracy                           0.64       275
   macro avg       0.40      0.38      0.35       275
weighted avg       0.55      0.64      0.55       275

==
== val
              precision    recall  f1-score   support

           0       0.73      0.94      0.82        47
           1       0.56      0.31      0.40        16
           2       0.00      0.00      0.00         6

    accuracy                           0.71        69
   macro avg       0.43      0.42      0.41        69
weighted avg       0.63      0.71      0.65        69

==

==== clf: NN | is_tuned: True | feature_set: all
== train
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       168
           1       0.83      0.81      0.82     

In [19]:
pd.DataFrame.from_dict(f1_scores)

Unnamed: 0,model,train_f1,val_f1,tuned,feature_set
0,LR,0.484303,0.367634,False,all
1,LR,0.473698,0.347964,False,all_selected
2,LR,0.430273,0.3335,True,all
3,LR,0.476566,0.392863,True,all_selected
4,RF,1.0,0.389686,False,all
5,RF,1.0,0.381929,False,all_selected
6,RF,1.0,0.364469,True,all
7,RF,0.917085,0.439361,True,all_selected
8,QDA,0.479968,0.317484,False,all
9,QDA,0.398074,0.319653,False,all_selected


In [27]:
X_final_train = pd.concat([X_train, X_val])
y_final_train = pd.concat([y_train, y_val])

In [28]:
# (LR) baseline & raw feature set
clf1 = LogisticRegression(class_weight='balanced', multi_class='multinomial')
clf1.fit(X_final_train[FEATURE_SETS['all']], y_final_train)
preds1 = clf1.predict(X_test[FEATURE_SETS['all']])
print(classification_report(y_test, preds1))

              precision    recall  f1-score   support

           0       0.68      0.93      0.78        54
           1       0.67      0.24      0.35        25
           2       0.33      0.14      0.20         7

    accuracy                           0.66        86
   macro avg       0.56      0.44      0.44        86
weighted avg       0.65      0.66      0.61        86



In [32]:
# (NN) hyperparams tuned & raw feature set
clf2 = GridSearchCV(
    classifiers['RF'], 
    params['RF'],
    scoring='f1_macro'
)
clf2.fit(X_final_train[FEATURE_SETS['all_selected']], y_final_train)
preds2 = clf2.predict(X_test[FEATURE_SETS['all_selected']])
print(classification_report(y_test, preds2))

              precision    recall  f1-score   support

           0       0.50      0.02      0.04        54
           1       0.29      0.92      0.44        25
           2       0.20      0.14      0.17         7

    accuracy                           0.29        86
   macro avg       0.33      0.36      0.21        86
weighted avg       0.41      0.29      0.16        86



In [33]:
# (NN) hyperparams tuned & restricted feature set 
clf3 = GridSearchCV(
    classifiers['RF'], 
    params['RF'],
    scoring='f1_macro'
)
clf3.fit(X_final_train[FEATURE_SETS['all_selected']], y_final_train)
preds3 = clf3.predict(X_test[FEATURE_SETS['all_selected']])
print(classification_report(y_test, preds3))

              precision    recall  f1-score   support

           0       0.50      0.02      0.04        54
           1       0.29      0.92      0.44        25
           2       0.20      0.14      0.17         7

    accuracy                           0.29        86
   macro avg       0.33      0.36      0.21        86
weighted avg       0.41      0.29      0.16        86

