In [199]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [230]:
# Pipeline params
DATA_DIR = '../../data'
BASE_DATASET =  f'{DATA_DIR}/hydropower_efficiency.discretized_labels.csv'

SEED = 1
ATTRIBUTES = [
    "altitude_m",
    "nearest_lake_dist_km",
    "days_of_rain",
    "inches_of_rain",
    "avg_high_temp",
    "avg_low_temp"
]

DISPLAY_F1_ONLY = False
TUNE_HYPER_PARAMS = False

In [201]:
# Load and split dataset 
base_df = pd.read_csv(BASE_DATASET)
base_df.drop('plant_id', axis=1, inplace=True)
base_df.drop('type', axis=1, inplace=True)
base_df = base_df[base_df['gwh_per_mm3'] < 10] 

X, y = base_df.drop('grade', axis=1), base_df['grade']
X = X[ATTRIBUTES]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

In [202]:
# Scale features
scaler = preprocessing.MinMaxScaler()
X_train[ATTRIBUTES] = scaler.fit_transform(X_train[ATTRIBUTES])
X_val[ATTRIBUTES] = scaler.transform(X_val[ATTRIBUTES])

In [232]:
# Models + grid search params
classifiers = {
    'LR': LogisticRegression(class_weight='balanced'),
    'RF': RandomForestClassifier(class_weight='balanced'),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(class_weight='balanced'),
    'NN': MLPClassifier()
}

params = {
    'LR': {
        'class_weight': ['balanced'],
        'multi_class': ['multinomial'],
        'random_state': [SEED]
    },
    'RF': {
        'class_weight': ['balanced'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'n_estimators': [500],
        'min_samples_leaf': [1, 2, 4],
        'n_jobs': [3],
        'random_state': [SEED]
    },
    'SVM': {
        'class_weight': ['balanced'],
        'kernel': ['linear', 'poly', 'rbf'], 
        'C': [0.001, 0.1, 1, 10],
        'degree': [3, 5, 10, 20]
    },
    'QDA': {
        
    },
    'NN': {
        
    }
}

In [233]:
def eval_tuned_clf(base_clf_name):
    if TUNE_HYPER_PARAMS:
        clf = GridSearchCV(
            classifiers[base_clf_name], 
            params[base_clf_name],
            scoring='f1_macro'
        )
    else:
        clf = classifiers[base_clf_name]
    clf.fit(X_train, y_train)
    
    preds_train = clf.predict(X_train)
    preds_val = clf.predict(X_val)
    
    train_f1 = f1_score(y_train, preds_train, average='macro')
    val_f1 = f1_score(y_val, preds_val, average='macro')
    
    if DISPLAY_F1_ONLY:
        print(f'== train f1: {train_f1}')
        print(f'== val f1: {val_f1}')
    else:
        print(f'== train')
        print(classification_report(y_train, preds_train))
        print('==')

        print(f'== val')
        print(classification_report(y_val, preds_val))
        print('==')

In [234]:
eval_tuned_clf('RF')

== train
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       188
           1       1.00      1.00      1.00        89
           2       1.00      1.00      1.00        32

    accuracy                           1.00       309
   macro avg       1.00      1.00      1.00       309
weighted avg       1.00      1.00      1.00       309

==
== val
              precision    recall  f1-score   support

           0       0.71      0.79      0.75        52
           1       0.44      0.37      0.40        19
           2       0.25      0.14      0.18         7

    accuracy                           0.63        78
   macro avg       0.46      0.43      0.44        78
weighted avg       0.60      0.63      0.61        78

==


In [235]:
eval_tuned_clf('SVM')

== train
              precision    recall  f1-score   support

           0       0.72      0.72      0.72       188
           1       0.44      0.35      0.39        89
           2       0.38      0.56      0.45        32

    accuracy                           0.60       309
   macro avg       0.51      0.54      0.52       309
weighted avg       0.60      0.60      0.60       309

==
== val
              precision    recall  f1-score   support

           0       0.67      0.67      0.67        52
           1       0.23      0.16      0.19        19
           2       0.15      0.29      0.20         7

    accuracy                           0.51        78
   macro avg       0.35      0.37      0.35        78
weighted avg       0.52      0.51      0.51        78

==


In [None]:
# Try all models
for base_clf_name in classifiers.keys():
    print(f'=== {base_clf_name} ===')
    eval_tuned_clf(base_clf_name)
    print('===============')
    print()
    print()