In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [2]:
# Pipeline params
DATA_DIR = '../../data'
BASE_DATASET = DATA_DIR + '/hydropower_efficiency.discretized_labels.csv'

# Seed passed as `random_state` wherever this notebook asks for one.
SEED = 1

# Named groups of dataset columns. List order is significant: it becomes the
# column order of any frame selected with one of these lists.
FEATURE_SETS = {
    'all': [
        'altitude_m', 'nearest_lake_dist_km',
        'days_of_rain', 'rainfall',
        'avg_daily_temp', 'min_daily_temp', 'max_daily_temp',
        'sea_level_pressure', 'global_radiation',
        '50m_gradient', '100m_gradient', '500m_gradient',
    ],
    'precipitation': ['days_of_rain', 'sea_level_pressure', 'rainfall'],
    'geospatial': ['altitude_m', '50m_gradient', '100m_gradient', '500m_gradient'],
    'geographic': ['nearest_lake_dist_km'],
    'temperature': ['avg_daily_temp', 'min_daily_temp', 'max_daily_temp', 'global_radiation'],
}

# True: print only macro-F1 per split; False: print full classification reports.
DISPLAY_F1_ONLY = True
# NOTE(review): not referenced by any cell visible in this part of the
# notebook -- confirm it is still used before relying on it.
TUNE_HYPER_PARAMS = False

In [3]:
# Load and split dataset
# Read the CSV and discard the 'plant_id' and 'type' columns.
base_df = pd.read_csv(BASE_DATASET).drop(columns=['plant_id', 'type'])

# Keep only rows whose 'gwh_per_mm3' is below 10.
below_cutoff = base_df['gwh_per_mm3'] < 10
base_df = base_df[below_cutoff]

# Inputs are the columns of the 'all' feature set; the target is 'grade'.
X = base_df[FEATURE_SETS['all']]
y = base_df['grade']

# Two seeded splits: 10% of the rows become the test set, then 20% of the
# remaining rows become the validation set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=SEED
)

In [4]:
# Scale features
# The scaler is fitted on the training split only and then applied unchanged
# to the validation split, so validation statistics never influence it.
scaler = preprocessing.MinMaxScaler()

# train_test_split hands back selections of the original frame; assigning
# columns on those directly can raise pandas' SettingWithCopyWarning. Work on
# explicit copies so the assignments below are unambiguous.
X_train = X_train.copy()
X_val = X_val.copy()

X_train[FEATURE_SETS['all']] = scaler.fit_transform(X_train[FEATURE_SETS['all']])
X_val[FEATURE_SETS['all']] = scaler.transform(X_val[FEATURE_SETS['all']])
# NOTE(review): X_test is not scaled in this cell; it must go through
# `scaler.transform` before being fed to any model fitted on X_train.

In [5]:
# Models + grid search params
# Untuned baseline estimators, keyed by a short model name. Estimators with
# internal randomness are seeded with SEED so the untuned scores are
# reproducible across kernel restarts.
classifiers = {
    'LR': LogisticRegression(class_weight='balanced'),
    'RF': RandomForestClassifier(class_weight='balanced', random_state=SEED),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(class_weight='balanced'),
    'NN': MLPClassifier(random_state=SEED)
}

# Grid-search spaces, keyed by the same names as `classifiers`. Single-value
# lists pin a setting for every candidate instead of searching over it.
params = {
    'LR': {
        'class_weight': ['balanced'],
        'multi_class': ['multinomial'],
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'C': [0.001, 0.1, 1, 10],
        'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
        'max_iter': [5000],
        'random_state': [SEED]
    },
    'RF': {
        'class_weight': ['balanced'],
        # 'auto' is intentionally absent: for classifiers it was an alias of
        # 'sqrt', and scikit-learn deprecated it in 1.1 and removed it in 1.3.
        'max_features': ['sqrt', 'log2'],
        'n_estimators': [500],
        'min_samples_leaf': [1, 2, 4],
        'n_jobs': [3],
        'random_state': [SEED]
    },
    'SVM': {
        'class_weight': ['balanced'],
        'kernel': ['poly', 'rbf'],
        'C': [0.001, 0.1, 1, 10],
        # 'degree' only affects the 'poly' kernel; the 'rbf' candidates are
        # refitted once per degree value with identical results.
        'degree': [3, 5, 10, 20]
    },
    'QDA': {
        'reg_param': [0.001, 0.1, 1]
    },
    'NN': {
        'hidden_layer_sizes': [(2,), (2, 2), (4, 4), (16,), (16, 32), (32, 64)],
        'activation': ['logistic', 'relu'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['constant', 'invscaling'],
        'max_iter': [5000],
        'random_state': [SEED]
    }
}

In [6]:
def eval_tuned_clf(base_clf_name, tune, feature_set):
    """Fit one model on the training split and report macro-F1 on train and val.

    Reads the notebook-level `X_train`, `y_train`, `X_val`, `y_val`,
    `classifiers`, `params` and `DISPLAY_F1_ONLY`.

    Args:
        base_clf_name: Key into `classifiers` (and into `params` when tuning).
        tune: When truthy, the estimator is wrapped in a `GridSearchCV` over
            `params[base_clf_name]` scored with 'f1_macro'. Otherwise the
            estimator stored in `classifiers` is fitted directly, i.e. that
            shared instance itself ends up fitted.
        feature_set: Column labels selected from `X_train` / `X_val`.

    Returns:
        Tuple `(train_f1, val_f1)` of macro-averaged F1 scores.

    Prints either the two F1 values or, when `DISPLAY_F1_ONLY` is falsy, a
    full classification report for each split.
    """
    estimator = classifiers[base_clf_name]
    model = (
        GridSearchCV(estimator, params[base_clf_name], scoring='f1_macro')
        if tune
        else estimator
    )
    model.fit(X_train[feature_set], y_train)

    preds_train = model.predict(X_train[feature_set])
    preds_val = model.predict(X_val[feature_set])

    train_f1 = f1_score(y_train, preds_train, average='macro')
    val_f1 = f1_score(y_val, preds_val, average='macro')

    # Short form: just the two scores.
    if DISPLAY_F1_ONLY:
        print(f'== train f1: {train_f1}')
        print(f'== val f1: {val_f1}')
        return train_f1, val_f1

    # Long form: one full report per split, train first.
    reports = (
        ('== train', y_train, preds_train),
        ('== val', y_val, preds_val),
    )
    for heading, truth, predicted in reports:
        print(heading)
        print(classification_report(truth, predicted))
        print('==')
    return train_f1, val_f1

In [7]:
# Try all models
# Each classifier is evaluated twice on the full feature set: once as stored
# in `classifiers` (tuned=False) and once through the grid search (tuned=True).
f1_scores = {column: [] for column in ('model', 'train_f1', 'val_f1', 'tuned')}
for base_clf_name in classifiers:
    for tuned in (False, True):
        print(f'==== {base_clf_name}')
        train_f1, val_f1 = eval_tuned_clf(base_clf_name, tuned, FEATURE_SETS['all'])
        print()

        # One result row, appended column by column.
        result = {
            'model': base_clf_name,
            'train_f1': train_f1,
            'val_f1': val_f1,
            'tuned': tuned,
        }
        for column, value in result.items():
            f1_scores[column].append(value)

==== LR
== train f1: 0.4560685156734921
== val f1: 0.3126984126984127

==== LR
== train f1: 0.47660882555597656
== val f1: 0.3654015401540154

==== RF
== train f1: 1.0
== val f1: 0.4069008670778582

==== RF
== train f1: 0.9398380108356017
== val f1: 0.4066589237320945

==== QDA
== train f1: 0.5666626487628874
== val f1: 0.3422950819672131

==== QDA
== train f1: 0.5343851022586722
== val f1: 0.3598765432098765

==== SVM
== train f1: 0.5156754470093029
== val f1: 0.3043235035563014

==== SVM
== train f1: 0.7502551086310677
== val f1: 0.3972222222222222

==== NN




== train f1: 0.381178670626736
== val f1: 0.33090467516697025

==== NN
== train f1: 0.8166215166215167
== val f1: 0.4377717166707992



In [8]:
# One row per (model, tuned) evaluation; the keys of `f1_scores` become columns.
pd.DataFrame(f1_scores)

Unnamed: 0,model,train_f1,val_f1,tuned
0,LR,0.456069,0.312698,False
1,LR,0.476609,0.365402,True
2,RF,1.0,0.406901,False
3,RF,0.939838,0.406659,True
4,QDA,0.566663,0.342295,False
5,QDA,0.534385,0.359877,True
6,SVM,0.515675,0.304324,False
7,SVM,0.750255,0.397222,True
8,NN,0.381179,0.330905,False
9,NN,0.816622,0.437772,True
