In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [2]:
# Pipeline params
DATA_DIR = '../../data'
BASE_DATASET =  f'{DATA_DIR}/hydropower_efficiency.discretized_labels.csv'

SEED = 1
FEATURE_SETS = {
    'all': [
        "altitude_m",
        "nearest_lake_dist_km",
        "days_of_rain",
        "rainfall",
        "avg_daily_temp",
        "min_daily_temp",
        "max_daily_temp",
        "sea_level_pressure",
        "global_radiation",
        "50m_gradient",
        "100m_gradient",
        "500m_gradient",
    ],
    'all_selected': [
        "altitude_m",
        "nearest_lake_dist_km",
        "rainfall",
        "avg_daily_temp",
        "sea_level_pressure",
        "global_radiation",
        "50m_gradient",
        "500m_gradient",
    ],
    'precipitation': [
        "days_of_rain",
        "sea_level_pressure",
        "rainfall",
    ],
    'precipitation_selected': [
        "sea_level_pressure",
        "rainfall",
    ],
    'geospatial': [
        "altitude_m",
        "50m_gradient",
        "100m_gradient",
        "500m_gradient",
    ],
    'geospatial_selected': [
        "altitude_m",
        "50m_gradient",
        "500m_gradient",
    ],
    'geographic': [
        "nearest_lake_dist_km",
    ],
    'geographic_selected': [
        "nearest_lake_dist_km",
    ],
    'temperature': [
        "avg_daily_temp",
        "min_daily_temp",
        "max_daily_temp",
        "global_radiation",
    ],
    'temperature_selected': [
        "avg_daily_temp",
        "global_radiation",
    ],
}

DISPLAY_F1_ONLY = True
TUNE_HYPER_PARAMS = False

In [3]:
# Load and split dataset 
base_df = pd.read_csv(BASE_DATASET)
base_df.drop('plant_id', axis=1, inplace=True)
base_df.drop('type', axis=1, inplace=True)
base_df = base_df[base_df['gwh_per_mm3'] < 10] 

X, y = base_df.drop('grade', axis=1), base_df['grade']
X = X[FEATURE_SETS['all']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

In [4]:
# Scale features
scaler = preprocessing.MinMaxScaler()
X_train[FEATURE_SETS['all']] = scaler.fit_transform(X_train[FEATURE_SETS['all']])
X_val[FEATURE_SETS['all']] = scaler.transform(X_val[FEATURE_SETS['all']])

In [113]:
# Models + grid search params 
classifiers = {
    'LR': LogisticRegression(class_weight='balanced'),
    'RF': RandomForestClassifier(class_weight='balanced'),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(class_weight='balanced'),
    'NN': MLPClassifier()
}

params = {
    'LR': {
        'class_weight': ['balanced'],
        'multi_class': ['multinomial'],
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'C': [0.001, 0.1, 1, 10],
        'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
        'max_iter': [5000],
        'random_state': [SEED]
    },
    'RF': {
        'class_weight': ['balanced'],
        'max_features': ['auto'],
        'n_estimators': [500],
        'max_depth': [None],
        'min_samples_leaf': [1],
        'n_jobs': [3],
        'random_state': [SEED]
    },
    'SVM': {
        'class_weight': ['balanced'],
        'kernel': ['poly', 'rbf'], 
        'C': [0.001, 0.1, 1, 10],
        'degree': [3, 5, 10, 20]
    },
    'QDA': {
        'reg_param': [0.001, 0.1, 1]
    },
    'NN': {
        'hidden_layer_sizes': [(2,), (2, 2), (4, 4), (16,), (16, 32), (32, 64)],
        'activation': ['logistic', 'relu'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['constant', 'invscaling'],
        'max_iter': [5000],
        'random_state': [SEED]
    }
}

In [6]:
def eval_tuned_clf(base_clf_name, tune, feature_set):
    if tune:
        clf = GridSearchCV(
            classifiers[base_clf_name], 
            params[base_clf_name],
            scoring='f1_macro'
        )
    else:
        clf = classifiers[base_clf_name]
    clf.fit(X_train[feature_set], y_train)

    preds_train = clf.predict(X_train[feature_set])
    preds_val = clf.predict(X_val[feature_set])
    
    train_f1 = f1_score(y_train, preds_train, average='macro')
    val_f1 = f1_score(y_val, preds_val, average='macro')
    
    if DISPLAY_F1_ONLY:
        print(f'== train f1: {train_f1}')
        print(f'== val f1: {val_f1}')
    else:
        print(f'== train')
        print(classification_report(y_train, preds_train))
        print('==')

        print(f'== val')
        print(classification_report(y_val, preds_val))
        print('==')
    return train_f1, val_f1

In [None]:
# # Try all models
# f1_scores = {'model': [], 'train_f1': [], 'val_f1': [], 'tuned': [], 'feature_set': []}
# for base_clf_name in classifiers.keys(): # for every model
#     for tuned in [False, True]: # try tuned & untuned version
#         for feature_set in ['all', 'all_selected']: # try raw feature set & restricted feature set
#             print(f'==== clf: {base_clf_name} | is_tuned: {tuned} | feature_set: {feature_set}')
#             train_f1, val_f1 = eval_tuned_clf(base_clf_name, tuned, FEATURE_SETS[feature_set])
#             print()

#             f1_scores['model'].append(base_clf_name)
#             f1_scores['train_f1'].append(train_f1)
#             f1_scores['val_f1'].append(val_f1)
#             f1_scores['tuned'].append(tuned)
#             f1_scores['feature_set'].append(feature_set)

==== clf: LR | is_tuned: False | feature_set: all
== train f1: 0.48430298981449216
== val f1: 0.3676343922581953

==== clf: LR | is_tuned: False | feature_set: all_selected
== train f1: 0.47369780456306493
== val f1: 0.3479643019998697

==== clf: LR | is_tuned: True | feature_set: all
== train f1: 0.430272536687631
== val f1: 0.33350041771094396

==== clf: LR | is_tuned: True | feature_set: all_selected
== train f1: 0.4765662958048382
== val f1: 0.3928626300719324

==== clf: RF | is_tuned: False | feature_set: all
== train f1: 1.0
== val f1: 0.3819291819291819

==== clf: RF | is_tuned: False | feature_set: all_selected
== train f1: 1.0
== val f1: 0.3637071651090342

==== clf: RF | is_tuned: True | feature_set: all


In [None]:
pd.DataFrame.from_dict(f1_scores)

In [14]:
# Ablative analysis
component_performance = {'Component': [], 'F1 Score (Validation)': []}

component_performance['Component'].append('Overall')
component_performance['F1 Score (Validation)'].append(
    eval_tuned_clf(
        'RF', 
        True,
        FEATURE_SETS['all_selected']
    )[1]
)

component_performance['Component'].append('Temperature Features')
component_performance['F1 Score (Validation)'].append(
    eval_tuned_clf(
        'RF', 
        True,
        FEATURE_SETS['precipitation_selected'] + FEATURE_SETS['geospatial_selected'] + FEATURE_SETS['geographic_selected']
    )[1]
)

component_performance['Component'].append('Geographic Features')
component_performance['F1 Score (Validation)'].append(
    eval_tuned_clf(
        'RF', 
        True,
        FEATURE_SETS['precipitation_selected'] + FEATURE_SETS['geospatial_selected']
    )[1]
)

component_performance['Component'].append('Geospatial Features')
component_performance['F1 Score (Validation)'].append(
    eval_tuned_clf(
        'RF', 
        True,
        FEATURE_SETS['precipitation_selected']
    )[1]
)

pd.DataFrame.from_dict(component_performance)

== train f1: 0.9170853884263325
== val f1: 0.4393614303959132
== train f1: 0.9098588993239871
== val f1: 0.4114285714285714
== train f1: 0.8916454298335034
== val f1: 0.3844444444444444
== train f1: 0.96897360131663
== val f1: 0.34923076923076923


Unnamed: 0,Component,F1 Score (Validation)
0,Overall,0.439361
1,Temperature Features,0.411429
2,Geographic Features,0.384444
3,Geospatial Features,0.349231


In [8]:
X_final_train = pd.concat([X_train, X_val])
y_final_train = pd.concat([y_train, y_val])

In [114]:
# baseline & raw feature set
clf1 = RandomForestClassifier(class_weight='balanced', random_state=0)
clf1.fit(X_final_train[FEATURE_SETS['all']], y_final_train)
preds1 = clf1.predict(X_test[FEATURE_SETS['all']]) 
print(classification_report(y_test, preds1))

              precision    recall  f1-score   support

           0       0.67      0.56      0.61        54
           1       0.32      0.52      0.39        25
           2       0.00      0.00      0.00         7

    accuracy                           0.50        86
   macro avg       0.33      0.36      0.33        86
weighted avg       0.51      0.50      0.50        86



  _warn_prf(average, modifier, msg_start, len(result))


In [115]:
# hyperparams tuned & raw feature set
clf2 = GridSearchCV(
    classifiers['RF'], 
    params['RF'],
    scoring='f1_macro'
)
clf2.fit(X_final_train[FEATURE_SETS['all']], y_final_train)
preds2 = clf2.predict(X_test[FEATURE_SETS['all']])
print(classification_report(y_test, preds2))

              precision    recall  f1-score   support

           0       0.64      0.87      0.74        54
           1       0.38      0.20      0.26        25
           2       0.00      0.00      0.00         7

    accuracy                           0.60        86
   macro avg       0.34      0.36      0.33        86
weighted avg       0.52      0.60      0.54        86



  _warn_prf(average, modifier, msg_start, len(result))


In [116]:
# hyperparams tuned & restricted feature set 
clf3 = GridSearchCV(
    classifiers['RF'], 
    params['RF'],
    scoring='f1_macro'
)
clf3.fit(X_final_train[FEATURE_SETS['all_selected']], y_final_train)
preds3 = clf3.predict(X_test[FEATURE_SETS['all_selected']])
print(classification_report(y_test, preds3))

              precision    recall  f1-score   support

           0       0.63      0.87      0.73        54
           1       0.27      0.12      0.17        25
           2       0.00      0.00      0.00         7

    accuracy                           0.58        86
   macro avg       0.30      0.33      0.30        86
weighted avg       0.47      0.58      0.51        86



  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
y_test

227    0
208    0
164    0
277    0
62     0
      ..
291    0
245    0
426    0
312    1
80     0
Name: grade, Length: 86, dtype: int64

In [117]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, preds3, labels=[0, 1, 2])

array([[47,  7,  0],
       [22,  3,  0],
       [ 6,  1,  0]])

In [118]:
X_train

Unnamed: 0,altitude_m,nearest_lake_dist_km,days_of_rain,rainfall,avg_daily_temp,min_daily_temp,max_daily_temp,sea_level_pressure,global_radiation,50m_gradient,100m_gradient,500m_gradient
272,0.101568,0.082689,0.495413,0.123974,0.160250,0.109352,0.199800,0.988236,0.452508,0.107399,0.092715,0.013918
389,0.338095,0.047371,0.100917,0.164450,0.440200,0.394930,0.464212,0.996184,0.712485,0.304296,0.322697,0.260212
42,0.470857,0.019447,0.477064,0.492799,0.199337,0.266839,0.187065,0.989758,0.388188,0.337709,0.358820,0.231467
55,0.058667,0.078136,0.477064,0.365225,0.330356,0.372033,0.333326,0.987712,0.451133,0.455847,0.446117,0.275340
178,0.014160,0.172460,0.477064,0.353713,0.286924,0.332251,0.279623,0.987483,0.429590,0.516706,0.491872,0.423601
...,...,...,...,...,...,...,...,...,...,...,...,...
118,0.136058,0.008697,0.311927,0.110122,0.678621,0.616651,0.691280,0.996206,0.766040,0.085919,0.072848,0.018911
190,0.195021,0.064391,0.477064,0.270467,0.198860,0.233351,0.220688,0.988122,0.460388,0.514320,0.467188,0.378215
347,0.011370,0.059963,0.403670,0.151470,0.360016,0.382319,0.354182,0.988394,0.477429,0.137232,0.118603,0.059909
20,0.140496,0.341453,0.183486,0.139820,0.780165,0.777796,0.738318,0.996702,0.827515,0.211217,0.229380,0.095008


In [119]:
y_train

272    0
389    0
42     0
55     2
178    2
      ..
118    1
190    1
347    1
20     0
271    0
Name: grade, Length: 275, dtype: int64

In [None]:
# Learning curve
clf = RandomForestClassifier(class_weight='balanced', random_state=0)
clf.fit(X_train[FEATURE_SETS['all_selected']], y_train)
f1_score_train = f1_score(
    clf.predict(X_train[FEATURE_SETS['all_selected']]), 
    y_train, 
    average='macro'
)
f1_score_val = f1_score(
    clf.predict(X_val[FEATURE_SETS['all_selected']]), 
    y_val, 
    average='macro'
)