In [138]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.model_selection import train_test_split

In [139]:
# Tunables: seed handed to the train/bins split, and how many ordinal grades to create.
N_BINS = 5
SEED = 1

# The input table and the labelled output live side by side in the data directory.
DATA_DIR = '../../data'
OUTPUT_DATASET = f'{DATA_DIR}/hydropower_efficiency.discretized_labels.csv'
BASE_DATASET = f'{DATA_DIR}/hydropower_efficiency.csv'

In [140]:
# Read the source table and keep only rows whose gwh_per_mm3 is strictly below 10.
# Rows with a missing gwh_per_mm3 fail the comparison and are dropped as well.
# NOTE(review): the cutoff of 10 presumably trims outliers -- confirm the rationale.
base_df = pd.read_csv(BASE_DATASET).loc[lambda frame: frame['gwh_per_mm3'].lt(10)]

# Separate the continuous target from the remaining columns.
y = base_df['gwh_per_mm3']
X = base_df.drop(columns='gwh_per_mm3')

In [141]:
# Seeded 80/20 split: the 20% "*_bins" part is what the discretizer is fitted on,
# the 80% "*_learn" part is what gets labelled and exported.
X_learn, X_bins, y_learn, y_bins = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Turn both targets into (n_samples, 1) column vectors, the 2-D layout the
# discretizer is given for fit/transform.
y_learn, y_bins = (np.array(target)[:, np.newaxis] for target in (y_learn, y_bins))

In [142]:
# Learn N_BINS bin edges for the target by k-means, using only the held-out
# "bins" split; 'ordinal' encoding yields one bin index per sample.
est = KBinsDiscretizer(
    strategy='kmeans',
    encode='ordinal',
    n_bins=N_BINS,
)
est.fit(y_bins)

KBinsDiscretizer(encode='ordinal', n_bins=5, strategy='kmeans')

In [143]:
# Assemble the labelled learning set: the feature columns plus the continuous
# target and its discretized grade, inserted at column positions 1 and 2.
learn_df = X_learn.copy(deep=True)

# y_learn is an (n, 1) column vector; flatten it so a plain 1-D column is inserted.
# The values are a NumPy array, so they are assigned by position, which matches
# the row order of X_learn.
learn_df.insert(loc=1, column='gwh_per_mm3', value=y_learn.ravel())

# est.transform returns an (n, 1) float array of bin indices. Flatten and cast in
# one vectorised step instead of calling int() on each 1-element row, which NumPy
# has deprecated (since 1.25) and plans to turn into an error.
learn_df.insert(
    loc=2,
    column='grade',
    value=est.transform(y_learn).ravel().astype('int64'),
)

# Replace the shuffled index left over from the split with a clean 0..n-1 range.
learn_df = learn_df.reset_index(drop=True)

In [144]:
# Print summary statistics of the continuous target for every grade so the
# bin ranges and their populations can be checked by eye.
for grade in range(N_BINS):
    in_grade = learn_df['grade'].eq(grade)
    summary = learn_df.loc[in_grade, 'gwh_per_mm3'].describe()
    print(f'=== {grade} ===', summary, '===========', sep='\n')

=== 0 ===
count    256.000000
mean       0.595697
std        0.384637
min        0.004603
25%        0.286726
50%        0.531616
75%        0.834834
max        1.513897
Name: gwh_per_mm3, dtype: float64
=== 1 ===
count    105.000000
mean       2.321999
std        0.561950
min        1.518838
25%        1.840425
50%        2.220588
75%        2.724138
max        3.500000
Name: gwh_per_mm3, dtype: float64
=== 2 ===
count    44.000000
mean      4.617506
std       0.700318
min       3.611111
25%       4.105541
50%       4.566386
75%       5.130390
max       5.891892
Name: gwh_per_mm3, dtype: float64
=== 3 ===
count    17.000000
mean      7.173310
std       0.664270
min       6.031850
25%       6.689032
50%       7.155635
75%       7.513347
max       8.475610
Name: gwh_per_mm3, dtype: float64
=== 4 ===
count    8.000000
mean     9.072615
std      0.366593
min      8.495562
25%      8.858307
50%      9.131206
75%      9.267495
max      9.622222
Name: gwh_per_mm3, dtype: float64


In [145]:
# Display the assembled, labelled frame as the cell output for a visual check.
learn_df

Unnamed: 0,gwh_per_mm3,grade,plant_id,latitude,longitude,type,altitude_m,nearest_lake_dist_km,days_of_rain,inches_of_rain,avg_high_temp,avg_low_temp
0,7.057746,3,H215,45.009998,14.893000,HDAM,24.2,19.023055,115,53.9,59,43
1,0.157231,0,N660,60.101650,6.913308,HDAM,1200.0,0.887112,113,30.0,49,36
2,9.232323,4,H549,66.155998,25.125000,HDAM,40.0,7.740203,115,26.9,48,35
3,0.590909,0,N717,58.662011,8.062454,HDAM,211.2,0.362976,113,30.0,49,36
4,0.460972,0,N34,59.353128,7.247817,HDAM,913.6,3.112441,113,30.0,49,36
...,...,...,...,...,...,...,...,...,...,...,...,...
425,1.562500,1,H435,46.443064,9.334613,HDAM,1249.0,4.257927,120,37.3,58,42
426,0.638539,0,H529,46.175999,8.684000,HDAM,696.5,12.215937,83,36.2,64,47
427,0.585073,0,H18,39.730301,-6.884900,HDAM,219.3,5.292871,77,28.6,70,56
428,3.026489,1,N50,60.129931,6.641965,HDAM,524.3,0.924697,113,30.0,49,36


In [146]:
# Persist the labelled learning set; the row index is not written to the file.
learn_df.to_csv(
    path_or_buf=OUTPUT_DATASET,
    index=False,
)