In [33]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.model_selection import train_test_split

In [34]:
# --- Notebook configuration -------------------------------------------------
# Tunable settings used by the cells below.
SEED = 1    # passed as random_state to train_test_split
N_BINS = 3  # number of ordinal grades the discretizer produces

# Dataset locations (relative paths).
DATA_DIR = '../../data'
BASE_DATASET = DATA_DIR + '/hydropower_efficiency.csv'
OUTPUT_DATASET = DATA_DIR + '/hydropower_efficiency.discretized_labels.csv'

In [35]:
# Load the base dataset and keep only the rows whose target value is below 10
# (NOTE(review): presumably an outlier cut-off -- confirm the rationale).
base_df = pd.read_csv(BASE_DATASET).loc[lambda frame: frame['gwh_per_mm3'] < 10]

# Separate the regression target from the remaining feature columns.
y = base_df['gwh_per_mm3']
X = base_df.drop(columns='gwh_per_mm3')

In [36]:
# Hold out 20% of the rows ("bins" split); the discretizer is later fitted on
# that hold-out target only, while the remaining 80% ("learn" split) is the
# part that gets labelled.
X_learn, X_bins, y_learn, y_bins = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Turn both targets into (n_samples, 1) column arrays, the 2-D layout the
# discretizer is fitted on and applied to.
y_learn, y_bins = (
    np.array(target).reshape(-1, 1) for target in (y_learn, y_bins)
)

In [37]:
# Discretizer that maps the continuous target onto N_BINS ordinal grades,
# with bin edges derived by the 'kmeans' strategy.
est = KBinsDiscretizer(
    strategy='kmeans',
    encode='ordinal',
    n_bins=N_BINS,
)
# Fit on the hold-out target only; kept as the last expression so the cell
# still displays the fitted estimator.
est.fit(y_bins)

KBinsDiscretizer(encode='ordinal', n_bins=3, strategy='kmeans')

In [38]:
# Build the labelled frame: the "learn" features plus the continuous target
# and its discretized grade, inserted right after the first column.
learn_df = X_learn.copy(deep=True)

# y_learn is an (n_samples, 1) column array; flatten it so a plain 1-D vector
# is inserted (values are assigned positionally, matching X_learn's row order).
learn_df.insert(loc=1, column='gwh_per_mm3', value=y_learn.ravel())

# est.transform returns a 2-D float array with one column. Flatten it and cast
# in one vectorised step instead of calling int() on each 1-element row:
# converting an array with ndim > 0 to a Python scalar is deprecated since
# NumPy 1.25 and is slated to become an error.
learn_df.insert(
    loc=2,
    column='grade',
    value=est.transform(y_learn).ravel().astype('int64'),
)

# Replace the shuffled index left over from the split with a clean 0..n-1 range.
learn_df = learn_df.reset_index(drop=True)

In [39]:
# Summary statistics of the continuous target within each grade, to check
# where the learnt bin boundaries fall.
for grade in range(N_BINS):
    grade_values = learn_df.loc[learn_df['grade'] == grade, 'gwh_per_mm3']
    print(f'=== {grade} ===', grade_values.describe(), '===========', sep='\n')

=== 0 ===
count    269.000000
mean       0.644380
std        0.433268
min        0.004603
25%        0.291017
50%        0.550314
75%        0.907716
max        1.669091
Name: gwh_per_mm3, dtype: float64
=== 1 ===
count    119.000000
mean       2.815515
std        0.878610
min        1.674901
25%        2.063000
50%        2.641509
75%        3.402536
max        4.721429
Name: gwh_per_mm3, dtype: float64
=== 2 ===
count    42.000000
mean      6.800507
std       1.475250
min       4.738186
25%       5.596164
50%       6.682965
75%       7.670204
max       9.622222
Name: gwh_per_mm3, dtype: float64


In [40]:
# Display the labelled frame (features + 'gwh_per_mm3' + 'grade') for a visual check.
learn_df

Unnamed: 0,plant_id,gwh_per_mm3,grade,latitude,longitude,type,altitude_m,nearest_lake_dist_km,days_of_rain,avg_daily_temp,min_daily_temp,max_daily_temp,sea_level_pressure,global_radiation,rainfall,50m_gradient,100m_gradient,500m_gradient
0,H215,7.057746,2,45.009998,14.893000,HDAM,24.2,19.023055,115,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11.5,26.1,160.0
1,N660,0.157231,0,60.101650,6.913308,HDAM,1200.0,0.887112,113,-0.620301,-2.904493,1.871260,1010.741656,81.112329,3.855617,8.2,16.5,90.1
2,H549,9.232323,2,66.155998,25.125000,HDAM,40.0,7.740203,115,2.037342,-2.055534,5.873260,1008.752616,94.884932,1.338904,2.0,2.0,7.0
3,N717,0.590909,0,58.662011,8.062454,HDAM,211.2,0.362976,113,5.186438,2.013479,9.308356,1011.109054,89.553425,4.081918,22.3,45.3,273.0
4,N34,0.460972,0,59.353128,7.247817,HDAM,913.6,3.112441,113,1.806959,-1.762575,5.385260,1011.281383,78.852055,4.029315,13.4,26.1,127.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,H435,1.562500,0,46.443064,9.334613,HDAM,1249.0,4.257927,120,3.540712,0.206986,7.325233,1018.488784,155.958904,3.814521,2.3,13.1,319.0
426,H529,0.638539,0,46.175999,8.684000,HDAM,696.5,12.215937,83,9.727589,6.296712,13.482054,1017.912616,156.421918,3.314795,16.3,33.0,160.0
427,H18,0.585073,0,39.730301,-6.884900,HDAM,219.3,5.292871,77,17.692109,10.797918,24.857616,1018.292342,206.463014,0.732055,-1.1,-1.7,22.5
428,N50,3.026489,1,60.129931,6.641965,HDAM,524.3,0.924697,113,1.689507,-0.772356,4.546822,1010.617000,79.632877,5.887397,33.9,84.9,477.0


In [41]:
# NOTE(review): this displays the same, unmodified frame as the previous cell;
# the cell looks redundant and is a candidate for removal.
learn_df

Unnamed: 0,plant_id,gwh_per_mm3,grade,latitude,longitude,type,altitude_m,nearest_lake_dist_km,days_of_rain,avg_daily_temp,min_daily_temp,max_daily_temp,sea_level_pressure,global_radiation,rainfall,50m_gradient,100m_gradient,500m_gradient
0,H215,7.057746,2,45.009998,14.893000,HDAM,24.2,19.023055,115,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11.5,26.1,160.0
1,N660,0.157231,0,60.101650,6.913308,HDAM,1200.0,0.887112,113,-0.620301,-2.904493,1.871260,1010.741656,81.112329,3.855617,8.2,16.5,90.1
2,H549,9.232323,2,66.155998,25.125000,HDAM,40.0,7.740203,115,2.037342,-2.055534,5.873260,1008.752616,94.884932,1.338904,2.0,2.0,7.0
3,N717,0.590909,0,58.662011,8.062454,HDAM,211.2,0.362976,113,5.186438,2.013479,9.308356,1011.109054,89.553425,4.081918,22.3,45.3,273.0
4,N34,0.460972,0,59.353128,7.247817,HDAM,913.6,3.112441,113,1.806959,-1.762575,5.385260,1011.281383,78.852055,4.029315,13.4,26.1,127.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,H435,1.562500,0,46.443064,9.334613,HDAM,1249.0,4.257927,120,3.540712,0.206986,7.325233,1018.488784,155.958904,3.814521,2.3,13.1,319.0
426,H529,0.638539,0,46.175999,8.684000,HDAM,696.5,12.215937,83,9.727589,6.296712,13.482054,1017.912616,156.421918,3.314795,16.3,33.0,160.0
427,H18,0.585073,0,39.730301,-6.884900,HDAM,219.3,5.292871,77,17.692109,10.797918,24.857616,1018.292342,206.463014,0.732055,-1.1,-1.7,22.5
428,N50,3.026489,1,60.129931,6.641965,HDAM,524.3,0.924697,113,1.689507,-0.772356,4.546822,1010.617000,79.632877,5.887397,33.9,84.9,477.0


In [42]:
# Persist the labelled frame; the 0..n-1 row index carries no information,
# so it is left out of the CSV.
learn_df.to_csv(
    path_or_buf=OUTPUT_DATASET,
    index=False,
)