In [1]:
import pandas as pd
import numpy as np
from pandas.plotting import register_matplotlib_converters

In [2]:
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()

# Load the algae measurements; the 'date' column becomes the (parsed) index.
original = pd.read_csv(
    '../../data/algae.csv',
    index_col='date',
    sep=',',
    decimal='.',
    parse_dates=True,
    infer_datetime_format=True,
)

# Re-type every text (object) column as a pandas categorical.
sb_vars = original.select_dtypes(include='object')
original[sb_vars.columns] = sb_vars.astype('category')

# Split the variables by kind: numeric vs. symbolic (categorical).
cols_nr = original.select_dtypes(include='number')
cols_sb = original.select_dtypes(include='category')

original.describe(include='all')

# The 'count' row is not the same for every column: several variables have
# missing values, which need to be filled in.

Unnamed: 0,pH,Oxygen,Chloride,Nitrates,Ammonium,Orthophosphate,Phosphate,Chlorophyll,fluid_velocity,river_depth,season
count,199.0,198.0,190.0,198.0,198.0,198.0,198.0,188.0,200,200,200
unique,,,,,,,,,3,3,4
top,,,,,,,,,high,medium,winter
freq,,,,,,,,,84,84,62
mean,8.011759,9.117778,43.636842,3.282828,154.447475,83.325909,111.550808,13.541011,,,
std,0.598302,2.391253,46.830993,3.776458,180.011207,116.783165,102.214088,20.448323,,,
min,5.6,1.5,0.22,0.05,5.0,1.0,0.9,0.0,,,
25%,7.7,7.725,10.985,1.2975,35.625,16.0,19.395,2.0,,,
50%,8.06,9.8,32.73,2.675,99.665,41.4,84.5,5.2,,,
75%,8.4,10.8,57.825,4.4475,203.73,102.2475,182.16,18.3,,,


In [4]:
# Number of missing values in each column of the loaded data.
original.isna().sum()

pH                 1
Oxygen             2
Chloride          10
Nitrates           2
Ammonium           2
Orthophosphate     2
Phosphate          2
Chlorophyll       12
fluid_velocity     0
river_depth        0
season             0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer

# First attempt: fill every gap with the constant string 'NA'.
imp = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value='NA',
    copy=True,
)
mat = imp.fit_transform(original.values)
data = pd.DataFrame(mat, columns=original.columns)
data.describe(include='all')

# Every gap is filled now, but with the text 'NA' rather than a number: the
# measurement columns are no longer numeric, so describe() reports
# unique/top/freq for them instead of mean, std, min, etc.

Unnamed: 0,pH,Oxygen,Chloride,Nitrates,Ammonium,Orthophosphate,Phosphate,Chlorophyll,fluid_velocity,river_depth,season
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200,200,200
unique,72.0,89.0,179.0,162.0,179.0,175.0,188.0,131.0,3,3,4
top,8.3,9.8,,1.72,10.0,1.0,6.0,,high,medium,winter
freq,14.0,8.0,10.0,3.0,9.0,5.0,4.0,12.0,84,84,62


In [6]:
# Second attempt: impute each kind of variable separately.
# Numeric columns: NaN is replaced using the 'mean' strategy.
imp_nr = SimpleImputer(
    missing_values=np.nan,
    strategy='mean',
    copy=True,
)
# Symbolic columns: the empty string is the value treated as missing and it
# is replaced using the 'most_frequent' strategy.
imp_sb = SimpleImputer(
    missing_values='',
    strategy='most_frequent',
    copy=True,
)

# Both frames are rebuilt from plain arrays, so they carry a default 0..n-1
# index instead of the original date index.
df_nr = pd.DataFrame(
    imp_nr.fit_transform(cols_nr),
    columns=cols_nr.columns,
)
df_sb = pd.DataFrame(
    imp_sb.fit_transform(cols_sb),
    columns=cols_sb.columns,
)

# Put the numeric and symbolic parts back side by side.
data = df_nr.join(df_sb, how='right')
data.describe(include='all')

# The 'count' row now shows the same number for every column: no missing
# values are left.

Unnamed: 0,pH,Oxygen,Chloride,Nitrates,Ammonium,Orthophosphate,Phosphate,Chlorophyll,fluid_velocity,river_depth,season
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200,200,200
unique,,,,,,,,,3,3,4
top,,,,,,,,,high,medium,winter
freq,,,,,,,,,84,84,62
mean,8.011759,9.117778,43.636842,3.282828,154.447475,83.325909,111.550808,13.541011,,,
std,0.596797,2.379206,45.639169,3.757432,179.104344,116.194833,101.699153,19.822204,,,
min,5.6,1.5,0.22,0.05,5.0,1.0,0.9,0.0,,,
25%,7.7,7.775,11.0675,1.3125,36.875,16.0,20.185,2.0,,,
50%,8.055,9.8,36.0,2.74,101.125,43.085,85.2,5.55,,,
75%,8.4,10.8,56.9775,4.4225,199.85,102.0825,179.14,17.2,,,


In [7]:
# Number of missing values in each column after imputation.
data.isna().sum()

pH                0
Oxygen            0
Chloride          0
Nitrates          0
Ammonium          0
Orthophosphate    0
Phosphate         0
Chlorophyll       0
fluid_velocity    0
river_depth       0
season            0
dtype: int64

# Normalization

In [8]:
from sklearn.preprocessing import Normalizer

In [9]:
# Fit scikit-learn's Normalizer on the imputed numeric columns.
# NOTE(review): Normalizer rescales each row (sample) to unit norm, not each
# column (variable) — confirm that per-sample scaling is what is wanted here.
transf = Normalizer().fit(df_nr)
# Overwrites df_nr with its normalized version, keeping the column names.
df_nr = pd.DataFrame(transf.transform(df_nr, copy=True), columns= df_nr.columns)
# Re-attach the symbolic columns to the normalized numeric ones.
norm_data = df_nr.join(df_sb, how='right')
norm_data.describe(include='all')

Unnamed: 0,pH,Oxygen,Chloride,Nitrates,Ammonium,Orthophosphate,Phosphate,Chlorophyll,fluid_velocity,river_depth,season
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200,200,200
unique,,,,,,,,,3,3,4
top,,,,,,,,,high,medium,winter
freq,,,,,,,,,84,84,62
mean,0.082241,0.104437,0.221043,0.022083,0.587137,0.299595,0.490879,0.076935,,,
std,0.097797,0.132541,0.199691,0.035132,0.247992,0.210457,0.258793,0.10895,,,
min,0.007915,0.003339,0.006633,0.00029,0.087508,0.01423,0.002641,0.0,,,
25%,0.024631,0.022504,0.091531,0.008413,0.410054,0.14026,0.286563,0.014384,,,
50%,0.043089,0.042227,0.149779,0.01434,0.586529,0.268944,0.538332,0.030744,,,
75%,0.093163,0.124953,0.265252,0.025148,0.775696,0.415019,0.710805,0.095915,,,


# Dummification

#### Replace each symbolic variable (e.g. one with the values 'high', 'medium', 'low') with one binary indicator variable per value (v_high, v_medium, v_low)

In [10]:
from sklearn.preprocessing import OneHotEncoder

def dummify(df, cols_to_dummify):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Each column named in ``cols_to_dummify`` is removed and replaced by one
    0.0/1.0 indicator column per distinct value (per declared category for
    columns of categorical dtype), named ``<column>_<value>``. The indicator
    columns are appended after the untouched columns, in the order in which
    the source columns are listed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    cols_to_dummify : iterable of str
        Names of the columns of ``df`` to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded frame, carrying the same index as ``df``. When
        ``cols_to_dummify`` is empty, ``df`` itself is returned.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    columns = list(cols_to_dummify)
    if not columns:
        # Nothing to encode: hand the frame back untouched.
        return df

    # Encode the frame that was passed in. The previous implementation read
    # the notebook-level ``data`` variable instead of ``df`` and attached the
    # indicator columns through a fresh 0..n-1 index, which misaligned rows
    # for any frame with a different index. get_dummies works on ``df``
    # directly and keeps its index; dtype=float keeps the 0.0/1.0 values.
    return pd.get_dummies(df, columns=columns, dtype=float)
  
# One-hot encode every symbolic column of the imputed data set.
df = dummify(data, cols_sb.columns)
df.describe(include='all')

Unnamed: 0,pH,Oxygen,Chloride,Nitrates,Ammonium,Orthophosphate,Phosphate,Chlorophyll,fluid_velocity_high,fluid_velocity_low,fluid_velocity_medium,river_depth_high,river_depth_low,river_depth_medium,season_autumn,season_spring,season_summer,season_winter
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,8.011759,9.117778,43.636842,3.282828,154.447475,83.325909,111.550808,13.541011,0.42,0.165,0.415,0.225,0.355,0.42,0.2,0.265,0.225,0.31
std,0.596797,2.379206,45.639169,3.757432,179.104344,116.194833,101.699153,19.822204,0.494797,0.372112,0.493958,0.41863,0.479714,0.494797,0.401004,0.442441,0.41863,0.463654
min,5.6,1.5,0.22,0.05,5.0,1.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.7,7.775,11.0675,1.3125,36.875,16.0,20.185,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.055,9.8,36.0,2.74,101.125,43.085,85.2,5.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.4,10.8,56.9775,4.4225,199.85,102.0825,179.14,17.2,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
max,9.7,13.4,391.5,45.65,931.83,771.6,558.75,110.46,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Balancing

In [11]:
import matplotlib.pyplot as plt

# Load the unbalanced data set and count the records of each 'Outcome' class.
unbal = pd.read_csv('../../data/unbalanced.csv', sep=',', decimal='.')
target_count = unbal['Outcome'].value_counts()

plt.figure()
plt.title('Class balance')
plt.bar(target_count.index, target_count.values)
plt.show()

# Position of the minority class inside target_count. The other class is
# taken to be at position 1 - ind_min_class, i.e. a two-class target is
# assumed.
min_class = target_count.idxmin()
ind_min_class = target_count.index.get_loc(min_class)

# ind_min_class is a position, so the counts are read with .iloc. Plain
# target_count[...] looks integers up as labels when the class labels are
# themselves integers, which can report the wrong class.
minority_count = target_count.iloc[ind_min_class]
majority_count = target_count.iloc[1 - ind_min_class]

print('Minority class:', minority_count)
print('Majority class:', majority_count)
print('Proportion:', round(minority_count / majority_count, 2), ': 1')

<Figure size 640x480 with 1 Axes>

Minority class: 12
Majority class: 844
Proportion: 0.01 : 1


In [12]:

from imblearn.over_sampling import SMOTE, RandomOverSampler

RANDOM_STATE = 42

# Class sizes obtained with each balancing strategy, always ordered
# [minority, majority].
values = {'Original': [target_count.values[ind_min_class], target_count.values[1-ind_min_class]]}

df_class_min = unbal[unbal['Outcome'] == min_class]
df_class_max = unbal[unbal['Outcome'] != min_class]

# Undersampling: keep only as many majority records as there are minority
# ones. The seed makes the drawn sample reproducible.
df_under = df_class_max.sample(len(df_class_min), random_state=RANDOM_STATE)
values['UnderSample'] = [target_count.values[ind_min_class], len(df_under)]

# Oversampling: draw minority records with replacement until there are as
# many as majority ones.
df_over = df_class_min.sample(len(df_class_max), replace=True, random_state=RANDOM_STATE)
values['OverSample'] = [len(df_over), target_count.values[1-ind_min_class]]

# SMOTE resampling of the minority class. sampling_strategy / fit_resample
# are the current imbalanced-learn names of the former ratio / fit_sample.
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
# Read the target without popping it: unbal keeps its 'Outcome' column, so
# this cell can be re-run without a KeyError.
y = unbal['Outcome'].values
X = unbal.drop(columns='Outcome').values
_, smote_y = smote.fit_resample(X, y)
smote_target_count = pd.Series(smote_y).value_counts()
values['SMOTE'] = [smote_target_count.values[ind_min_class], smote_target_count.values[1-ind_min_class]]

plt.figure()
# NOTE(review): `func` is not imported anywhere in the visible notebook —
# TODO import the helper module that provides multiple_bar_chart.
func.multiple_bar_chart(plt.gca(), 
                        [target_count.index[ind_min_class], target_count.index[1-ind_min_class]], 
                        values, 'Target', 'frequency', 'Class balance')
plt.show()

ModuleNotFoundError: No module named 'imblearn'