In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score
output_notebook()

In [2]:
np.random.seed(42)

In [3]:
cols = [
    "age", "sex", "cholesterol", "max_press", "smokes", "score"
]

In [4]:
path = "/home/marco/git/webvalley/wv-score-api/train.csv"

In [5]:
all_data = pd.read_csv(path, usecols=cols)

In [6]:
all_data.head()

Unnamed: 0,age,sex,cholesterol,max_press,smokes,score
0,50,M,6,120,True,1.74
1,62,F,5,103,False,0.8
2,75,F,10,163,False,19.62
3,61,F,2,134,True,1.86
4,74,F,3,159,False,6.82


In [7]:
all_data.tail()

Unnamed: 0,age,sex,cholesterol,max_press,smokes,score
9995,84,M,6,102,True,17.11
9996,91,M,3,165,True,53.13
9997,50,F,5,128,True,0.53
9998,87,M,7,119,True,31.74
9999,40,M,5,84,False,0.1


In [8]:
all_data.dtypes

age              int64
sex             object
cholesterol      int64
max_press        int64
smokes            bool
score          float64
dtype: object

In [9]:
le = preprocessing.LabelEncoder()

In [10]:
all_data["sex"] = le.fit_transform(all_data["sex"])
all_data["smokes"] = le.fit_transform(all_data["smokes"])

In [11]:
train_cols = cols[:-1]
data = all_data[train_cols]
target = all_data["score"]

In [12]:
x_tr, x_ts, y_tr, y_ts = train_test_split(data, target, test_size=0.30, random_state=10)

In [13]:
pca = PCA()

In [14]:
z_tr = pca.fit_transform(x_tr)
pca.fit(x_tr)
z_tr = pca.transform(x_tr)
z_tr.shape

(7000, 5)

In [15]:
print(pca.explained_variance_ratio_)

[7.30894437e-01 2.62964437e-01 5.71242710e-03 2.14493021e-04
 2.14206755e-04]


In [16]:
medium_risk = np.logical_and(y_tr>=5, y_tr<=9)

In [17]:
p = figure(plot_width=400, plot_height=400, title="PCA of Train data")
p.circle(z_tr[y_tr<=1, 0], z_tr[y_tr<=1, 1], line_color="green", fill_color="green")

p.circle(z_tr[medium_risk, 0], z_tr[medium_risk, 1], line_color="orange", fill_color="orange")

p.circle(z_tr[y_tr>=15, 0], z_tr[y_tr>=15, 1], line_color="darkred", fill_color="darkred")
p.xaxis.axis_label = "PC1"
p.yaxis.axis_label = "PC2"
show(p)

In [18]:
print('Training Features Shape:', x_tr.shape)
print('Training Labels Shape:', y_tr.shape)
print('Testing Features Shape:', x_ts.shape)
print('Testing Labels Shape:', y_ts.shape)

Training Features Shape: (7000, 5)
Training Labels Shape: (7000,)
Testing Features Shape: (3000, 5)
Testing Labels Shape: (3000,)


In [19]:
rf = RandomForestRegressor(n_estimators=100)

In [20]:
skf = KFold(n_splits=5, shuffle=True, random_state=42)

In [21]:
x_tr

Unnamed: 0,age,sex,cholesterol,max_press,smokes
5663,51,1,6,129,0
2840,41,0,5,159,1
4550,68,0,3,94,1
17,84,0,3,151,0
3442,57,0,3,113,0
6765,83,1,5,174,1
9799,63,0,2,93,1
5515,94,0,4,140,1
4479,60,1,8,144,1
7859,63,0,5,116,0


In [22]:
x_tr_np = x_tr.values
y_tr_np = y_tr.values

In [23]:
x_tr_np

array([[ 51,   1,   6, 129,   0],
       [ 41,   0,   5, 159,   1],
       [ 68,   0,   3,  94,   1],
       ...,
       [ 75,   0,  10,  92,   1],
       [ 45,   0,   9,  84,   1],
       [ 47,   0,   9, 163,   1]])

In [24]:
maes = []
r2s = []

i = 0

for (idx_tr, idx_ts) in skf.split(x_tr_np, y_tr_np):
    print(f"### Fold {i+1} ###")
    X_train, Y_train = x_tr_np[idx_tr], y_tr_np[idx_tr]
    X_test, Y_test = x_tr_np[idx_ts], y_tr_np[idx_ts]
    rf.fit(X_train, Y_train)
    Y_test_pred = rf.predict(X_test)
    mae = mean_absolute_error(Y_test, Y_test_pred)
    print("MAE:", mae)
    maes.append(mae)
    r2 = r2_score(Y_test, Y_test_pred)
    print("R2:", r2)
    r2s.append(r2)
    i += 1
    
    
print("\n\n\tAVGS:")
print("MAE:", sum(maes)/len(maes))
print("R2S:", sum(r2s)/len(r2s))

### Fold 1 ###
MAE: 0.889813642857142
R2: 0.9921841371579513
### Fold 2 ###
MAE: 0.9063400714285705
R2: 0.9926017737313149
### Fold 3 ###
MAE: 0.9107578571428573
R2: 0.9926898670598153
### Fold 4 ###
MAE: 0.8980645714285715
R2: 0.9928757892087474
### Fold 5 ###
MAE: 0.9564479999999996
R2: 0.9930651030434156


	AVGS:
MAE: 0.9122848285714283
R2S: 0.9926833340402489


# Let's try to mess up a bit to discover a maximum value of MAE

In [25]:
y_tr_np

array([ 1.16,  0.21,  1.96, ..., 10.32,  0.19,  1.21])

In [26]:
np.random.shuffle(y_tr_np)

In [27]:
y_tr_np

array([19.09,  7.6 ,  1.86, ..., 21.54,  3.36,  2.31])

In [28]:
maes = []
r2s = []

i = 0

for (idx_tr, idx_ts) in skf.split(x_tr_np, y_tr_np):
    print(f"### Fold {i+1} ###")
    X_train, Y_train = x_tr_np[idx_tr], y_tr_np[idx_tr]
    X_test, Y_test = x_tr_np[idx_ts], y_tr_np[idx_ts]
    rf.fit(X_train, Y_train)
    Y_test_pred = rf.predict(X_test)
    mae = mean_absolute_error(Y_test, Y_test_pred)
    print("MAE:", mae)
    maes.append(mae)
    r2 = r2_score(Y_test, Y_test_pred)
    print("R2:", r2)
    r2s.append(r2)
    i += 1
    
    
print("\n\n\tAVGS:")
print("MAE:", sum(maes)/len(maes))
print("R2S:", sum(r2s)/len(r2s))

### Fold 1 ###
MAE: 16.60734868319161
R2: -0.11798780316386925
### Fold 2 ###
MAE: 16.22397109914966
R2: -0.14092402106992719
### Fold 3 ###
MAE: 15.566486894387754
R2: -0.16996203751951522
### Fold 4 ###
MAE: 16.964435725595237
R2: -0.16206238871958112
### Fold 5 ###
MAE: 16.24197780833333
R2: -0.1459561166873451


	AVGS:
MAE: 16.32084404213152
R2S: -0.14737847343204757
