In [1]:
import os

import numpy as np
import pandas as pd
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [2]:
# Columns read from the training CSV: the model inputs followed by the target.
# "score" must stay last, because the feature list is derived with cols[:-1].
cols = [
    "age",
    "sex",
    "cholesterol",
    "max_press",
    "smokes",
    "score",
]

In [3]:
# Location of the training CSV.
# The WV_TRAIN_CSV environment variable overrides the default, so the notebook
# can run on machines that do not share the original author's directory layout.
# When the variable is unset, the original path is used unchanged.
path = os.environ.get(
    "WV_TRAIN_CSV",
    "/home/marco/git/webvalley/wv-score-api/train.csv",
)

In [4]:
# Load only the columns listed in `cols`; any other CSV column is ignored.
all_data = pd.read_csv(
    filepath_or_buffer=path,
    usecols=cols,
)

In [5]:
# Preview the first five rows to sanity-check the load.
all_data.head(n=5)

Unnamed: 0,age,sex,cholesterol,max_press,smokes,score
0,50,M,6,120,True,1.74
1,90,M,5,165,True,62.5
2,47,F,9,150,False,0.48
3,83,M,10,147,False,36.08
4,68,M,9,166,True,32.76


In [6]:
# Inspect the column dtypes to see which columns are non-numeric and need
# encoding before modelling.
all_data.dtypes

age              int64
sex             object
cholesterol      int64
max_press        int64
smokes            bool
score          float64
dtype: object

In [7]:
# Encoder instance used by the next cell to map column values to integer codes.
le = preprocessing.LabelEncoder()

In [8]:
# Replace the non-numeric columns with integer codes.
#
# `sex` gets its own encoder so that its learned mapping (sex_encoder.classes_,
# sex_encoder.inverse_transform) stays available. Refitting a single shared
# encoder on the next column would overwrite that mapping.
sex_encoder = preprocessing.LabelEncoder()
all_data["sex"] = sex_encoder.fit_transform(all_data["sex"])

# `le` still ends up fitted on `smokes`, so any later cell that relies on the
# state of `le` sees exactly what it saw before.
all_data["smokes"] = le.fit_transform(all_data["smokes"])

In [9]:
# The target is the "score" column.
target = all_data["score"]

# The features are every entry of `cols` except the last one.
train_cols = cols[:-1]
data = all_data[train_cols]

In [10]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
x_tr, x_ts, y_tr, y_ts = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=10,
)

In [11]:
# No component limit is requested, so PCA keeps every component it can compute.
pca = PCA(n_components=None)

In [12]:
# Fit the PCA on the training features and project them in one pass.
# fit_transform both fits and projects, so no separate fit()/transform() pair
# is needed and the decomposition runs only once.
# NOTE(review): the features go into PCA unscaled even though MinMaxScaler is
# imported; confirm whether scaling was intended before interpreting the
# components.
z_tr = pca.fit_transform(x_tr)
z_tr.shape

(700, 5)

In [13]:
# Show the share of total variance captured by each principal component,
# in component order.
print(pca.explained_variance_ratio_)

[7.40618624e-01 2.53310323e-01 5.64806421e-03 2.12579240e-04
 2.10408685e-04]


In [14]:
# Boolean mask over the training targets: True where 5 <= score <= 9
# (both bounds inclusive).
medium_risk = y_tr.between(5, 9)

In [15]:
# Scatter the training rows on the first two principal components,
# coloured by score band.
# NOTE(review): newer Bokeh releases use width/height in place of
# plot_width/plot_height and scatter() in place of circle(); revisit if Bokeh
# is upgraded.
p = figure(plot_width=400, plot_height=400, title="PCA of Train data")

# One glyph layer per band, drawn in this order: score <= 1 in green,
# 5 <= score <= 9 in orange, score >= 15 in dark red. Rows whose score falls
# between the bands are not plotted.
risk_bands = (
    (y_tr <= 1, "green"),
    (medium_risk, "orange"),
    (y_tr >= 15, "darkred"),
)
for band_mask, band_color in risk_bands:
    p.circle(
        z_tr[band_mask, 0],
        z_tr[band_mask, 1],
        line_color=band_color,
        fill_color=band_color,
    )

p.xaxis.axis_label = "PC1"
p.yaxis.axis_label = "PC2"
show(p)