- Import required libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

- Load the processed dataset

In [2]:
# Load the processed dataset; the path is relative to the notebook's
# working directory.
bp_data = pd.read_csv(filepath_or_buffer='datasets/bp_data_processed.csv')

# Preview the first five rows.
bp_data.head(n=5)

Unnamed: 0,age,education,sex,is_smoking,cigsPerDay,prevalentStroke,diabetes,totChol,diaBP,BMI,heartRate,glucose
0,36,4,1,0,0,0,0,212.0,98.0,29.77,72.0,75.0
1,46,1,0,1,10,0,0,250.0,71.0,20.35,88.0,94.0
2,50,1,1,1,20,0,0,233.0,88.0,28.26,68.0,94.0
3,64,1,0,1,30,0,0,241.0,85.0,26.42,70.0,77.0
4,61,3,0,0,0,0,0,272.0,121.0,32.8,85.0,65.0


In [3]:
# Target: the 'diaBP' column. Features: every remaining column.
y = bp_data.loc[:, 'diaBP']

X = bp_data.drop(columns=['diaBP'])

In [4]:
# Preview the feature matrix to confirm that 'diaBP' is no longer among the columns.
X.head()

Unnamed: 0,age,education,sex,is_smoking,cigsPerDay,prevalentStroke,diabetes,totChol,BMI,heartRate,glucose
0,36,4,1,0,0,0,0,212.0,29.77,72.0,75.0
1,46,1,0,1,10,0,0,250.0,20.35,88.0,94.0
2,50,1,1,1,20,0,0,233.0,28.26,68.0,94.0
3,64,1,0,1,30,0,0,241.0,26.42,70.0,77.0
4,61,3,0,0,0,0,0,272.0,32.8,85.0,65.0


- Create the train and test datasets from X and y

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=33,
    test_size=0.2,
)

# (rows, columns) of the training and testing feature frames.
(X_train.shape, X_test.shape)

((2341, 11), (586, 11))

### Scaling

In [6]:
# Summary statistics of the continuous features before scaling, shown with
# one row per feature.
X_train.loc[:, ['age', 'cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,2341.0,49.664673,8.620532,32.0,42.0,49.0,57.0,70.0
cigsPerDay,2341.0,8.999146,11.860442,0.0,0.0,0.0,20.0,70.0
totChol,2341.0,236.818881,44.038848,113.0,207.0,234.0,263.0,600.0
BMI,2341.0,25.726467,4.074252,15.96,22.99,25.33,27.97,51.28
heartRate,2341.0,75.860316,11.958713,45.0,68.0,75.0,83.0,140.0
glucose,2341.0,82.115762,24.485506,40.0,71.0,78.0,87.0,394.0


- Create numerical dataframes from the train and test feature data
- Also create categorical dataframes from the train and test feature data

In [7]:
# Continuous columns only; these are the ones that get standardised.
X_train_numerical = X_train.loc[
    :, ['age', 'cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']
]

X_test_numerical = X_test.loc[
    :, ['age', 'cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']
]

In [8]:
# Remaining feature columns, kept apart so they bypass the scaler.
X_train_categorical = X_train.loc[
    :, ['education', 'sex', 'is_smoking', 'prevalentStroke', 'diabetes']
]

X_test_categorical = X_test.loc[
    :, ['education', 'sex', 'is_smoking', 'prevalentStroke', 'diabetes']
]

- Scale the train and test numerical dataframes with a standard scaler (fitted on the training data only)

In [9]:
# Fit on the training split only, then apply the same fitted scaler to both
# splits so no test-set statistics leak into the transformation.
scaler = StandardScaler()
scaler.fit(X_train_numerical)

# The scaler output is wrapped back into a DataFrame (default 0..n-1 index)
# with the original column names.
X_train_numerical = pd.DataFrame(
    data=scaler.transform(X_train_numerical),
    columns=X_train_numerical.columns,
)

X_test_numerical = pd.DataFrame(
    data=scaler.transform(X_test_numerical),
    columns=X_test_numerical.columns,
)

In [10]:
# Sanity check: the scaler was fitted on this split, so every column should
# show a mean of ~0 and a standard deviation of ~1.
X_train_numerical.describe()

Unnamed: 0,age,cigsPerDay,totChol,BMI,heartRate,glucose
count,2341.0,2341.0,2341.0,2341.0,2341.0,2341.0
mean,-2.154999e-16,-4.7045760000000007e-17,7.739786e-17,-4.158238e-16,4.143062e-16,-1.3658450000000001e-17
std,1.000214,1.000214,1.000214,1.000214,1.000214,1.000214
min,-2.049577,-0.7589151,-2.812184,-2.397631,-2.581123,-1.720396
25%,-0.8893083,-0.7589151,-0.6772487,-0.6717925,-0.6574282,-0.4540701
50%,-0.07711997,-0.7589151,-0.06402263,-0.09733125,-0.07195589,-0.1681257
75%,0.8510952,0.927723,0.5946276,0.5507789,0.5971553,0.1995173
max,2.359445,5.144318,8.248597,6.273297,5.364573,12.74023


In [11]:
# The test split was transformed with the scaler fitted on the training
# split, so its means/stds are only approximately 0/1.
X_test_numerical.describe()

Unnamed: 0,age,cigsPerDay,totChol,BMI,heartRate,glucose
count,586.0,586.0,586.0,586.0,586.0,586.0
mean,-0.091178,0.047851,0.035197,0.08654,0.011683,-0.037004
std,0.984081,1.009541,1.064206,1.065338,1.006225,0.920603
min,-1.93355,-0.758915,-2.267094,-1.968013,-2.413845,-1.434451
25%,-0.889308,-0.758915,-0.739707,-0.629444,-0.657428,-0.484707
50%,-0.193147,-0.674583,-0.029955,-0.027365,-0.071956,-0.208975
75%,0.619041,0.927723,0.662764,0.609698,0.346239,0.158668
max,2.243418,3.45768,5.159755,7.628436,5.615489,11.678146


- Join the scaled numerical dataframes with the categorical dataframes to get the final train and test feature data

In [12]:
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows
# by position. Rebinding the non-mutating result (instead of inplace=True)
# avoids modifying a frame that was sliced out of X_train.
X_train_numerical = X_train_numerical.reset_index(drop=True)
X_train_categorical = X_train_categorical.reset_index(drop=True)

X_train = pd.concat([X_train_numerical, X_train_categorical], axis=1)

# concat aligns on the index; mismatched indexes would add extra rows.
assert len(X_train) == len(y_train), 'X_train rows no longer match y_train'

X_train.head()

Unnamed: 0,age,cigsPerDay,totChol,BMI,heartRate,glucose,education,sex,is_smoking,prevalentStroke,diabetes
0,-1.005335,-0.758915,-0.699961,-0.85346,0.764433,-0.127276,4,0,0,0,0
1,1.663284,0.506063,1.048869,-1.118596,-0.322873,-0.576618,2,0,1,0,0
2,0.503015,-0.758915,0.821748,0.81837,2.855406,-0.372372,1,0,0,0,0
3,-1.353416,-0.758915,0.049538,1.905918,-0.071956,-0.086427,2,0,0,0,0
4,0.503015,-0.505919,-1.449459,0.118705,-0.071956,-0.576618,1,1,1,0,0


In [13]:
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows
# by position. Rebinding the non-mutating result (instead of inplace=True)
# avoids modifying a frame that was sliced out of X_test.
X_test_numerical = X_test_numerical.reset_index(drop=True)
X_test_categorical = X_test_categorical.reset_index(drop=True)

X_test = pd.concat([X_test_numerical, X_test_categorical], axis=1)

# concat aligns on the index; mismatched indexes would add extra rows.
assert len(X_test) == len(y_test), 'X_test rows no longer match y_test'

X_test.head()

Unnamed: 0,age,cigsPerDay,totChol,BMI,heartRate,glucose,education,sex,is_smoking,prevalentStroke,diabetes
0,-1.121362,2.867357,2.002776,-0.15625,0.178961,0.485462,1,1,1,0,0
1,-1.005335,1.771042,-0.881657,-1.035127,0.764433,0.07697,1,1,1,0,0
2,0.154934,-0.758915,0.571915,1.243078,-0.071956,-0.208975,2,0,0,0,0
3,-1.005335,-0.758915,0.84446,-0.075237,0.346239,-0.249824,2,1,0,0,0
4,-1.353416,7.2e-05,-0.381992,-0.254449,2.855406,0.199517,1,0,1,0,0


### Build Model

In [34]:
def build_model(regressor, X_train, y_train, X_test, y_test):
    """Fit ``regressor`` on the training split and score it on both splits.

    Parameters
    ----------
    regressor : estimator object
        Must provide ``fit(X, y)`` returning the fitted estimator and
        ``score(X, y)``. The object passed in is fitted by this call.
    X_train, y_train
        Training features and target.
    X_test, y_test
        Held-out features and target.

    Returns
    -------
    dict
        ``{'Training Score': ..., 'Testing Score ': ...}`` holding the values
        returned by ``score`` for each split. The trailing space in
        ``'Testing Score '`` is part of the key.
    """
    model = regressor.fit(X_train, y_train)

    # No separate predict() call: the predictions were never used, and
    # dropping it saves one inference pass over the test split.
    return {'Training Score': model.score(X_train, y_train),
            'Testing Score ': model.score(X_test, y_test)}

In [35]:
def performance_metrics_report(report=None):
    """Print every model's scores, one banner-delimited section per model.

    Parameters
    ----------
    report : dict of dict, optional
        Maps a model name to a ``{metric name: value}`` mapping. When
        omitted, the notebook-level ``report_log`` dictionary is used, so
        existing zero-argument calls keep working.

    Returns
    -------
    None
        The report is written to standard output.
    """
    if report is None:
        # Resolved at call time: ``report_log`` only has to exist before the
        # first zero-argument call, not before this definition.
        report = report_log

    banner = '-' * 50
    for model_name, scores in report.items():
        print(banner)
        print('Regression Model -', model_name)
        print(banner)

        print()
        for metric, value in scores.items():
            print(metric, value)

- Linear Regression

In [36]:
# Collects one {metric name: value} dict per model, keyed by model name.
report_log = {}

In [37]:
# Fit and score a linear regression, then print the report so far.
report_log['Linear Regression'] = build_model(
    regressor=LinearRegression(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

performance_metrics_report()

--------------------------------------------------
Regression Model - Linear Regression
--------------------------------------------------

Training Score 0.20658019944281802
Testing Score  0.26247024419553355


In [38]:
# Fit and score a k-nearest-neighbours regressor with its default settings,
# then print the report so far.
report_log['KNeighbors'] = build_model(
    regressor=KNeighborsRegressor(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

performance_metrics_report()

--------------------------------------------------
Regression Model - Linear Regression
--------------------------------------------------

Training Score 0.20658019944281802
Testing Score  0.26247024419553355
--------------------------------------------------
Regression Model - KNeighbors
--------------------------------------------------

Training Score 0.3658631969080023
Testing Score  0.13526359205932392


In [39]:
# Fit and score a support-vector regressor with its default settings, then
# print the report so far.
report_log['SVR'] = build_model(
    regressor=SVR(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

performance_metrics_report()

--------------------------------------------------
Regression Model - Linear Regression
--------------------------------------------------

Training Score 0.20658019944281802
Testing Score  0.26247024419553355
--------------------------------------------------
Regression Model - KNeighbors
--------------------------------------------------

Training Score 0.3658631969080023
Testing Score  0.13526359205932392
--------------------------------------------------
Regression Model - SVR
--------------------------------------------------

Training Score 0.20313810618657524
Testing Score  0.23163118416071815


In [40]:
# Seed the forest so its scores are reproducible across runs; 33 matches the
# seed already used for the train/test split.
report_log['Random Forest'] = build_model(RandomForestRegressor(random_state=33),
                                          X_train, y_train, X_test, y_test)

performance_metrics_report()

--------------------------------------------------
Regression Model - Linear Regression
--------------------------------------------------

Training Score 0.20658019944281802
Testing Score  0.26247024419553355
--------------------------------------------------
Regression Model - KNeighbors
--------------------------------------------------

Training Score 0.3658631969080023
Testing Score  0.13526359205932392
--------------------------------------------------
Regression Model - SVR
--------------------------------------------------

Training Score 0.20313810618657524
Testing Score  0.23163118416071815
--------------------------------------------------
Regression Model - Random Forest
--------------------------------------------------

Training Score 0.878718781273387
Testing Score  0.20498453717003629
