In [3]:
# importing data
import pandas as pd
import numpy as np
df = pd.read_csv('hypertension.csv')
df.head()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
0,1,39,0,0.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   male           4240 non-null   int64  
 1   age            4240 non-null   int64  
 2   currentSmoker  4240 non-null   int64  
 3   cigsPerDay     4211 non-null   float64
 4   BPMeds         4187 non-null   float64
 5   diabetes       4240 non-null   int64  
 6   totChol        4190 non-null   float64
 7   sysBP          4240 non-null   float64
 8   diaBP          4240 non-null   float64
 9   BMI            4221 non-null   float64
 10  heartRate      4239 non-null   float64
 11  glucose        3852 non-null   float64
 12  Risk           4240 non-null   int64  
dtypes: float64(8), int64(5)
memory usage: 430.8 KB


In [5]:
df.describe()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
count,4240.0,4240.0,4240.0,4211.0,4187.0,4240.0,4190.0,4240.0,4240.0,4221.0,4239.0,3852.0,4240.0
mean,0.429245,49.580189,0.494104,9.005937,0.029615,0.025708,236.699523,132.354599,82.897759,25.800801,75.878981,81.963655,0.310613
std,0.495027,8.572942,0.500024,11.922462,0.169544,0.15828,44.591284,22.0333,11.910394,4.07984,12.025348,23.954335,0.462799
min,0.0,32.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,1.0,20.0,0.0,0.0,263.0,144.0,90.0,28.04,83.0,87.0,1.0
max,1.0,70.0,1.0,70.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [6]:
# checking for null values
df.isna().sum()

male               0
age                0
currentSmoker      0
cigsPerDay        29
BPMeds            53
diabetes           0
totChol           50
sysBP              0
diaBP              0
BMI               19
heartRate          1
glucose          388
Risk               0
dtype: int64

In [7]:
# dropping null values
df.dropna(inplace = True)
df.reset_index(drop=True, inplace=True)

In [8]:
# checking for duplicate values
df.duplicated().sum()

0

- We can see that there are a few features with very high correlation so we drop them.

In [9]:
df = df.drop(['diabetes', 'currentSmoker'], axis=1)

In [10]:
num_cols = ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
cat_cols = ['male', 'BPMeds']


In [11]:
# splitting data
y = df['Risk']
X = df.drop('Risk', axis = 1)


In [12]:
X

Unnamed: 0,male,age,cigsPerDay,BPMeds,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1,39,0.0,0.0,195.0,106.0,70.0,26.97,80.0,77.0
1,0,46,0.0,0.0,250.0,121.0,81.0,28.73,95.0,76.0
2,1,48,20.0,0.0,245.0,127.5,80.0,25.34,75.0,70.0
3,0,61,30.0,0.0,225.0,150.0,95.0,28.58,65.0,103.0
4,0,46,23.0,0.0,285.0,130.0,84.0,23.10,85.0,85.0
...,...,...,...,...,...,...,...,...,...,...
3746,1,50,1.0,0.0,313.0,179.0,92.0,25.97,66.0,86.0
3747,1,51,43.0,0.0,207.0,126.5,80.0,19.71,65.0,68.0
3748,0,52,0.0,0.0,269.0,133.5,83.0,21.47,80.0,107.0
3749,1,40,0.0,0.0,185.0,141.0,98.0,25.60,67.0,72.0


In [13]:
# checking for imbalanced dataset
y.value_counts()

Risk
0    2581
1    1170
Name: count, dtype: int64

In [14]:
# oversampling
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(random_state = 42)
X, y = oversampler.fit_resample(X,y)

# y.value_counts()

In [15]:
# splitting data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from imblearn.under_sampling import RandomUnderSampler

In [17]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [18]:
y_pred = rf.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy : ",accuracy_score(y_pred,y_test))
print('Classification report : ',classification_report(y_pred,y_test))

Accuracy :  0.9477250726040658
Classification report :                precision    recall  f1-score   support

           0       0.91      0.98      0.94       470
           1       0.98      0.92      0.95       563

    accuracy                           0.95      1033
   macro avg       0.95      0.95      0.95      1033
weighted avg       0.95      0.95      0.95      1033



In [20]:
import pickle
with open('hypertensionfull.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [21]:
def predict1(data):
    data = pd.DataFrame(data)

    prediction = rf.predict_proba(data)
    print(prediction)
    

In [22]:
# Dictionary with values in lists
data = {
    'male': 0,
    'age': 38,
    'cigsPerDay': 20,
    'BPMeds': 0,
    'totChol': 221,
    'sysBP': 140,
    'diaBP': 90,
    'BMI': 22.37,
    'heartRate': 64,
    'glucose': 72
}

predict1([data])

[[0.14 0.86]]


In [23]:
y2 = df['Risk']
X2 = df.drop(columns=['BPMeds','totChol','sysBP','diaBP','glucose','Risk'], axis = 1)
X2

Unnamed: 0,male,age,cigsPerDay,BMI,heartRate
0,1,39,0.0,26.97,80.0
1,0,46,0.0,28.73,95.0
2,1,48,20.0,25.34,75.0
3,0,61,30.0,28.58,65.0
4,0,46,23.0,23.10,85.0
...,...,...,...,...,...
3746,1,50,1.0,25.97,66.0
3747,1,51,43.0,19.71,65.0
3748,0,52,0.0,21.47,80.0
3749,1,40,0.0,25.60,67.0


In [24]:

oversampler = RandomOverSampler(random_state = 42)
X2, y2 = oversampler.fit_resample(X2,y2)


In [25]:

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [26]:
rf2 = RandomForestClassifier()
rf2.fit(X_train2,y_train2)

In [27]:
y_pred2 = rf2.predict(X_test2)

In [28]:
print("Accuracy : ",accuracy_score(y_pred2,y_test2))
print('Classification report : ',classification_report(y_pred2,y_test2))

Accuracy :  0.8325266214908035
Classification report :                precision    recall  f1-score   support

           0       0.77      0.88      0.82       443
           1       0.90      0.80      0.84       590

    accuracy                           0.83      1033
   macro avg       0.83      0.84      0.83      1033
weighted avg       0.84      0.83      0.83      1033



In [34]:
def predict2(data):
    data = pd.DataFrame(data)

    prediction = rf2.predict(data)
    print(prediction)
    

In [37]:
data = {
    'male': 1,
    'age': 61,
    'cigsPerDay': 30,
    'BMI': 26.97,
    'heartRate': 60,
}

predict2([data])

[1]


In [33]:
with open('hypertensionhalf.pkl', 'wb') as file:
    pickle.dump(rf2, file)