In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('Data.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.shape


(3857, 26)

In [3]:
df.head()

Unnamed: 0,record,duration,creatinines,bloodsugarpp,hba1c,cholesterols,cholesterolhdl,cholesterolldl,triglycerides,cholesterolvldl,...,familyhistory,smoking,exercise,polycysticovary,vascularissue,heartissue,centralnervoussystemissue,diabeticretinopathyissue,kidneyissue,nerveissue
0,1,5,2.2,147,6,162,43,132,163,45,...,N,Y,N,N,N,Y,N,N,N,N
1,2,14,1.8,163,8,229,36,185,174,62,...,N,N,N,Y,N,N,N,N,Y,N
2,3,11,1.6,179,8,282,48,173,151,58,...,N,N,N,X,N,N,N,N,N,N
3,4,12,0.8,171,8,241,56,139,155,25,...,Y,N,N,X,N,Y,N,N,N,Y
4,5,4,5.2,225,8,333,68,145,117,22,...,Y,Y,N,X,N,Y,N,N,N,N


Stage: Cleanup and EDA

In [4]:
df.isnull().sum()

record                       0
duration                     0
creatinines                  0
bloodsugarpp                 0
hba1c                        0
cholesterols                 0
cholesterolhdl               0
cholesterolldl               0
triglycerides                0
cholesterolvldl              0
age                          0
bmi                          0
diastolicbp                  0
systolicbp                   0
gestation                    0
sex                          0
familyhistory                0
smoking                      0
exercise                     0
polycysticovary              0
vascularissue                0
heartissue                   0
centralnervoussystemissue    0
diabeticretinopathyissue     0
kidneyissue                  0
nerveissue                   0
dtype: int64

No NaN values seen!

In [5]:
df.drop(['record'],inplace=True,axis=1)
df.heartissue = (df.heartissue == 'Y').astype(int)


See Distribution

In [6]:
print(df['heartissue'].describe())


count    3857.000000
mean        0.304900
std         0.460425
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: heartissue, dtype: float64


Feature importnace

In [7]:
df.columns


Index(['duration', 'creatinines', 'bloodsugarpp', 'hba1c', 'cholesterols',
       'cholesterolhdl', 'cholesterolldl', 'triglycerides', 'cholesterolvldl',
       'age', 'bmi', 'diastolicbp', 'systolicbp', 'gestation', 'sex',
       'familyhistory', 'smoking', 'exercise', 'polycysticovary',
       'vascularissue', 'heartissue', 'centralnervoussystemissue',
       'diabeticretinopathyissue', 'kidneyissue', 'nerveissue'],
      dtype='object')

In [8]:
categorical_col = df.select_dtypes(include=['object']).columns
print(categorical_col)

Index(['sex', 'familyhistory', 'smoking', 'exercise', 'polycysticovary',
       'vascularissue', 'centralnervoussystemissue',
       'diabeticretinopathyissue', 'kidneyissue', 'nerveissue'],
      dtype='object')


In [9]:
numerical_col = df.select_dtypes(exclude=['object']).columns
print(numerical_col)

Index(['duration', 'creatinines', 'bloodsugarpp', 'hba1c', 'cholesterols',
       'cholesterolhdl', 'cholesterolldl', 'triglycerides', 'cholesterolvldl',
       'age', 'bmi', 'diastolicbp', 'systolicbp', 'gestation', 'heartissue'],
      dtype='object')


In [10]:

from sklearn.metrics import mutual_info_score
def mutual_info_heartissue_score(series):
    return mutual_info_score(series, df.heartissue)
mi = df[categorical_col].apply(mutual_info_heartissue_score)
mi.sort_values(ascending=False).round(6)

smoking                      0.000496
kidneyissue                  0.000234
familyhistory                0.000176
vascularissue                0.000115
centralnervoussystemissue    0.000076
diabeticretinopathyissue     0.000075
polycysticovary              0.000044
sex                          0.000032
exercise                     0.000001
nerveissue                   0.000000
dtype: float64

In [11]:
numerical_col = numerical_col.drop("heartissue")
df[numerical_col].corrwith(df.heartissue).abs().sort_values(ascending=False)


cholesterols       0.029380
triglycerides      0.012522
gestation          0.011503
age                0.011084
bloodsugarpp       0.008104
creatinines        0.007298
hba1c              0.006131
cholesterolhdl     0.005654
systolicbp         0.005278
bmi                0.005112
cholesterolldl     0.004913
cholesterolvldl    0.004212
duration           0.000365
diastolicbp        0.000278
dtype: float64

In [12]:
#joining_year = ['joining_year']
#df[joining_year].corrwith(df.stars).abs()

Stage: Model Training 

Split data into train validation and test

In [13]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df_train), len(df_val), len(df_test)

(2313, 772, 772)

In [14]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.heartissue.values
y_val = df_val.heartissue.values
y_test = df_test.heartissue.values

del df_train['heartissue']
del df_val['heartissue']
del df_test['heartissue']

One-hot encoding

In [15]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)




#train_dict = df_train[categorical_col + numerical_col].to_dict(orient='records')
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_val.to_dict(orient='records')
X_test = dv.transform(test_dict)
#dv.feature_names_

Training logistic regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
model = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
#model = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
#y_pred = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)
#heart_issue_decision = (y_pred >= 0.5)
#(y_val == heart_issue_decision).mean()
print(accuracy_score(y_val, y_pred))
y_pred = model.predict_proba(X_val)[:, 1]
heartissue_presence = (y_pred >= 0.5)
print((y_val == heartissue_presence).mean())
ras = roc_auc_score(y_val, y_pred)
print(ras)



0.7098445595854922
0.7098445595854922
0.48317746350364965


In [17]:
def train(df_train, y_train):
    dicts = df_train.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    return dv, model

def predict(df, dv, model):
    dicts = df.to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict(X)

    return y_pred

In [18]:
dv, model = train(df_train, y_train)
y_pred =  predict(df_val, dv, model)
print(accuracy_score(y_val, y_pred))

0.7098445595854922


In [21]:
from sklearn.model_selection import KFold
n_splits = [3, 5, 10, 15, 20, 100, 200, 250]
for n_split in n_splits: 
    kfold = KFold(n_splits=n_split, shuffle=True, random_state=1)
    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.heartissue.values
        y_val = df_val.heartissue.values

        dv, model = train(df_train, y_train)
        y_pred = predict(df_val, dv, model)

    print(accuracy_score(y_val, y_pred))
    print(roc_auc_score(y_val, y_pred))

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [20]:
df['stars'].describe()

KeyError: 'stars'

Training random forest regressor


In [184]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False).round(3)

27.375

In [185]:
# scores = []
# for n in range(10, 201, 10):
#     rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
#     rf.fit(X_train, y_train)

#     y_pred = rf.predict(X_val)
#     rmse = mean_squared_error(y_val, y_pred, squared=False).round(3)
    
#     scores.append((n, rmse))

In [186]:
# print(scores)

In [187]:
scores = []
for d in [10, 5, 3, 1]:    
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False).round(3)
        
        scores.append((n, rmse))
        df_scores = pd.DataFrame(scores, columns=['n_estimators', 'rmse'])
    print(df_scores['rmse'].mean())

68.34585
68.72695
69.01266666666668
69.64236249999999


XGBoost

In [188]:
import xgboost as xgb
features = list(dv.get_feature_names_out())
print(len(features))
for feature in features:
    if "[" in feature or "]" in feature or "<" in feature:
        print(feature)
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

5567


In [189]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)

In [190]:
y_pred = model.predict(dval)
mean_squared_error(y_val, y_pred, squared=False).round(3)

69.05