In [1]:
import pandas as pd

In [2]:
heart = pd.read_csv('heart.csv')
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
heart.shape

(303, 14)

In [4]:
target = heart['target']
target.value_counts()

1    165
0    138
Name: target, dtype: int64

In [5]:
data = heart.drop('target', axis=1)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [6]:
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64

In [7]:
data['thal'].value_counts()

2    166
3    117
1     18
0      2
Name: thal, dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
dtypes: float64(1), int64(12)
memory usage: 30.9 KB


In [9]:
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [10]:
numerical_columns = ['age','trestbps','chol','thalach','oldpeak']
categorical_columns = ['sex','cp','fbs','restecg','exang','slope','ca','thal']

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_preprocessor = StandardScaler()
cat_preprocessor = OneHotEncoder(handle_unknown='ignore')

In [12]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
                     ('standard_scaler',num_preprocessor,numerical_columns),
                     ('one_hot_encoder',cat_preprocessor,categorical_columns)
])


In [13]:
from sklearn import set_config
set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model

In [14]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model,data,target,cv=10)
accuracy = scores.mean()
accuracy

0.8511827956989247

In [15]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

cat2_preprocessor = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
preprocessor2 = ColumnTransformer([
                     ('ordinal_encoder', cat2_preprocessor, categorical_columns)], remainder='passthrough')

model2 = Pipeline([
            ('preprocessor',preprocessor2),
            ('classifier',RandomForestClassifier(random_state=0))
])
scores2 = cross_val_score(model2, data, target, cv=10)
accuracy2 = scores2.mean()
accuracy2

0.8512903225806452

In [17]:
import numpy as np

n_estimators = [int(x)for x in np.linspace(start=100,stop=1200,num=12)]
# no of features to consider at every split
max_features = ['auto', 'sqrt']
# max number of levelse in tree
max_depth = [int(x) for x in np.linspace(5,30,num=6)]
# max_depth.append(None)
# min number of samples required to split a node
min_samples_split = [2,5,10,15,100]
# min number of samples required at each leaf node
min_samples_leaf = [1,2,5,10]

In [18]:
from sklearn.model_selection import RandomizedSearchCV
random_grid = {'classifier__n_estimators':n_estimators,
               'classifier__max_features':max_features,
               'classifier__max_depth':max_depth,
               'classifier__min_samples_split':min_samples_split,
               'classifier__min_samples_leaf':min_samples_leaf}
print(random_grid)

{'classifier__n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'classifier__max_features': ['auto', 'sqrt'], 'classifier__max_depth': [5, 10, 15, 20, 25, 30], 'classifier__min_samples_split': [2, 5, 10, 15, 100], 'classifier__min_samples_leaf': [1, 2, 5, 10]}


In [19]:
rf_random = RandomizedSearchCV(model2,param_distributions=random_grid,n_iter=20,cv=5,verbose=1)
cv_results = cross_val_score(rf_random, data, target, cv=10, n_jobs=2)
cv_results
accuracy3 = cv_results.mean()
accuracy3

0.8381720430107528

In [20]:
from sklearn.model_selection import train_test_split

data_train,data_test,target_train,target_test = train_test_split(data,target,test_size=0.2,random_state=0)

In [21]:
!pip install xgboost



In [48]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay,confusion_matrix
model3 = Pipeline([
             ('preprocesor', preprocessor2),
             ('xgb', XGBClassifier(verbosity=0,random_state=0,n_jobs=2,use_label_encoder=False))
              ])

model3.fit(data_train, target_train)
preds = model3.predict(data_test)
accuracy = accuracy_score(target_test, preds)
accuracy

0.7868852459016393

In [46]:
_ = ConfusionMatrixDisplay.from_estimator(model3,data_test,target_test)
plt.show()

AttributeError: type object 'ConfusionMatrixDisplay' has no attribute 'from_estimator'

In [58]:
params = {'xgb__n_estimators': range(100,1000,100),
          'xgb__max_depth':range(3,10),
          'xgb__learning_rate':[0.01,0.03,0.05,0.070,0.9,0.2],
          'xgb__subsample':[0.5,0.6,0.7,0.8,0.9]}
xg_rand = RandomizedSearchCV(model3,param_distributions=params,n_iter=20,cv=7,verbose=1)
xg_rand.fit(data_train,target_train)
preds = xg_rand.predict(data_test)
accuracy = accuracy_score(target_test, preds)
accuracy

Fitting 7 folds for each of 20 candidates, totalling 140 fits


0.819672131147541

In [59]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(target_test, preds))
print(confusion_matrix(target_test, preds))
print(classification_report(target_test, preds))

0.819672131147541
[[20  7]
 [ 4 30]]
              precision    recall  f1-score   support

           0       0.83      0.74      0.78        27
           1       0.81      0.88      0.85        34

    accuracy                           0.82        61
   macro avg       0.82      0.81      0.81        61
weighted avg       0.82      0.82      0.82        61



In [60]:
best = xg_rand.best_estimator_

In [65]:
import joblib
joblib.dump(best,'xgb_model.pkl')

['xgb_model.pkl']

In [66]:
file = open('xgb_model.pkl','rb')

pkl_model = joblib.load(file)
prediction = pkl_model.predict(data_test)
print(prediction)

[0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 1 0 1 1 1 1 0
 1 0 0 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1]
