In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
heart_disease = fetch_ucirepo(id=45)

# data (as pandas dataframes)
X = heart_disease.data.features
y = heart_disease.data.targets

# metadata
print(heart_disease.metadata)

print()

# variable information
print(heart_disease.variables)



{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

In [4]:
print(heart_disease.variables.keys())

Index(['name', 'role', 'type', 'demographic', 'description', 'units',
       'missing_values'],
      dtype='object')


In [5]:
print(heart_disease.variables['name'])

0          age
1          sex
2           cp
3     trestbps
4         chol
5          fbs
6      restecg
7      thalach
8        exang
9      oldpeak
10       slope
11          ca
12        thal
13         num
Name: name, dtype: object


In [6]:
print(heart_disease.metadata.keys())
print()
print(heart_disease.metadata['target_col'])

dict_keys(['uci_id', 'name', 'repository_url', 'data_url', 'abstract', 'area', 'tasks', 'characteristics', 'num_instances', 'num_features', 'feature_types', 'demographics', 'target_col', 'index_col', 'has_missing_values', 'missing_values_symbol', 'year_of_dataset_creation', 'last_updated', 'dataset_doi', 'creators', 'intro_paper', 'additional_info'])

['num']


In [7]:
print(heart_disease.data.features.keys())

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')


In [8]:
print(heart_disease.data.targets.keys())

Index(['num'], dtype='object')


In [9]:
print(heart_disease.data.targets)

     num
0      0
1      2
2      1
3      0
4      0
..   ...
298    1
299    2
300    3
301    1
302    0

[303 rows x 1 columns]


In [11]:
target = heart_disease.data.targets

target = target['num'].apply(lambda x: 1 if x > 0 else 0) # convert to either 0 for no heart disease or 1 for heart disease

print(target)

0      0
1      1
2      1
3      0
4      0
      ..
298    1
299    1
300    1
301    1
302    0
Name: num, Length: 303, dtype: int64


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

model = HistGradientBoostingClassifier()

model.fit(X_train,y_train)

y_preds = model.predict(X_test)

print(accuracy_score(y_test, y_preds))

0.9016393442622951


In [21]:
params = [{
    "learning_rate": [0.05,0.1,0.25,0.4],
    "max_iter": [50,100,250]
}]

gridsearch = GridSearchCV(estimator=model, param_grid=params, cv=10)

gridsearch.fit(X_train,y_train)

print("best parameters: ", gridsearch.best_params_)

print("best estimator: ", gridsearch.best_estimator_)

print("best score: ", gridsearch.best_score_)

best parameters:  {'learning_rate': 0.05, 'max_iter': 250}
best estimator:  HistGradientBoostingClassifier(learning_rate=0.05, max_iter=250)
best score:  0.7985


In [43]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


#X_train, X_test, y_train, y_test = train_test_split(X, heart_disease.data.targets['num'], test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

model = HistGradientBoostingClassifier()

model.fit(X_train,y_train)

y_preds = model.predict(X_test)

print("accuracy score for base HistGradientBoostingClassifier: ", accuracy_score(y_test, y_preds))

print("classification report for base HistGradientBoostingClassifier: ", "\n",classification_report(y_test,y_preds))

print("confusion matrix for base HistGradientBoostingClassifier: ", confusion_matrix(y_test,y_preds))

xgb_model = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.05,
    n_estimators=250,
    random_state=42,
    eval_metric='logloss'
)

params = [{
    "learning_rate": [0.01,0.05,0.1,0.25],
    "n_estimators": [50,100,250],
    "max_depth": [1,2,4,8,10]
}]

gridsearch = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10)

gridsearch.fit(X_train,y_train)

print("best parameters: ", gridsearch.best_params_)

print("best estimator: ", gridsearch.best_estimator_)

print("best score: ", gridsearch.best_score_)

accuracy score for base HistGradientBoostingClassifier:  0.9016393442622951
classification report for base HistGradientBoostingClassifier:  
               precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

confusion matrix for base HistGradientBoostingClassifier:  [[27  2]
 [ 4 28]]
best parameters:  {'learning_rate': 0.25, 'max_depth': 1, 'n_estimators': 100}
best estimator:  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=

In [54]:
from sklearn.impute import SimpleImputer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

standard_scaler = StandardScaler()
X_scaled = standard_scaler.fit_transform(X_imputed)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, target, test_size=0.2, random_state=42)

model = LogisticRegression(random_state=42)

model.fit(X_train,y_train)

y_preds = model.predict(X_test)

print("accuracy score for base HistGradientBoostingClassifier: ", accuracy_score(y_test, y_preds))

print("classification report for base HistGradientBoostingClassifier: ", "\n",classification_report(y_test,y_preds))

print("confusion matrix for base HistGradientBoostingClassifier: ", confusion_matrix(y_test,y_preds))

print("\n")
print("\n")

# params = [{
#     "penalty": ['l1','l2'],
#     "C": [0.001,0.01,0.1,1,10],

# }]

# gridsearch = GridSearchCV(estimator=LogisticRegression(solver='saga',random_state=42), param_grid=params, cv=10)

# gridsearch.fit(X_train,y_train)

# print("best parameters: ", gridsearch.best_params_)

# print("best estimator: ", gridsearch.best_estimator_)

# print("best score: ", gridsearch.best_score_)

accuracy score for base HistGradientBoostingClassifier:  0.8852459016393442
classification report for base HistGradientBoostingClassifier:  
               precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61

confusion matrix for base HistGradientBoostingClassifier:  [[25  4]
 [ 3 29]]






STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Pipeline for Model Using HistGradientBoostingClassifier with GridSearch

In [58]:
# creating a pipeline for model

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
#from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

model = HistGradientBoostingClassifier()

model.fit(X_train,y_train)

y_preds = model.predict(X_test)

print("accuracy score for base HistGradientBoostingClassifier: ", accuracy_score(y_test, y_preds))

print("classification report for base HistGradientBoostingClassifier: ", "\n",classification_report(y_test,y_preds))

print("confusion matrix for base HistGradientBoostingClassifier: ", confusion_matrix(y_test,y_preds))


heart_disease_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])


params = {
    'model__C': [0.01,0.05,0.1,1]
}

gridsearch = GridSearchCV(heart_disease_pipeline, param_grid=params, cv=10)
gridsearch.fit(X,target)

print("Best Score:", gridsearch.best_score_)

print("Best Parameters:", gridsearch.best_params_)

accuracy score for base HistGradientBoostingClassifier:  0.9016393442622951
classification report for base HistGradientBoostingClassifier:  
               precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

confusion matrix for base HistGradientBoostingClassifier:  [[27  2]
 [ 4 28]]
Best Score: 0.8478494623655914
Best Parameters: {'model__C': 0.01}


In [64]:
# creating a pipeline for model HistGradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
#from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

model = HistGradientBoostingClassifier()

model.fit(X_train,y_train)

y_preds = model.predict(X_test)

print("accuracy score for base HistGradientBoostingClassifier: ", accuracy_score(y_test, y_preds))

print("classification report for base HistGradientBoostingClassifier: ", "\n",classification_report(y_test,y_preds))

print("confusion matrix for base HistGradientBoostingClassifier: ", confusion_matrix(y_test,y_preds))


heart_disease_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', HistGradientBoostingClassifier())
])


params = {
    'model__learning_rate': [0.01,0.05,0.1,1],
    'model__l2_regularization': [0.01,0.05,0.1,0.2]
}

gridsearch = GridSearchCV(heart_disease_pipeline, param_grid=params, cv=10)
gridsearch.fit(X,target)

print("Best Score:", gridsearch.best_score_)

print("Best Parameters:", gridsearch.best_params_)

accuracy score for base HistGradientBoostingClassifier:  0.9016393442622951
classification report for base HistGradientBoostingClassifier:  
               precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

confusion matrix for base HistGradientBoostingClassifier:  [[27  2]
 [ 4 28]]
Best Score: 0.8246236559139785
Best Parameters: {'model__l2_regularization': 0.01, 'model__learning_rate': 0.05}


In [70]:
# hgb model
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

hgb_model = HistGradientBoostingClassifier(learning_rate=0.05, l2_regularization=0.01)

hgb_model.fit(X_train,y_train)

y_preds = hgb_model.predict(X_test)

print("accuracy score for HistGradientBoostingClassifier: ", accuracy_score(y_test, y_preds))

print("classification report for HistGradientBoostingClassifier: ", "\n",classification_report(y_test,y_preds))

print("confusion matrix for HistGradientBoostingClassifier: ", "\n", confusion_matrix(y_test,y_preds))

accuracy score for HistGradientBoostingClassifier:  0.9016393442622951
classification report for HistGradientBoostingClassifier:  
               precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

confusion matrix for HistGradientBoostingClassifier:  
 [[27  2]
 [ 4 28]]


# Pipeline for Model Using HistGradientBoostingClassifier

In [72]:
# creating a pipeline for model HistGradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
#from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

heart_disease_pipeline = Pipeline([
    ('model', HistGradientBoostingClassifier(learning_rate=0.05, l2_regularization=0.01))
])

heart_disease_pipeline.fit(X_train,y_train)

y_preds = heart_disease_pipeline.predict(X_test)

print("Accuracy Score for Heart Disease Pipeline using HistGradientBoostingClassifier: ", accuracy_score(y_test, y_preds))

print("Classification Report for Heart Disease Pipeline using HistGradientBoostingClassifier: ", "\n", classification_report(y_test,y_preds))

Accuracy Score for Heart Disease Pipeline using HistGradientBoostingClassifier:  0.9016393442622951
Classification Report for Heart Disease Pipeline using HistGradientBoostingClassifier:  
               precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61



# Pipeline for Model Using RandomForestClassifier with GridSearch

In [78]:
# creating a pipeline for model RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

heart_disease_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
])


params = {
    'model__n_estimators': [50,100,250,500],
    'model__criterion': ['gini','entropy','log_loss'],
    'model__max_depth': [1,2,4,8]
}

gridsearch = GridSearchCV(heart_disease_pipeline, param_grid=params, cv=10)
gridsearch.fit(X,target)

print("Best Score:", gridsearch.best_score_)

print("Best Parameters:", gridsearch.best_params_)

Best Score: 0.8513978494623655
Best Parameters: {'model__criterion': 'entropy', 'model__max_depth': 1, 'model__n_estimators': 500}


In [83]:
# creating a pipeline for model RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

heart_disease_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
])


params = [{
    'model__criterion': 'entropy',
    'model__max_depth': 1,
    'model__n_estimators': 500
}]


heart_disease_pipeline.fit(X_train,y_train)

y_preds = heart_disease_pipeline.predict(X_test)

print("Accuracy Score for Heart Disease Pipeline using RandomForestClassifier: ", accuracy_score(y_test, y_preds))

print("Classification Report for Heart Disease Pipeline using RandomForestClassifier: ", "\n", classification_report(y_test,y_preds))

Accuracy Score for Heart Disease Pipeline using RandomForestClassifier:  0.8852459016393442
Classification Report for Heart Disease Pipeline using RandomForestClassifier:  
               precision    recall  f1-score   support

           0       0.84      0.93      0.89        29
           1       0.93      0.84      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61



# Pipeline for Model using XGBClassifier with GridSearch (NOT binary)

In [85]:
# creating a pipeline for model XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

target = heart_disease.data.targets['num']

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

heart_disease_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', XGBClassifier())
])


params = {
    'model__learning_rate': [0.01,0.05,0.1,0.3,1],
    'model__gamma': [0,0.05,0.1,0.2],
    'model__objective': ['multi:softmax'],
    'model__num_class': [5]
}

gridsearch = GridSearchCV(heart_disease_pipeline, param_grid=params, cv=10)
gridsearch.fit(X,target)

print("Best Score:", gridsearch.best_score_)

print("Best Parameters:", gridsearch.best_params_)

Best Score: 0.5741935483870967
Best Parameters: {'model__gamma': 0.2, 'model__learning_rate': 0.3, 'model__num_class': 5, 'model__objective': 'multi:softmax'}


# Random Forest Classifier with Balanced Class Weight and Multi-Class Classification

In [87]:
# creating a pipeline for model RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

target = heart_disease.data.targets['num']

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

heart_disease_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(class_weight='balanced'))
])


params = {
    'model__n_estimators': [50,100,250,500],
    'model__criterion': ['gini','entropy','log_loss'],
    'model__max_depth': [1,2,4,8]
}

gridsearch = GridSearchCV(heart_disease_pipeline, param_grid=params, cv=10)
gridsearch.fit(X,target)

print("Best Score:", gridsearch.best_score_)

print("Best Parameters:", gridsearch.best_params_)

Best Score: 0.5938709677419354
Best Parameters: {'model__criterion': 'gini', 'model__max_depth': 4, 'model__n_estimators': 100}
