In [15]:
import pandas as pd
import numpy as np

In [16]:
new_dataset = pd.read_csv('dataset.csv')

In [17]:
new_dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [18]:
new_dataset.shape

(918, 12)

In [19]:
new_dataset.isnull().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [20]:
new_dataset['HeartDisease'].value_counts()

Unnamed: 0_level_0,count
HeartDisease,Unnamed: 1_level_1
1,508
0,410


In [30]:
def test_model(name, model, traindata, testdata, trainlabels, testlabels):
  model.fit(traindata, trainlabels)
  trainpred = model.predict(traindata)
  testpred = model.predict(testdata)
  print(f" ===== {name} =====")
  print(f"Accuracy on train {accuracy_score(trainlabels, trainpred)}")
  print(f"F1score on train {f1_score(trainlabels, trainpred)}")
  print(f"Accuracy on test {accuracy_score(testlabels, testpred)}")
  print(f"F1score on test {f1_score(testlabels, testpred)}")
  print("Confusion matrix on test")
  print(confusion_matrix(testlabels, testpred))

## 1

In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import MinMaxScaler

In [22]:
new_dataset = new_dataset.dropna()

le = LabelEncoder()
for column in new_dataset.columns:
    if new_dataset[column].dtype == object:
        new_dataset[column] = le.fit_transform(new_dataset[column])

X_train, X_test, Y_train, Y_test = train_test_split(new_dataset.drop('HeartDisease', axis=1), new_dataset['HeartDisease'], test_size=0.25, stratify=new_dataset['HeartDisease'], random_state=42)

## 2

In [24]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer

In [26]:
coltran = ColumnTransformer(transformers=[("discr", KBinsDiscretizer(n_bins=5), ["Age", "RestingBP", "Cholesterol", "MaxHR"])],
                                 remainder='passthrough')

pipeline = Pipeline(steps=[("coltran", coltran),
                           ("estimator", DecisionTreeClassifier())])

pipeline.fit(X_train, Y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [28]:
Y_pred = pipeline.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.8043478260869565

## 3

In [33]:
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('coltran', ColumnTransformer([
        ('discr', KBinsDiscretizer(n_bins=5), ['Age', 'RestingBP', 'Cholesterol', 'MaxHR'])
    ], remainder='passthrough')),
    ('feature_selection', SelectKBest()),
    ('estimator', DecisionTreeClassifier())
])

param_grid = {
    'feature_selection__k': [5, 10, 12],
    'coltran__discr__n_bins': [3, 5, 7],
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': [None, 5, 10, 15, 20]
}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [34]:
print("Migliori parametri:", grid_search.best_params_)
best_pipeline = grid_search.best_estimator_

Y_pred = best_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuratezza del modello Decision Tree con i migliori parametri e SelectKBest:", accuracy)

test_model("Pipeline with SelectKBest", best_pipeline, X_train, X_test, Y_train, Y_test)

Migliori parametri: {'coltran__discr__n_bins': 7, 'estimator__criterion': 'gini', 'estimator__max_depth': 5, 'feature_selection__k': 12}
Accuratezza del modello Decision Tree con i migliori parametri e SelectKBest: 0.8521739130434782
 ===== Pipeline with SelectKBest =====
Accuracy on train 0.8968023255813954
F1score on train 0.9076723016905072
Accuracy on test 0.8521739130434782
F1score on test 0.864
Confusion matrix on test
[[ 88  15]
 [ 19 108]]




## 4

In [35]:
from sklearn.decomposition import TruncatedSVD

def identity_func(X):
  return X

parameters = {
    'combined_features__svd__n_components': [2, 4, 6]
}

combined_features = FeatureUnion([("svd", TruncatedSVD()), ('passthrough', FunctionTransformer(identity_func,validate=False))])

coltran = ColumnTransformer(transformers=[("discr", KBinsDiscretizer(n_bins=5), ["Age", "RestingBP", "Cholesterol", "MaxHR"])],
                                 remainder='passthrough')

my_pipeline = Pipeline(steps=[("coltran", coltran),
                              ('combined_features', combined_features),
                              ("estimator", DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5))
                             ], verbose = True)

gd = GridSearchCV(my_pipeline, parameters)
test_model("Scaled Extended Pipeline GD", gd, X_train, X_test, Y_train, Y_test)
gd.best_params_

[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing estimator, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing coltran, total=   0.0s
[Pipeline] . (step 2 of 3) Processing combined_features, total=   0.0s
[Pipel

{'combined_features__svd__n_components': 2}

## 5

In [36]:
from sklearn.preprocessing import Normalizer

pipeline = Pipeline([
    ('normalizer', Normalizer()),
    ('coltran', ColumnTransformer([
        ('minmax', MinMaxScaler(), ['Oldpeak'])
    ], remainder='passthrough')),
    ('estimator', DecisionTreeClassifier())
])

test_model("Pipeline with normalizer and minmaxscaler", pipeline, X_train, X_test, Y_train, Y_test)

ValueError: Specifying the columns using strings is only supported for dataframes.

In [38]:
from sklearn.preprocessing import Normalizer

pipeline = Pipeline([
    ('normalizer', Normalizer()),
    ('coltran', ColumnTransformer([
        ('minmax', MinMaxScaler(), X_train.columns.get_indexer(['Oldpeak']))
    ], remainder='passthrough')),
    ('estimator', DecisionTreeClassifier())
])

test_model("Pipeline with normalizer and minmaxscaler", pipeline, X_train, X_test, Y_train, Y_test)

 ===== Pipeline with normalizer and minmaxscaler =====
Accuracy on train 1.0
F1score on train 1.0
Accuracy on test 0.8260869565217391
F1score on test 0.8387096774193549
Confusion matrix on test
[[ 86  17]
 [ 23 104]]
