In [2]:
import pandas as pd
import numpy as np

In [3]:
columns = pd.read_csv("Echocardiogram.names", sep="\n", header=None)[0]
columns

0                  survival
1               still-alive
2       age-at-heart-attack
3      pericardial-effusion
4     fractional-shortening
5                      epss
6                      lvdd
7         wall-motion-score
8         wall-motion-index
9                      mult
10                     name
11                    group
12               alive-at-1
Name: 0, dtype: object

In [4]:
columns = pd.read_csv("Echocardiogram.names", sep="\n", header=None)[0].tolist()
echo_data_df = pd.read_csv("Echocardiogram.data", names=columns)
print("Data len:", len(echo_data_df))
print("Attribute count:", len(echo_data_df.columns))
echo_data_df.head()

Data len: 132
Attribute count: 13


Unnamed: 0,survival,still-alive,age-at-heart-attack,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-score,wall-motion-index,mult,name,group,alive-at-1
0,11,0,71,0,0.26,9.0,4.6,14,1.0,1.0,name,1,0
1,19,0,72,0,0.38,6.0,4.1,14,1.7,0.588,name,1,0
2,16,0,55,0,0.26,4.0,3.42,14,1.0,1.0,name,1,0
3,57,0,60,0,0.253,12.062,4.603,16,1.45,0.788,name,1,0
4,19,1,57,0,0.16,22.0,5.75,18,2.25,0.571,name,1,0


In [5]:
echo_data_df['alive-at-1'].value_counts()

?    57
0    50
1    24
2     1
Name: alive-at-1, dtype: int64

In [6]:
echo_data_df=echo_data_df.rename(columns={'alive-at-1': 'alive_one'})
echo_data_df=echo_data_df.rename(columns={'still-alive': 'stillalive'})
echo_data_df = echo_data_df[echo_data_df['alive_one'] != '2']
echo_data_df['alive_one'] = echo_data_df['alive_one'].replace(to_replace='?', method='ffill')
echo_data_df['alive_one'] = echo_data_df['alive_one'].astype(int)

In [7]:
echo_data_df['alive_one'].value_counts()

0    74
1    57
Name: alive_one, dtype: int64

In [8]:
echo_data_df['stillalive'] = echo_data_df['stillalive'].replace(to_replace='?', method='ffill')
echo_data_df['stillalive'] = echo_data_df['stillalive'].astype(int)

In [9]:
X_basic = echo_data_df.replace(to_replace='?', method='ffill')
X_basic['survival'] = X_basic['survival'].replace(to_replace='?', method='ffill')
X_basic['survival'] = X_basic['survival'].astype(float)
X_basic['alive_one'] = np.where(X_basic['survival'] > 12, 1,0)

In [10]:
X_basic['survival'].isnull().sum()

0

In [11]:
X = X_basic.drop(['name' , 'wall-motion-score','alive_one','mult','group'], axis=1)
X.head()

Unnamed: 0,survival,stillalive,age-at-heart-attack,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-index
0,11.0,0,71,0,0.26,9.0,4.6,1.0
1,19.0,0,72,0,0.38,6.0,4.1,1.7
2,16.0,0,55,0,0.26,4.0,3.42,1.0
3,57.0,0,60,0,0.253,12.062,4.603,1.45
4,19.0,1,57,0,0.16,22.0,5.75,2.25


In [12]:
y = X_basic[["alive_one"]]
y.head()

Unnamed: 0,alive_one
0,0
1,1
2,1
3,1
4,1


In [13]:
cardio_type_dict = {0:"dead", 1:"alive"}

df_stats = y.alive_one.value_counts().rename_axis('cardio_type').reset_index(name='count')
df_stats["type_name"] = df_stats["cardio_type"].map(cardio_type_dict)
df_stats

Unnamed: 0,cardio_type,count,type_name
0,1,89,alive
1,0,42,dead


In [14]:
from sklearn.tree import DecisionTreeClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(0.25), 
                                                    random_state = 42, shuffle=True, stratify=y)

In [16]:
classifier = DecisionTreeClassifier(random_state=42)
parameters = {"max_depth":[None, 2]}
scoring = ['f1_weighted']
grid2 = GridSearchCV(estimator = classifier, param_grid = parameters, cv=2, 
                     n_jobs=-1, verbose=10, refit='f1_weighted', scoring = scoring)

grid2 = grid2.fit(X_train, y_train)
print(grid2)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=(15/100), 
                                                            random_state = 42, shuffle=True, stratify = y)


ncv_result = cross_validate(estimator=grid2, X=X_train_val, y=y_train_val, cv=2, 
                           n_jobs=-1, verbose=10, scoring = scoring, return_estimator=True)

print("Best Score: ", grid2.best_score_)
print("Best Estimator: ", grid2.best_estimator_)
print("Best Parameters: ", grid2.best_params_)
print("Tuning cv results: ")
pd.set_option('display.max_colwidth', None)
display(pd.DataFrame(data=grid2.cv_results_, columns=grid2.cv_results_.keys()))
print("\n")


print("Evaluation ncv_result: -")
pd.set_option('display.max_colwidth', None)
display(pd.DataFrame(data=ncv_result, columns=ncv_result.keys()).transpose())


y_pred = grid2.best_estimator_.predict(X_test)

target_names = ["dead", "alive"]

print("Classification Report (Testing Results)")
print(classification_report(y_test, y_pred, target_names = target_names))
print("\n")

Fitting 2 folds for each of 2 candidates, totalling 4 fits
GridSearchCV(cv=2, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 2]}, refit='f1_weighted',
             scoring=['f1_weighted'], verbose=10)
Best Score:  1.0
Best Estimator:  DecisionTreeClassifier(random_state=42)
Best Parameters:  {'max_depth': None}
Tuning cv results: 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1022s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.0s finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_f1_weighted,split1_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted
0,0.012993,0.001,0.007995,0.001999,,{'max_depth': None},1.0,1.0,1.0,0.0,1
1,0.015496,0.000495,0.006988,8e-06,2.0,{'max_depth': 2},1.0,1.0,1.0,0.0,1




Evaluation ncv_result: -


Unnamed: 0,0,1
fit_time,0.089803,0.092831
score_time,0.003997,0.004997
estimator,"GridSearchCV(cv=2, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,\n param_grid={'max_depth': [None, 2]}, refit='f1_weighted',\n scoring=['f1_weighted'], verbose=10)","GridSearchCV(cv=2, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,\n param_grid={'max_depth': [None, 2]}, refit='f1_weighted',\n scoring=['f1_weighted'], verbose=10)"
test_f1_weighted,0.982265,1.0


Classification Report (Testing Results)
              precision    recall  f1-score   support

        dead       0.86      1.00      0.92         6
       alive       1.00      0.93      0.96        14

    accuracy                           0.95        20
   macro avg       0.93      0.96      0.94        20
weighted avg       0.96      0.95      0.95        20





In [17]:
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print('\n')

print("Pretty Confusion Matrix:")
display(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=np.unique(y_test)),
                         index=['pred:' + str(x) for x in np.unique(y_test)],
                         columns=['true:' + str(x) for x in np.unique(y_test)]))

print("Classification Report")
target_names = ["dead","alive"]
print(classification_report(y_test, y_pred, target_names = target_names))

Confusion Matrix
[[ 6  0]
 [ 1 13]]


Pretty Confusion Matrix:


Unnamed: 0,true:0,true:1
pred:0,6,0
pred:1,1,13


Classification Report
              precision    recall  f1-score   support

        dead       0.86      1.00      0.92         6
       alive       1.00      0.93      0.96        14

    accuracy                           0.95        20
   macro avg       0.93      0.96      0.94        20
weighted avg       0.96      0.95      0.95        20

