In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from pathlib import Path
import os
import pandas as pd
import seaborn as sns


In [None]:
# Column labels for the headerless CSV, in file order.
col_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
    'ca', 'thal', 'num'
]

# Parse '?' as missing while reading. Replacing '?' after the read leaves any
# column that contained it as object dtype (strings), so that column is
# skipped by describe() and behaves as text in numeric operations.
DATA = pd.read_csv(
    'Data/processed.cleveland.csv',
    encoding='latin1',
    names=col_names,
    header=None,
    na_values='?',
)

# Collapse 'num' to a binary label: 1 where the value is positive, else 0.
DATA['num'] = (DATA['num'] > 0).astype('int64')

DATA


In [None]:
# Summary statistics for DATA; by default describe() reports numeric columns only.
DATA.describe()


In [None]:
# Print the column labels of DATA as an array.
print(DATA.columns.values)


In [None]:
# Per-column count of missing entries.
missing_per_column = DATA.isnull().sum()
print("Missing Values per Column:")
print(missing_per_column)
print("------------------------------------")
# Per-column count of +inf / -inf entries, matched in a single membership test.
infinite_per_column = DATA.isin([np.inf, -np.inf]).sum()
print("Infinity Values per Column:")
print(infinite_per_column)


In [None]:
# Keep only the rows with no missing value in any column; DATA itself is not modified.
df_clean = DATA.dropna()
df_clean


In [None]:
# Pairwise plots of the chosen columns, using only the rows without missing values.
pairplot_columns = ['age', 'chol', 'trestbps', 'thalach', 'num']
sns.pairplot(df_clean, vars=pairplot_columns)
plt.show()


In [None]:
# Pairwise correlations between the selected columns of DATA.
selected_columns = ['age', 'chol', 'trestbps', 'thalach', 'num']
corr = DATA.loc[:, selected_columns].corr()

# Draw the heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='Blues', fmt=".2f", square=True, ax=ax)
ax.set_title('Heatmap of Feature Correlations')
plt.show()


In [None]:
import matplotlib.pyplot as plt
def histogram(variable):
    """
    Draw a 20-bin histogram of one column of ``DATA`` and print its value counts.

    Parameters
    ----------
    variable : str
        Column label in the module-level ``DATA`` frame, e.g. "age".

    Returns
    -------
    None
        The figure is shown and the value counts are printed.
    """
    column = DATA[variable]

    # Frequency of each distinct value; printed after the figure is shown.
    counts = column.value_counts()

    plt.figure(figsize=(8, 3))
    plt.hist(column, bins=20, color='blue', alpha=0.7)
    plt.title(variable)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.show()

    print("{}: \n{}".format(variable, counts))

def bar_plot(variable):
    """
    Draw a bar chart of the value counts of one column of ``DATA`` and print them.

    Parameters
    ----------
    variable : str
        Column label in the module-level ``DATA`` frame, e.g. "sex".

    Returns
    -------
    None
        The figure is shown and the value counts are printed.
    """
    # One bar per distinct value, bar height = number of occurrences.
    counts = DATA[variable].value_counts()
    categories = counts.index

    plt.figure(figsize=(8, 3))
    plt.bar(categories, counts)
    # Put a tick on every bar, labelled with the value it represents.
    plt.xticks(categories, categories.values)
    plt.title(variable)
    plt.ylabel("Frequency")
    plt.show()

    print("{}: \n{}".format(variable, counts))


In [None]:
# Columns plotted as categories: one bar plot (plus printed value counts) per column.
category1=['sex', 'fbs', 'exang','cp','restecg', 'slope', 'ca', 'thal', 'num']
for c in category1:
    bar_plot(c)


In [None]:
def plot_hist(variable, bins=50):
    """
    Show a histogram of one column of the module-level ``DATA`` frame.

    Parameters
    ----------
    variable : str
        Column label in ``DATA``.
    bins : int, optional
        Number of bins passed to ``plt.hist``. Defaults to 50, the value that
        was previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    None
        The figure is shown; nothing is returned.
    """
    plt.figure(figsize=(8, 3))
    plt.hist(DATA[variable], bins=bins)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    # Title spelling corrected (previously read "Distribituon").
    plt.title("{} Distribution with hist".format(variable))
    plt.show()


In [None]:
# Columns plotted as numeric distributions: one histogram per column.
numericVar=['oldpeak', 'age', 'trestbps', 'chol', 'thalach']
for n in numericVar:
    plot_hist(n)


In [None]:
# Show the 'sex' and 'num' columns side by side.
DATA[["sex","num"]]


In [None]:
# Mean of 'num' for each value of 'sex', largest mean first.
(
    DATA[["sex", "num"]]
    .groupby(["sex"], as_index=False)
    .mean()
    .sort_values(by='num', ascending=False)
)


In [None]:
import pandas as pd
import graphviz
import re
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from IPython.display import display, Markdown, Image
from IPython.display import Markdown, display as ds
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree


In [None]:
# Column labels for the headerless CSV, in file order.
col_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
    'ca', 'thal', 'num',
]

# Read the file with '?' parsed as a missing value.
df = pd.read_csv(
    r'Data/processed.cleveland.csv',
    names=col_names,
    header=None,
    encoding='latin1',
    na_values='?',
)

# Features: every column except 'num'. Target: 1 where 'num' is positive, else 0.
X = df.drop(columns='num')
y = df['num'].gt(0).astype('int64')

df


In [None]:
# Feature columns that must all be present for a row to be kept.
feature_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'
]

# Drop incomplete rows and binarise the target (1 where 'num' is positive,
# else 0) in one chain. This avoids inplace=True and avoids assigning a column
# on a filtered frame.
df = (
    df
    .dropna(subset=feature_columns)
    .assign(num=lambda frame: (frame['num'] > 0).astype('int64'))
)

# One-hot encode the listed columns, dropping the first level of each.
# NOTE: this overwrites `df`, so re-running the cell without re-loading `df`
# raises KeyError because the encoded source columns no longer exist.
df = pd.get_dummies(
    df,
    columns=['sex', 'cp', 'restecg', 'slope', 'thal'],
    drop_first=True
)

# Rebuild the modelling inputs from the cleaned, encoded frame. X and y were
# created from the raw frame before this cell ran, so without this step the
# row filtering and encoding above never reach the models.
y = df['num']
X = df.drop(columns='num')

df


In [None]:
# (train fraction, test fraction) pairs to evaluate.
ratios = [
    (0.4, 0.6),
    (0.6, 0.4),
    (0.8, 0.2),
    (0.9, 0.1),
]

# Maps a label such as '40/60' to the train/test pieces of that split.
subsets = {}

for tr, te in ratios:
    # Only train_size is passed, so the test part is the complement of the
    # train part; `te` is used for the label only.
    # The split is stratified on y and uses a fixed seed so it is repeatable.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y,
        train_size=tr,
        stratify=y,
        random_state=42
    )
    # round() rather than int(): int() truncates, and a product such as
    # 0.29 * 100 evaluates to 28.999999999999996, which would be labelled 28.
    key = f'{round(tr * 100)}/{round(te * 100)}'
    subsets[key] = {
        'X_train': X_tr, 'y_train': y_tr,
        'X_test': X_te, 'y_test': y_te,
    }

print("Finish to create subsets:", list(subsets.keys()))


In [None]:
# One fitted entropy-criterion decision tree per split, keyed like `subsets`.
clf_dict = {}

# Iterate over the splits that actually exist instead of a hard-coded key
# list, so this cell cannot go stale (or raise KeyError) when the set of
# splits changes.
for key, data in subsets.items():
    clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
    clf.fit(data['X_train'], data['y_train'])
    clf_dict[key] = clf
    print(f"Trained model for split {key}")


In [None]:
# Report test-set accuracy and misclassification figures for every trained tree.
for key, clf in clf_dict.items():
    data = subsets[key]
    y_true = data['y_test']
    y_pred = clf.predict(data['X_test'])

    total_samples = len(y_true)
    misclassified_count = (y_pred != y_true).sum()
    misclassified_rate = 100 * misclassified_count / total_samples
    accuracy = accuracy_score(y_true, y_pred)

    # The trailing empty entry reproduces the blank line printed after each split.
    report_lines = [
        f"=== Split {key} ===",
        f"Accuracy: {accuracy:.4f}",
        f"Numbers of Wrong labels: {misclassified_count}/{total_samples}",
        f"Ratio of wrong labels: {misclassified_rate:.2f}%",
        "",
    ]
    print("\n".join(report_lines))


In [None]:
key = '40/60'
data = subsets[key]
y_true = data['y_test']

# Fit a 10-tree random forest on this split's training part, then predict its test part.
clf_rf_40_60 = RandomForestClassifier(n_estimators=10, random_state=42)
clf_rf_40_60.fit(data['X_train'], data['y_train'])
y_pred_rf = clf_rf_40_60.predict(data['X_test'])

# Confusion matrix of true vs. predicted labels
cm_rf = confusion_matrix(y_true, y_pred_rf)
print(f"Confusion Matrix (Random Forest - {key}):")
print(cm_rf)

# Text classification report for the same predictions
print(f"Classification Report (Random Forest - {key}):")
print(classification_report(y_true, y_pred_rf))

# Accuracy and misclassification figures (accuracy is computed but not printed here)
accuracy = accuracy_score(y_true, y_pred_rf)
total_samples = len(y_true)
misclassified_count = (y_pred_rf != y_true).sum()
misclassified_rate = 100 * misclassified_count / total_samples

print(f"Numbers of Wrong labels: {misclassified_count}/{total_samples}")
print(f"Ratio of wrong labels: {misclassified_rate:.2f}%")
