In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the heart prediction dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/heart-prediction-dataset-quantum/Heart Prediction Quantum Dataset.csv')

# Per-column histograms. DataFrame.hist creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty figure behind).
df.hist(figsize=(10, 10), bins=20)
plt.suptitle("Feature Distributions")
plt.show()

# Pairwise scatter plots, coloured by the target column.
sns.pairplot(df, hue="HeartDisease")
plt.show()

# Correlation matrix of all columns, annotated with two-decimal coefficients.
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlations")
plt.show()

# Target is the HeartDisease column; the features are every other column
# except QuantumPatternFeature, which is dropped together with the target.
y = df["HeartDisease"]
X = df.drop(columns=["HeartDisease", "QuantumPatternFeature"])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply them
# to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def train_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Prints the accuracy (four decimals) followed by the classification
    report computed from the predictions on ``X_test``. Returns nothing.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: {:.4f}".format(test_accuracy))
    print(classification_report(y_test, predictions))

# Baseline classifiers, all fitted and scored on the same scaled split.
# The tree-based models are seeded so that a Restart & Run All reproduces
# the reported scores instead of drifting from run to run.
baseline_models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(),
}

for label, estimator in baseline_models.items():
    print(f"\n{label}")
    train_model(estimator, X_train, y_train, X_test, y_test)



In [None]:
# Show the shapes of the full feature matrix, the target, and the test/train feature splits.
X.shape, y.shape, X_test.shape, X_train.shape

In [None]:
# Preview the first rows of the (unscaled) feature frame.
X.head()

In [None]:

print("\nNeural Network")

# Feed-forward binary classifier: two hidden ReLU layers of 64 units and a
# single sigmoid output unit.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE: the held-out split doubles as validation data here, so the per-epoch
# validation metrics are computed on the same rows as the final evaluation.
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=500, batch_size=32, verbose=1,
)

loss, acc = model.evaluate(X_test, y_test)
print("Neural Network Accuracy: {:.4f}".format(acc))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# 1. Generate new features
# `df` becomes a copy of X extended with three derived columns:
#   AgeBucket    - integer code of the age band (0,40], (40,55], (55,65], (65,120]
#   Chol_per_Age - cholesterol divided by age
#   HR_pct_max   - heart rate divided by (220 - age)
age_bin_edges = [0, 40, 55, 65, 120]
df = X.assign(
    AgeBucket=pd.cut(X['Age'], age_bin_edges, labels=[0, 1, 2, 3]).astype(int),
    Chol_per_Age=X['Cholesterol'] / X['Age'],
    HR_pct_max=X['HeartRate'] / (220 - X['Age']),
)

# 2. Define the numeric and categorical columns
num_cols = ['Age', 'BloodPressure', 'Cholesterol', 'HeartRate',
            'Chol_per_Age', 'HR_pct_max']
cat_cols = ['Gender', 'AgeBucket']

# Standardize the numeric columns; one-hot encode the categorical ones,
# dropping the first level of each to avoid a redundant indicator column.
preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first'), cat_cols)
])

# L1-penalised logistic regression. The saga solver shuffles the data, so it
# is seeded to keep the selected coefficients reproducible between runs.
pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(penalty='l1', solver='saga', max_iter=1000,
                               random_state=42))
])

# Inverse regularisation strengths to compare.
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10]
}

# 5-fold cross-validated search over C, scored by ROC AUC on the full frame.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc')
grid.fit(df, y)

print("Best AUC:", grid.best_score_)

# Pair every coefficient with the name of the transformed feature it belongs
# to and keep only those the L1 penalty did not shrink to exactly zero, so the
# output really is the list of selected features rather than a bare array.
best_pipe = grid.best_estimator_
coefficients = pd.Series(
    best_pipe.named_steps['clf'].coef_.ravel(),
    index=best_pipe.named_steps['prep'].get_feature_names_out(),
)
print("Selected features (non-zero):")
print(coefficients[coefficients != 0])

In [None]:
# Display the frame currently bound to `df` (rebuilt from X with derived columns earlier in the notebook).
df

In [None]:

# Redo the 80/20 split on the engineered feature frame, using the same seed
# as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training rows only, then transform both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:

print("\nNeural Network")

# Smaller network: two hidden ReLU layers of 16 units and a sigmoid output.
n_features = X_train.shape[1]
network_layers = [
    Dense(16, activation='relu', input_shape=(n_features,)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential(network_layers)
model.compile(metrics=['accuracy'], loss='binary_crossentropy', optimizer='adam')

# NOTE: validation metrics are computed on the same held-out split that is
# used for the final evaluation below.
model.fit(
    X_train,
    y_train,
    epochs=500,
    batch_size=32,
    verbose=1,
    validation_data=(X_test, y_test),
)

loss, acc = model.evaluate(X_test, y_test)
print("Neural Network Accuracy: {:.4f}".format(acc))

In [None]:
from sklearn.decomposition import PCA

# PCA is variance-driven, so the features are standardized first; otherwise
# the column with the largest numeric spread would dominate both components.
X_scaled = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(12, 10))
# plt.get_cmap replaces plt.cm.get_cmap, which newer Matplotlib releases no
# longer provide; the second argument resamples the colormap to two colours.
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y,
                      edgecolor='none', alpha=0.7, s=40,
                      cmap=plt.get_cmap('coolwarm', 2))  # 2 classes: 0, 1
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA-візуалізація ознак з розміткою HeartDisease')
plt.colorbar(scatter, ticks=[0, 1], label='HeartDisease')
plt.grid(True)
plt.show()

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Fit a full PCA on standardized features so that no single column dominates
# the explained variance purely because of its numeric scale.
pca = PCA().fit(StandardScaler().fit_transform(X))

# Cumulative explained variance, indexed by the number of components kept.
# The counts start at 1 so the x-axis matches its label (the first point is
# the variance explained by one component, not by zero).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 7))
plt.plot(component_counts, cumulative_variance, color='k', lw=2)
plt.xticks(component_counts)
plt.xlabel('Кількість компонент')
plt.ylabel('Накопичена пояснена дисперсія')
plt.title('Кількість головних компонент PCA vs пояснена дисперсія')
plt.grid(True)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Candidate estimators for hyperparameter tuning, keyed by display name.
# Every estimator with a stochastic component is seeded so that tuning and
# evaluation give the same results on a fresh Restart & Run All.
# Logistic regression gets a larger iteration budget because the search also
# tries the saga solver, which typically needs more than the default 100.
models = {
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Naive Bayes Classifier": GaussianNB(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42)
}


In [None]:
# Hyperparameter search spaces, keyed by the same names as the `models` dict.
# Each value is passed directly as the parameter space of a scikit-learn
# search object, which accepts either a dict or a list of dicts.

# Shared inverse-regularisation strengths for logistic regression.
logreg_c_values = [0.01, 0.1, 1, 10, 100]

param_grid = {
    "Random Forest Classifier": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False]
    },
    "Decision Tree Classifier": {
        "criterion": ["gini", "entropy"],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    },
    "K-Nearest Neighbors": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        # NOTE: "minkowski" with the default p=2 is the same distance as "euclidean".
        "metric": ["euclidean", "manhattan", "minkowski"]
    },
    # Not every penalty works with every solver, so the space is split into
    # sub-grids that contain only valid combinations. A single crossed grid
    # would schedule fits that fail (e.g. lbfgs with l1, liblinear with no
    # penalty, elasticnet without an l1_ratio).
    "Logistic Regression": [
        {"solver": ["liblinear"], "penalty": ["l1", "l2"], "C": logreg_c_values},
        {"solver": ["lbfgs"], "penalty": ["l2"], "C": logreg_c_values},
        {"solver": ["saga"], "penalty": ["l1", "l2"], "C": logreg_c_values},
        {
            "solver": ["saga"],
            "penalty": ["elasticnet"],
            "l1_ratio": [0.25, 0.5, 0.75],
            "C": logreg_c_values,
        },
        # C has no effect without a penalty, so it is not varied here.
        {"solver": ["lbfgs", "saga"], "penalty": [None]},
    ],
    "Naive Bayes Classifier": {
        "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
    },
    "Gradient Boosting Classifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2, 0.3],
        "max_depth": [3, 5, 10],
        "subsample": [0.7, 0.8, 0.9, 1.0]
    }
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Randomized hyperparameter search for every candidate model: up to 20 sampled
# settings each, scored by accuracy with 5-fold cross-validation on the
# training split. The best refitted estimator per model is kept.
search_options = dict(n_iter=20, scoring='accuracy', cv=5, random_state=54, n_jobs=-1)

best_models = {}
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    param_search = RandomizedSearchCV(model, param_grid[model_name], **search_options)
    param_search.fit(X_train, y_train)
    best_models.update({model_name: param_search.best_estimator_})
    print(f"Best parameters for {model_name}: {param_search.best_params_}\n")

In [None]:
# Score each tuned estimator on the held-out split and print its report.
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print("{} Accuracy: {:.4f}".format(model_name, accuracy))
    print(report)

In [None]:
# Exhaustive grid search for every candidate model (accuracy, 5-fold CV on the
# training split). This replaces the contents of `best_models` with the
# estimators refitted on the best setting found by the full grid.
best_models = {}
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    param_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid[model_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    param_search.fit(X_train, y_train)
    best_models.update({model_name: param_search.best_estimator_})
    print(f"Best parameters for {model_name}: {param_search.best_params_}\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

# Held-out accuracy of each tuned model, collected for the comparison chart.
accuracy_results = {}

print("\nEvaluation Results:\n")
for model_name, best_model in best_models.items():
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[model_name] = accuracy

    print("{} Accuracy: {:.4f}".format(model_name, accuracy))
    print(classification_report(y_test, y_pred))

# Horizontal bar chart: one bar per model, x-axis fixed to the [0, 1] range.
model_names = list(accuracy_results)
accuracies = [accuracy_results[name] for name in model_names]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=accuracies, y=model_names, ax=ax)
ax.set_xlabel("Accuracy Score")
ax.set_title("Model Accuracy Comparison")
ax.set_xlim(0, 1)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# One ROC curve per tuned model on the held-out split, drawn on shared axes.
fig, ax = plt.subplots(figsize=(10, 8))

for model_name, best_model in best_models.items():
    # Prefer positive-class probabilities; fall back to the decision function;
    # skip models that expose neither.
    proba_fn = getattr(best_model, "predict_proba", None)
    margin_fn = getattr(best_model, "decision_function", None)
    if proba_fn is not None:
        y_score = proba_fn(X_test)[:, 1]
    elif margin_fn is not None:
        y_score = margin_fn(X_test)
    else:
        print(f"Model {model_name} does not support ROC.")
        continue

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, label="{} (AUC = {:.2f})".format(model_name, roc_auc))

# Diagonal reference line for a classifier with no discriminative power.
ax.plot([0, 1], [0, 1], 'k--', lw=1)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for All Models")
ax.legend(loc="lower right")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

print("\nCross-Validation Results:\n")

# Mean 5-fold accuracy per tuned model.
cv_scores = {}

for model_name, model in best_models.items():
    # In top-to-bottom execution the tuned models were fitted on the
    # standardized engineered frame (`df`), so they are cross-validated on
    # those same features rather than on the raw, unscaled `X`. The scaler
    # sits inside the pipeline so it is re-fitted on the training folds only
    # and no statistics leak from the validation fold.
    scaled_model = Pipeline([("scale", StandardScaler()), ("clf", model)])
    scores = cross_val_score(scaled_model, df, y, cv=5, scoring='accuracy', n_jobs=-1)
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    cv_scores[model_name] = mean_score

    print(f"{model_name}: {mean_score:.4f} ± {std_score:.4f}")