## **Heart Disease Dataset: Clinical Features for Predicting Heart Disease Presence**

In [None]:

import pandas as pd
import os
import kagglehub  #downloading dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# imports i used Supervised learning models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# for Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# imports i used Unsupervised learning and dimensionality reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Visualization
import matplotlib.pyplot as plt


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

print("Path to dataset files:", path)


In [None]:
import pandas as pd
import os
import kagglehub

# Download dataset
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")
csv_path = os.path.join(path, "heart.csv")

# Load dataset into df
df = pd.read_csv(csv_path)

# print
print("Data loaded successfully! Shape:", df.shape)
df.head()



In [None]:
# View column info and data types
df.info()

# Statistical summary of numerical features
df.describe()

# Check for missing values
df.isnull().sum()

# Check for duplicate rows
df.duplicated().sum()


In [None]:
print(' the Dataset Description is :')
df.describe()

In [None]:
# Split features and target
X = df.drop('target', axis=1)
y = df['target']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)

### 1. **Supervised Learning** using Classification by Logistic regression

In [None]:
# Feature importance from the trained model
feature_importance = lr.coef_[0]  # using lr

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importance)), feature_importance, color='lightgreen')
plt.xlabel('Feature Index')
plt.ylabel('Importance (Coefficient Value)')
plt.title('Feature Importance using Logistic Regression')
plt.show()  # correctly displays the plot


### 1.2 Using Naive Bayes:

In [None]:
# Naive Bayes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fixing the target name
df.rename(columns={'num': 'target', 'output': 'target'}, inplace=True)

X = df.drop(columns=['target'])
y = df['target']

# now will Split and scale the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# The Results:
print("Naive Bayes Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


### 1.3 Correlation Heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10,8))
corr = df.corr()  # Correlation matrix
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", square=True)
plt.title('Feature Correlation Heatmap')
plt.show()


### **2. Unsupervised Learning** using Clustering K-means

In [None]:
###  K-Means Clustering with counts and contingency table

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# Features only no target
X = df.drop(columns=['target'])

# Step 1: Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply K-Means with 2 clusters
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Step 3: Add cluster labels to dataset
df['Cluster'] = clusters

# Step 4: Cluster label counts
print("Cluster label counts:")
print(df['Cluster'].value_counts())

# Step 5: Contingency table: clusters vs target
print("\nContingency Table (Cluster vs Target):")
print(pd.crosstab(df['Cluster'], df['target']))

# Step 6: PCA for 2D visualization
X_pca = PCA(n_components=2).fit_transform(X_scaled)

# Step 7: Colors pink & blue for clusters
colors = ['pink' if label == 0 else 'blue' for label in clusters]

# Step 8: Plot clusters in 2D
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=colors, edgecolor='k', s=50)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('K-Means Clustering of Heart Disease Data (2D PCA)')
plt.show()



In [None]:
# i will Choose number for the clusters (k)let say for ex =2
k_choice = 2

# Fit K-Means with chosen k
kmeans = KMeans(n_clusters=k_choice, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to dataset
df['Cluster'] = clusters

# Cluster label counts
print(f"Cluster label counts (k={k_choice}):")
print(df['Cluster'].value_counts())

# Contingency table: clusters vs target
print("\nContingency Table (Cluster vs Target):")
print(pd.crosstab(df['Cluster'], df['target']))


### Dimensionality Reduction (PCA)


In [None]:
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")
df = pd.read_csv(os.path.join(path, "heart.csv"))
df.rename(columns={'num':'target','output':'target'}, inplace=True)

# Features and target
X = df.drop('target', axis=1)
y = df['target']

# Standardize and apply PCA
X_pca = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))

# i Maped target values to colors: 0 = will be green (no disease), and 1 = light red (disease)
colors = ['green' if val==0 else 'lightcoral' for val in y]

# Plot PCA with custom colors
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=colors, edgecolor='k', s=50)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('Heart Disease PCA in 2D Projection')
plt.show()

### **Summery Report:**

In this Assigment , i analyzed the Heart Disease Dataset from Kaggle to predict the presence of heart disease and explore patterns in the data. First, we conducted an initial examination, checking the datasetâ€™s structure, data types, statistical summaries, and ensuring there were no missing or duplicate entries.

Features and target were separated for analysis (X for features, y for target).

1. Then we used **Supervised learning**:
I Applied Logistic Regression and Naive Bayes classifiers

in **Logistic Regression**
Train/test split: 80/20, features scaled.
Accuracy: 80%, balanced precision and recall.
Feature importance shown in a light green bar chart.
and in **Naive Bayes**
Same split and scaling.
But, the Accuracy: 78 - 80%, reasonable predictive performance.

2. In **unsupervised learning**, I performed K-Means clustering with two clusters, evaluated cluster label counts, and generated a contingency table comparing clusters to the actual target.

so the Cluster label counts displayed.
and the Contingency table comparing clusters to actual target shows relationship patterns.
additonaly to the 2D Visualization:
PCA used to reduce (dimensions to 2).
Clusters are plotted with pink and blue for visualization.


3. lastley, **Dimensionality Reduction PCA** is applied :
PCA applied to the dataset for 2D projection.
Visualization:
Patients without disease are in green, with disease in light red.
Clear separation patterns observed, supporting clustering and classification insights.

To sum up: Successfully implemented supervised (Logistic Regression, Naive Bayes) and unsupervised (K-Means) learning.
Feature scaling and PCA improved visualization and model performance.
Clustering results complemented supervised models by revealing patterns in the data.
Overall, the analysis demonstrates accurate predictions and interpretable insights into heart disease risk factors.
