In [37]:
import pandas as pd
import numpy as np   # for numerical operations
import plotly.express as px  # for quick interactive plots
import plotly.graph_objects as go  # for custom Plotly visualizations
from sklearn.model_selection import train_test_split  # for splitting dataset
from sklearn.impute import SimpleImputer  # to handle missing values
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder  # preprocessing tools
from sklearn.compose import ColumnTransformer  # to apply different transforms to columns
from sklearn.pipeline import Pipeline  # to build preprocessing pipelines
from sklearn.metrics import classification_report, cohen_kappa_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from xgboost import XGBClassifier
import plotly.figure_factory as ff
import warnings  # Warnings module is used to manage warnings in the code
warnings.filterwarnings("ignore")  # This line suppresses warnings to avoid clutter in the output

In [38]:
df = pd.read_csv('/content/heart_disease_prediction_dataset.csv')
df

Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholesterol,fasting_bs,rest_ecg,max_hr,exercise_angina,oldpeak,...,ca,thal,glucose,bmi,smoking,alcohol_intake,physical_activity,stress_level,family_history,target
0,67,female,atypical_angina,170,204,0,ST-T_abnormality,182,0,4.4,...,0,fixed_defect,177,25.6,0,0,0,5,1,0
1,57,female,atypical_angina,103,286,1,ST-T_abnormality,161,0,4.0,...,0,reversible_defect,97,18.6,1,0,0,5,0,1
2,43,female,typical_angina,160,166,0,left_ventricular_hypertrophy,130,1,5.7,...,1,normal,193,24.9,1,0,1,1,1,0
3,71,male,non_anginal,175,298,0,ST-T_abnormality,160,0,0.4,...,0,normal,146,35.0,0,0,0,5,0,1
4,36,female,atypical_angina,140,290,1,ST-T_abnormality,125,1,1.0,...,2,fixed_defect,144,36.5,0,1,1,8,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,31,male,asymptomatic,123,192,0,normal,136,0,5.4,...,2,reversible_defect,163,21.4,0,0,1,6,0,0
4996,58,male,typical_angina,174,287,1,ST-T_abnormality,126,0,1.4,...,1,normal,165,20.4,1,0,1,6,0,1
4997,29,male,typical_angina,177,223,0,ST-T_abnormality,124,0,3.9,...,2,reversible_defect,182,39.0,0,1,0,3,0,0
4998,58,male,typical_angina,168,163,1,left_ventricular_hypertrophy,168,1,0.6,...,2,fixed_defect,153,27.3,1,1,0,1,1,0


In [39]:
print("\nData types and missing values:")
print(df.info())  # info about dataset

print("\nSummary statistics:")
print(df.describe())  # summary of numeric features

# Target distribution if present
if 'target' in df.columns:
    print("\nTarget value counts:")
    print(df['target'].value_counts())

# Missing values check
print("\nMissing values per column:")
print(df.isnull().sum())


Data types and missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                5000 non-null   int64  
 1   sex                5000 non-null   object 
 2   chest_pain_type    5000 non-null   object 
 3   resting_bp         5000 non-null   int64  
 4   cholesterol        5000 non-null   int64  
 5   fasting_bs         5000 non-null   int64  
 6   rest_ecg           5000 non-null   object 
 7   max_hr             5000 non-null   int64  
 8   exercise_angina    5000 non-null   int64  
 9   oldpeak            5000 non-null   float64
 10  slope              5000 non-null   int64  
 11  ca                 5000 non-null   int64  
 12  thal               5000 non-null   object 
 13  glucose            5000 non-null   int64  
 14  bmi                5000 non-null   float64
 15  smoking            5000 non-null   int64

In [40]:
# 3. Visualizations (Plotly)
# =========================

# Numeric feature histograms
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
for col in numeric_cols:
    fig = px.histogram(df, x=col, marginal="box", nbins=40,
                       title=f"Distribution of {col}")
    fig.show()

In [41]:
# Correlation heatmap
if len(numeric_cols) > 1:
    corr = df[numeric_cols].corr()
    fig = px.imshow(corr, text_auto=True,
                    title="Correlation Heatmap")
    fig.show()


In [42]:
# Violin plots by target
if 'target' in df.columns:
    for col in numeric_cols[:3]:  # first 3 numeric features
        fig = px.violin(df, y=col, color='target', box=True, points='all',
                        title=f"{col} by Target")
        fig.show()

In [43]:
# --- Pie chart for target variable ---
if 'target' in df.columns:
    fig = px.pie(df, names='target', title='Target Variable Distribution', hole=0.3)
    fig.show()


In [44]:
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# --- Pie charts for categorical features ---
for col in categorical_cols:
    fig = px.pie(df, names=col, title=f"Distribution of {col}", hole=0.3)
    fig.show()

In [45]:
# --- Boxplots for numeric vs target ---
if 'target' in df.columns:
    for col in numeric_cols:
        fig = px.box(df, x='target', y=col, color='target',
                     title=f"{col} by Target")
        fig.show()


In [46]:
# --- Grouped bar chart: categorical feature vs target ---
if 'target' in df.columns:
    for col in categorical_cols:
        fig = px.bar(df.groupby([col, 'target']).size().reset_index(name='count'),
                     x=col, y='count', color='target', barmode='group',
                     title=f"{col} by Target")
        fig.show()

In [47]:
# =========================
# 4. Preprocessing
# =========================

if 'target' in df.columns:
    X = df.drop(columns=['target'])
    y = df['target']
else:
    X = df.copy()
    y = None

if y is not None and not np.issubdtype(y.dtype, np.number):
    le = LabelEncoder()
    y = le.fit_transform(y)

num_features = X.select_dtypes(include=np.number).columns.tolist()
cat_features = [c for c in X.columns if c not in num_features]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical preprocessing: fill missing + one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # updated for sklearn >= 1.2
])

In [48]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ])

X_preprocessed = preprocessor.fit_transform(X)

ohe_cols = []
if cat_features:
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    ohe_cols = ohe.get_feature_names_out(cat_features).tolist()

final_feature_names = num_features + ohe_cols
X_proc_df = pd.DataFrame(X_preprocessed, columns=final_feature_names)

print("\nPreprocessed features preview:")
print(X_proc_df.head())

print(f"\nOriginal shape: {X.shape}, Preprocessed shape: {X_preprocessed.shape}")


Preprocessed features preview:
        age  resting_bp  cholesterol  fasting_bs    max_hr  exercise_angina  \
0  1.043233    1.370466    -0.485823   -1.018572  1.053715        -0.982159   
1  0.318212   -1.211959     1.395699    0.981766  0.338084        -0.982159   
2 -0.696818    0.985029    -1.357747   -1.018572 -0.718323         1.018165   
3  1.333242    1.563184     1.671044   -1.018572  0.304007        -0.982159   
4 -1.204333    0.214156     1.487481    0.981766 -0.888711         1.018165   

    oldpeak     slope        ca   glucose  ...  chest_pain_type_asymptomatic  \
0  0.748355  1.223238 -1.354006  1.102638  ...                           0.0   
1  0.524216 -0.010362 -1.354006 -1.025093  ...                           0.0   
2  1.476806  1.223238 -0.461333  1.528184  ...                           0.0   
3 -1.493034  1.223238 -1.354006  0.278142  ...                           0.0   
4 -1.156825  1.223238  0.431340  0.224949  ...                           0.0   

   chest_pai

In [49]:
# 3. Define models & hyperparameters
# =========================
models = {
    'KNN': (KNeighborsClassifier(), {
        'model__n_neighbors': [3,7],
        'model__weights': ['uniform', 'distance'],
        'model__metric': ['minkowski', 'euclidean']
    }),
    'SGD': (SGDClassifier(max_iter=1000, tol=1e-3), {
        'model__loss': ['modified_huber'],
        'model__alpha': [0.0001],
        'model__penalty': ['l1', 'elasticnet']
    }),
    'XGB': (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
        'model__n_estimators': [100, 200],
        'model__max_depth': [3],
        'model__learning_rate': [0.2],
        'model__subsample': [0.8]
    }),
    'LogisticRegression': (LogisticRegression(max_iter=500), {
        'model__C': [0.01, 0.1, 1, 10],
        'model__penalty': ['l2'],
        'model__solver': [ 'liblinear']
    })
}

# =========================
# 4. Grid Search & Training
# =========================
best_models = {}

for name, (model, params) in models.items():
    print(f"\n=== Grid Search for {name} ===")
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])
    grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X, y)  # Using full dataset
    print(f"Best Parameters: {grid.best_params_}")
    print(f"Best CV Accuracy: {grid.best_score_:.4f}")
    best_models[name] = grid.best_estimator_



=== Grid Search for KNN ===
Best Parameters: {'model__metric': 'minkowski', 'model__n_neighbors': 3, 'model__weights': 'uniform'}
Best CV Accuracy: 0.5426

=== Grid Search for SGD ===
Best Parameters: {'model__alpha': 0.0001, 'model__loss': 'modified_huber', 'model__penalty': 'l1'}
Best CV Accuracy: 0.5796

=== Grid Search for XGB ===
Best Parameters: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 0.8}
Best CV Accuracy: 0.5660

=== Grid Search for LogisticRegression ===
Best Parameters: {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best CV Accuracy: 0.5980


In [50]:
# 5. Final Evaluation
# =========================
for name, model in best_models.items():
    print(f"\n=== Final Model Evaluation: {name} ===")
    y_pred = model.predict(X)  # Predictions on the same dataset
    print(classification_report(y, y_pred))
    print("Kappa Score:", cohen_kappa_score(y, y_pred))

    # Confusion matrix
    cm = confusion_matrix(y, y_pred)
    labels = sorted(np.unique(y))
    fig = ff.create_annotated_heatmap(
        z=cm,
        x=[f"Pred {l}" for l in labels],
        y=[f"True {l}" for l in labels],
        colorscale='Blues'
    )
    fig.update_layout(title=f"Confusion Matrix - {name}")
    fig.show()


=== Final Model Evaluation: KNN ===
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      2994
           1       0.75      0.66      0.70      2006

    accuracy                           0.77      5000
   macro avg       0.77      0.76      0.76      5000
weighted avg       0.77      0.77      0.77      5000

Kappa Score: 0.520260693138888



=== Final Model Evaluation: SGD ===
              precision    recall  f1-score   support

           0       0.60      0.50      0.55      2994
           1       0.40      0.50      0.45      2006

    accuracy                           0.50      5000
   macro avg       0.50      0.50      0.50      5000
weighted avg       0.52      0.50      0.51      5000

Kappa Score: 0.005771329326960006



=== Final Model Evaluation: XGB ===
              precision    recall  f1-score   support

           0       0.69      0.94      0.80      2994
           1       0.81      0.36      0.49      2006

    accuracy                           0.71      5000
   macro avg       0.75      0.65      0.65      5000
weighted avg       0.74      0.71      0.67      5000

Kappa Score: 0.331556444896945



=== Final Model Evaluation: LogisticRegression ===
              precision    recall  f1-score   support

           0       0.60      1.00      0.75      2994
           1       0.33      0.00      0.00      2006

    accuracy                           0.60      5000
   macro avg       0.47      0.50      0.37      5000
weighted avg       0.49      0.60      0.45      5000

Kappa Score: -0.0002029310580602406
