In [1]:
# Cell 1: Imports & Load Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

df = pd.read_csv('FIC.Full CSV.csv')
df.head()


Unnamed: 0,Age,Age.Group,Gender,Locality,Marital status,Life.Style,Sleep,Category,Depression,Hyperlipi,...,oldpeak,slope,ca,thal,num,SK,SK.React,Reaction,Mortality,Follow.Up
0,45,41-50,Female,RURAL,MARRIED,NO,NO,FREE,YES,YES,...,3.0,2,0,7,2,1,NO,0,0,60
1,51,51-60,Female,URBAN,MARRIED,NO,NO,FREE,YES,YES,...,1.2,2,0,7,2,1,NO,0,0,15
2,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,3.4,2,0,3,2,1,NO,0,0,6
3,55,51-60,Female,RURAL,MARRIED,YES,YES,FREE,YES,YES,...,2.0,2,1,7,3,1,NO,0,0,52
4,56,51-60,Female,RURAL,MARRIED,YES,NO,FREE,YES,YES,...,4.0,3,2,7,3,1,NO,0,0,34


In [2]:
# Cell 2: Exploratory Data Analysis
#  - structure, dtypes, missing, duplicates, descriptive stats
print("Shape:", df.shape)
print(df.info())
print("\nMissing values:\n", df.isnull().sum())
print("\nDuplicates:", df.duplicated().sum())
df.describe(include='all').T


Shape: (368, 60)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 60 columns):
 #   Column                                                                                    Non-Null Count  Dtype  
---  ------                                                                                    --------------  -----  
 0   Age                                                                                       368 non-null    int64  
 1   Age.Group                                                                                 368 non-null    object 
 2   Gender                                                                                    368 non-null    object 
 3   Locality                                                                                  368 non-null    object 
 4   Marital status                                                                            368 non-null    object 
 5   Life.Style                              

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Age,368.0,,,,54.293478,8.718158,24.0,50.75,55.0,60.25,77.0
Age.Group,368.0,5.0,51-60,184.0,,,,,,,
Gender,368.0,2.0,Male,285.0,,,,,,,
Locality,368.0,2.0,URBAN,234.0,,,,,,,
Marital status,368.0,2.0,MARRIED,365.0,,,,,,,
Life.Style,368.0,2.0,YES,217.0,,,,,,,
Sleep,368.0,2.0,NO,224.0,,,,,,,
Category,368.0,2.0,FREE,331.0,,,,,,,
Depression,368.0,2.0,YES,351.0,,,,,,,
Hyperlipi,368.0,2.0,YES,341.0,,,,,,,


In [3]:
# Cell 3: Define Features & Target
target = 'Mortality'
X = df.drop(columns=[target])
y = df[target]

num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)


Numeric columns: ['Age', 'F.History', 'Diabetes', 'BP', 'Thrombolysis', 'BGR', 'B.Urea', 'S.Cr', 'S.Sodium', 'S.Potassium', 'S.Chloride', 'C.P.K', 'CK.MB', 'ESR', 'WBC', 'RBC', 'Hemoglobin', 'P.C.V', 'M.C.V', 'M.C.H', 'M.C.H.C', 'PLATELET_COUNT', 'NEUTROPHIL', 'LYMPHO', 'MONOCYTE', 'EOSINO', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num', 'SK', 'Reaction', 'Follow.Up']
Categorical columns: ['Age.Group', 'Gender', 'Locality  ', 'Marital status                       ', 'Life.Style                                                                              ', 'Sleep', 'Category', 'Depression', 'Hyperlipi', 'Smoking', 'Family.History', 'HTN', 'Allergies', 'Others ', 'CO', 'Diagnosis', 'Hypersensitivity', 'SK.React']


In [4]:
# Cell 4: Preprocessing Pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])


In [5]:
# Cell 5: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [6]:
# Cell 6: Models & Hyperparameter Grids
models = {
    'Decision Tree': (
        DecisionTreeClassifier(random_state=42),
        {
            'clf__max_depth': [None, 5, 10],
            'clf__min_samples_split': [2, 10],
            'clf__min_samples_leaf': [1, 5]
        }
    ),
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        {
            'clf__n_estimators': [100, 200],
            'clf__max_depth': [None, 10],
            'clf__min_samples_leaf': [1, 2]
        }
    ),
    'KNN': (
        KNeighborsClassifier(),
        {
            'clf__n_neighbors': [3, 5, 7],
            'clf__weights': ['uniform', 'distance']
        }
    ),
    'Logistic Regression': (
        LogisticRegression(solver='lbfgs', max_iter=1000),
        {
            'clf__C': [0.01, 0.1, 1, 10],
            'clf__penalty': ['l2']
        }
    )
}


In [7]:
# Cell 7: GridSearchCV Training & Evaluation
results = []
for name, (model, params) in models.items():
    pipe = Pipeline([('preprocessor', preprocessor), ('clf', model)])
    grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train, y_train)
    best = grid.best_estimator_
    y_pred = best.predict(X_test)
    y_proba = best.predict_proba(X_test)[:, 1] if hasattr(best, "predict_proba") else None

    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba) if y_proba is not None else None
    })

In [8]:
# Cell 8: Show full Best Params without truncation
import pandas as pd
pd.set_option('display.max_colwidth', None)
results_df = pd.DataFrame(results).set_index('Model')
results_df


Unnamed: 0_level_0,Best Params,Accuracy,Precision,Recall,F1 Score,ROC AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Decision Tree,"{'clf__max_depth': None, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 2}",0.959459,0.842105,1.0,0.914286,0.969289
Random Forest,"{'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 200}",0.972973,0.9375,0.9375,0.9375,0.996767
KNN,"{'clf__n_neighbors': 7, 'clf__weights': 'distance'}",0.945946,0.875,0.875,0.875,0.954741
Logistic Regression,"{'clf__C': 10, 'clf__penalty': 'l2'}",0.972973,0.9375,0.9375,0.9375,0.929957


In [9]:
# Cell 9: Random Forest Feature Importances
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# 1) Retrieve the best RF params
rf_params = results_df.loc['Random Forest', 'Best Params']

# 2) Build pipeline with those params
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(
        n_estimators=rf_params['clf__n_estimators'],
        max_depth=rf_params['clf__max_depth'],
        min_samples_leaf=rf_params.get('clf__min_samples_leaf', 1),
        min_samples_split=rf_params.get('clf__min_samples_split', 2),
        random_state=42
    ))
])

# 3) Fit on training data
rf_pipe.fit(X_train, y_train)

# 4) Extract and display feature importances
importances = rf_pipe.named_steps['clf'].feature_importances_
feat_names = rf_pipe.named_steps['preprocessor'].get_feature_names_out()

imp_df = pd.DataFrame({
    'Feature': feat_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("\nTop 10 Random Forest Features:\n")
print(imp_df.head(10))



Top 10 Random Forest Features:

                 Feature  Importance
0               num__Age    0.074577
28             num__chol    0.074430
44  cat__Age.Group_51-60    0.053072
43  cat__Age.Group_41-50    0.047663
27         num__trestbps    0.037910
31          num__thalach    0.037685
35               num__ca    0.034614
40        num__Follow.Up    0.033008
33          num__oldpeak    0.032655
2          num__Diabetes    0.024692
