In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the read-only input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the patient table from the competition's read-only input directory.
heart_disease = pd.read_csv(
    '/kaggle/input/k-means-clustering-for-heart-disease-analysis/heart_disease.csv'
)

In [None]:
# Show only the first rows: enough to inspect the columns without dumping the whole table.
heart_disease.head()

In [None]:
# Columns that must never be used as clustering features:
#   - 'id' is a row identifier (it is used as the key when the submission is built),
#   - 'cluster' is this notebook's own output, written back onto heart_disease later on;
#     excluding it keeps this cell idempotent when it is re-run after clustering.
non_feature_cols = ['id', 'cluster']
feature_frame = heart_disease.drop(columns=non_feature_cols, errors='ignore')

# Split the remaining feature columns by dtype so each group gets its own preprocessing.
numerical_cols = feature_frame.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = feature_frame.select_dtypes(include=["object"]).columns

In [None]:
# Number of distinct values per categorical column, computed once and reused below.
cardinality = {col: heart_disease[col].nunique() for col in categorical_cols}

# Bucket the categorical columns by cardinality; each bucket gets its own encoder:
#   <= 3 distinct values  -> one-hot encoding
#   4..9 distinct values  -> ordinal encoding
#   >= 10 distinct values -> high-cardinality encoding
ohe_cols = [col for col, n_unique in cardinality.items() if n_unique <= 3]
ordinal_cols = [col for col, n_unique in cardinality.items() if 3 < n_unique < 10]
high_cardinality_cols = [col for col, n_unique in cardinality.items() if n_unique >= 10]

In [None]:
from sklearn.impute import SimpleImputer

# Numeric features: fill missing values with the column median.
num_cols_transformer = SimpleImputer(strategy="median")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from category_encoders import CountEncoder, TargetEncoder

# Low-cardinality categoricals: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen at fit time are encoded as all zeros instead of raising.
ohe_cols_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Medium-cardinality categoricals: fill gaps, then map each category to an integer code.
# Categories unseen at fit time are encoded as -1.
ordinal_cols_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# High-cardinality categoricals: fill gaps, then replace each category by its relative
# frequency. This step used to be a TargetEncoder, which is a supervised encoder that
# needs a target `y` to fit; the preprocessor in this notebook is fitted without one
# (clustering has no target), so an unsupervised encoder is required here.
high_cardinality_cols_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('freq', CountEncoder(normalize=True))
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Route each column group through its own transformer. Columns that belong to none of the
# groups are dropped from the feature matrix (remainder='drop' is the ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_cols_transformer, numerical_cols),
        ('ohe', ohe_cols_transformer, ohe_cols),
        ('ord', ordinal_cols_transformer, ordinal_cols),
        ('hcc', high_cardinality_cols_transformer, high_cardinality_cols),
    ],
    remainder='drop',
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Step 1: impute and encode the raw table into a purely numeric feature matrix.
X = preprocessor.fit_transform(heart_disease)

# Step 2: standardise every feature so no single column dominates the distance computation.
X_scaled = StandardScaler().fit_transform(X)

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit. The model has always been built with 3 clusters; the comment
# that used to sit here said "2 clusters: patient has heart disease or not", which
# contradicted the code.
# NOTE(review): confirm the intended number of clusters before changing this value.
N_CLUSTERS = 3

# n_init is set explicitly because scikit-learn's default differs between versions
# (10 in older releases, 'auto' in newer ones); pinning it keeps results comparable
# across environments. random_state fixes the centroid initialisation.
model = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0)

In [None]:
# Fit K-Means on the scaled features; labels_ holds the cluster index assigned to each row.
clusters = model.fit(X_scaled).labels_

# Store the assignment next to the original patient records (adds a 'cluster' column in place).
heart_disease['cluster'] = clusters

In [None]:
# Build the submission file: one (id, cluster) row per id listed in sample.csv.
sample = pd.read_csv('/kaggle/input/k-means-clustering-for-heart-disease-analysis/sample.csv')

# Left-merge starting from the sample ids so the output follows sample.csv's row order.
# Filtering heart_disease with isin() would instead follow heart_disease's order and
# silently drop any requested id that has no cluster.
# (Assumes ids are unique in heart_disease -- TODO confirm.)
sample_clusters = sample[['id']].merge(
    heart_disease[['id', 'cluster']],
    on='id',
    how='left',
)

# Fail loudly instead of writing an incomplete submission.
missing = sample_clusters['cluster'].isna()
if missing.any():
    raise ValueError(
        f"{int(missing.sum())} id(s) from sample.csv have no cluster assignment"
    )

submission = sample_clusters[['id', 'cluster']]

submission.to_csv('submission.csv', index=False)