# Predicting Heart Disease

Link to competition: https://www.kaggle.com/competitions/playground-series-s6e2/overview

## Imports

In [None]:
# Core data-handling libraries.
import pandas as pd
import numpy as np

# Plotting stack.
import matplotlib
import matplotlib.pyplot as plt

# Default (width, height) for every matplotlib figure created in this notebook.
matplotlib.rcParams['figure.figsize'] = (12, 6)

import seaborn as sns

# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from the libraries imported here.
import warnings
warnings.filterwarnings('ignore')

# Show up to 100 columns before pandas truncates a wide frame.
pd.set_option('display.max_columns', 100)

import xgboost as xgb

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, classification_report
# NOTE(review): train_test_split is imported a second time on the next line.
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.cluster import KMeans

# Project-local helpers. The wildcard import hides which names come from
# `common` (presumably copy_data, clean_data, get_target, ... - TODO confirm).
from common import *

In [None]:
# Report the interpreter and library versions so runs can be reproduced.
from platform import python_version
import matplotlib
import sklearn

# (label, version) pairs, listed in the order they are printed.
_version_report = (
    ('python: ', python_version()),
    ('pandas: ', pd.__version__),
    ('numpy: ', np.__version__),
    ('matplotlib: ', matplotlib.__version__),
    ('seaborn: ', sns.__version__),
    ('sklearn: ', sklearn.__version__),
    ('xgboost: ', xgb.__version__),
)
for _label, _version in _version_report:
    print(_label, _version)

## Helpers

## Load data

In [None]:
# Read the competition's train and test splits from the local archive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('archive/train.csv', 'archive/test.csv')
)

In [None]:
# Display the raw training frame as loaded from archive/train.csv.
train_df

### Look for NaN values

In [None]:
# Number of missing values in each column of the raw training data.
train_df.isnull().sum() 

### Look for duplicates

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
train_df.duplicated().sum()

## Call the pipeline

In [None]:
# Preprocessing steps, applied to train_df in this order.
# remove_outliers is currently disabled (it was commented out of the chain).
_pipeline_steps = (
    copy_data,
    clean_data,
    remove_duplicates,
    make_new_features,
)

# Thread the frame through each step via DataFrame.pipe; bind `df` only once
# the whole pipeline has succeeded.
_frame = train_df
for _step in _pipeline_steps:
    _frame = _frame.pipe(_step)
df = _frame

## Features

In [None]:
# Target identifier from the get_target() helper; used below as a column key into df.
target = get_target()

In [None]:
# Feature names for the processed frame, as returned by the get_features() helper.
features = get_features(df)

In [None]:
# Display the feature list.
features

In [None]:
# No feature is treated as categorical yet, so every loop over
# categorical_features in this notebook is a no-op until this list is filled in.
categorical_features = []

In [None]:
# Every feature not listed as categorical is treated as numerical;
# the original feature order is preserved.
numerical_features = list(
    filter(lambda name: name not in categorical_features, features)
)

In [None]:
# Display the categorical feature names.
categorical_features

In [None]:
# Display the numerical feature names.
numerical_features

### Look at column types

In [None]:
# Print column dtypes and non-null counts for the processed frame.
df.info()

## Describe

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the processed frame.
df.describe()

In [None]:
# One horizontal bar chart of value counts per categorical feature.
# Each feature gets its own titled figure: without plt.show() inside the loop,
# every iteration would draw onto the same Axes and the charts would overlap.
for cat in categorical_features:
    sns.barplot(data=df[cat].value_counts(), orient='h')
    plt.title(cat)
    plt.show()

## Distributions (train vs test)

In [None]:
# Overlay the train and test histogram of every training column to spot
# distribution shift between the two splits.
# Columns missing from the test split are plotted for train only.
# Labels, a legend and a title are required to tell the two overlaid
# histograms apart and to know which column a figure shows.
for col in train_df.columns:
    sns.histplot(train_df[col], label='train')
    if col in test_df.columns:
        sns.histplot(test_df[col], label='test')
    plt.title(col)
    plt.legend()
    plt.show()

## Frequency Encoding

In [None]:
# Frequency-encode the training columns in place: <col>_freq holds, for each
# row, how many rows of train_df share that row's value in <col>.
# The column names are snapshotted before the loop and existing *_freq columns
# are skipped, so re-running this cell refreshes the same columns instead of
# stacking up <col>_freq_freq columns.
_base_columns = [c for c in train_df.columns if not str(c).endswith('_freq')]
for col in _base_columns:
    train_df[f'{col}_freq'] = train_df[col].map(train_df[col].value_counts())

In [None]:
# Display train_df, which now also carries the *_freq columns added in the previous cell.
train_df

In [None]:
# Print the value counts of every categorical feature in the raw training frame.
# NOTE(review): categorical_features is derived from `features` (built from the
# processed `df`), but this indexes the raw `train_df`; this presumably fails
# if the pipeline renames columns - TODO confirm.
for col in categorical_features:
      print(f"{col}:\n{train_df[col].value_counts()}\n")

## Bar Charts

## Kmeans

## Scatter Plots

## Correlation Plots

In [None]:
# Class balance of the 'heart_disease' column.
# NOTE(review): presumably the same column as `target` - TODO confirm.
df['heart_disease'].value_counts()

In [None]:
# Pairwise correlations between the numeric columns of the processed frame.
corr_matrix = df.corr(numeric_only=True)

# Heatmap cells where the mask is True are hidden: drop weak correlations
# (absolute value below 0.15) so only the stronger relationships are annotated.
mask = corr_matrix.abs() < 0.15

sns.heatmap(corr_matrix, annot=True, fmt='.2f', mask=mask)

## Looking for interactions

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
corr_matrix[target].sort_values(ascending=False)

In [None]:
# Count of each target value within every 'thallium' group.
df.groupby('thallium')[target].value_counts()

In [None]:
# Histogram of 'thallium', coloured by target value.
sns.histplot(data=df, x='thallium', hue=target) 

In [None]:
# Count of each target value within every distinct 'max_hr' value.
df.groupby('max_hr')[target].value_counts()

In [None]:
# Histogram of 'max_hr', coloured by target value, with a KDE curve overlaid.
sns.histplot(data=df, x='max_hr', hue=target, kde=True)

In [None]:
# Count of each target value within every 'chest_pain_type' group.
df.groupby('chest_pain_type')[target].value_counts()

In [None]:
# Histogram of 'chest_pain_type', coloured by target value.
sns.histplot(data=df, x='chest_pain_type', hue=target)

In [None]:
# Mean of the target within each category of every categorical feature,
# sorted ascending so the categories with the lowest target mean come first.
# Uses the notebook's `target` column: the previously hard-coded 'exam_score'
# is not referenced anywhere else in this notebook, while the neighbouring
# interaction cells all group on `target`. df[target] is numeric here, since
# it is selected from the numeric-only correlation matrix above.
for cat in categorical_features:
    print(df.groupby(cat)[target].mean().sort_values())

## Compare train vs orig

In [None]:
# Summary statistics of the training frame.
# NOTE(review): by this point train_df also contains the *_freq columns added
# in the Frequency Encoding section, so they are included in this summary.
train_df.describe()