# 1. Data Exploration

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load dataset (the path is resolved relative to the notebook's working directory)
data = pd.read_csv('heart_disease_uci.csv')

# Display first 5 rows
print(data.head())

# Check for missing values and data types.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()
print(data.isnull().sum())


FileNotFoundError: [Errno 2] No such file or directory: 'heart_disease_uci.csv'

# 2. Handling Missing Data

In [None]:
# Impute missing values in 'thal' and 'ca' columns with the most frequent value.
# The filled column is assigned back to `data` rather than calling
# fillna(inplace=True) on data[col]: the chained in-place form acts on an
# intermediate object, emits a FutureWarning in pandas 2.x and leaves `data`
# unchanged when Copy-on-Write is enabled.
for col in ['thal', 'ca']:
    modes = data[col].mode()
    # mode() returns an empty Series when the column holds no non-missing values;
    # in that case there is nothing to impute with, so the column is left as is.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

# 3. Feature Creation


In [None]:
def _three_level_bucket(values, lower, upper, labels):
    """Bucket a numeric Series into three labelled levels.

    values < lower      -> labels[0]
    values <= upper     -> labels[1]
    anything larger     -> labels[2]

    Missing values stay missing: every comparison against NaN is False, so a
    plain nested np.where would otherwise file them under the last label.
    Returns a Series aligned to ``values.index``.
    """
    low_label, mid_label, high_label = labels
    bucketed = np.where(values < lower, low_label,
                        np.where(values <= upper, mid_label, high_label))
    return pd.Series(bucketed, index=values.index).where(values.notna())


# Age Group: '<40', '40-60' (inclusive of 60), '>60'
data['AgeGroup'] = _three_level_bucket(data['age'], 40, 60, ('<40', '40-60', '>60'))

# Cholesterol Level: 'Low' below 200, 'Normal' up to and including 240, 'High' above
data['CholLevel'] = _three_level_bucket(data['chol'], 200, 240, ('Low', 'Normal', 'High'))

# IsRisk Binary Feature: 1 when any single threshold is exceeded, else 0.
# A missing measurement compares as False, so it never raises the flag on its own.
data['IsRisk'] = np.where((data['chol'] > 240) | (data['trestbps'] > 140) | (data['age'] > 60), 1, 0)


# 4. Feature Transformation


In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Label Encoding for categorical features.
# Values are cast to str before encoding, so a missing entry is encoded as its
# own category rather than raising.
label_cols = ['sex', 'cp', 'thal', 'AgeGroup']
# Keep every fitted encoder (column name -> encoder) so the integer codes can be
# mapped back with inverse_transform or re-applied to new data; previously each
# encoder was overwritten on the next loop pass and only the last one survived.
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Normalizing numerical features to the [0, 1] range.
# After this step 'chol', 'trestbps' and 'thalch' no longer hold their original
# units; use scaler.inverse_transform to recover them.
scaler = MinMaxScaler()
data[['chol', 'trestbps', 'thalch']] = scaler.fit_transform(data[['chol', 'trestbps', 'thalch']])


# 5. Feature Interaction


In [None]:
# BP-Chol Interaction (product of the two min-max scaled columns)
data['BP_Chol_Interaction'] = data['trestbps'] * data['chol']

# Exercise-Induced Angina Risk
# 'thalch' was min-max scaled to [0, 1] in the Feature Transformation step, so
# comparing the scaled column with 100 is true for every non-missing row and the
# threshold would have no effect. Map the scaled values back to their original
# units with the fitted scaler before applying the threshold.
scaled_cols = ['chol', 'trestbps', 'thalch']  # same columns, same order, as the scaler was fitted on
raw_values = scaler.inverse_transform(data[scaled_cols])
thalch_raw = raw_values[:, scaled_cols.index('thalch')]
# A missing thalch compares as False, so such rows get 0.
data['ExAngina_Risk'] = ((data['exang'] == 1) & (thalch_raw < 100)).astype(int)

# 6. Feature Selection (Optional)


In [None]:
# Placeholder selection step: features would normally be removed based on
# correlation or feature-importance analysis. 'oldpeak' is dropped purely as an
# example; revise the list once that analysis has been done.
data = data.drop(columns=['oldpeak'])
