In [None]:
%pip install numpy pandas matplotlib scipy statsmodels

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Raw data file, resolved relative to the notebook's working directory.
data_path = Path('processed.hungarian.data')

# The file is read without a header row, so the column names are supplied here.
cols = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]

# '?' entries are parsed as NaN; whitespace following each delimiter is skipped.
df = pd.read_csv(
    data_path,
    header=None,
    names=cols,
    na_values=['?'],
    skipinitialspace=True,
)

num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca', 'num']
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Coerce the numeric columns; values that cannot be parsed become NaN instead of raising.
df = df.assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in num_cols})

# 'num' gets the nullable Int64 dtype; the columns in cat_cols become pandas categoricals.
df = df.astype({'num': 'Int64', **{col: 'category' for col in cat_cols}})

# Report the per-column count of missing values.
missing = df.isna().sum()
print('Missing values per column:')
print(missing)


In [None]:
# 1b visualization 
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric variables used for plotting; only rows where all four are present are kept.
vars_numeric = ['age', 'trestbps', 'chol', 'thalach']
plot_df = df[vars_numeric].dropna()

# 2x2 panel: cholesterol histogram, age density, max HR vs. age scatter,
# and resting blood pressure boxplots by age quartile.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax_hist, ax_kde, ax_scatter, ax_box = axes.ravel()

sns.histplot(plot_df['chol'], kde=False, bins=30, color='C0', ax=ax_hist)
ax_hist.set_title('Histogram: serum cholesterol (chol)')

sns.kdeplot(plot_df['age'], fill=True, color='C1', ax=ax_kde)
ax_kde.set_title('Density: age')

sns.scatterplot(x='age', y='thalach', data=plot_df, alpha=0.6, ax=ax_scatter)
ax_scatter.set_title('Scatter: age vs. thalach (max heart rate)')

# Bin age into (up to) four quantile groups; duplicate bin edges are dropped.
plot_df['age_q'] = pd.qcut(plot_df['age'], q=4, duplicates='drop')
sns.boxplot(x='age_q', y='trestbps', data=plot_df, ax=ax_box)
ax_box.set_title('Boxplot: trestbps by age quartile')

fig.tight_layout()

# Pairwise relationships between the numeric variables, drawn from a
# reproducible sample of at most 200 rows.
n_sample = min(len(plot_df), 200)
sample_df = plot_df.sample(n=n_sample, random_state=42)
sns.pairplot(sample_df[vars_numeric], diag_kind='kde', plot_kws={'alpha':0.6})
# The title and top margin are applied through pyplot to the current figure.
plt.suptitle('Pairplot: numeric variables (sample)')
plt.subplots_adjust(top=0.95)



In [None]:
# 1c simple models (mean max HR)
import numpy as np
import matplotlib.pyplot as plt

# Mean thalach overall and within each diagnosis group (num == 0 vs. num > 0).
thalach_all = df['thalach']
is_healthy = df['num'] == 0
is_diseased = df['num'] > 0

overall_mean = thalach_all.mean()
mean_no_disease = thalach_all[is_healthy].mean()
mean_with_disease = thalach_all[is_diseased].mean()
diff = mean_no_disease - mean_with_disease

print(f'Overall mean thalach: {overall_mean:.3f}')
print(f'Mean thalach (no disease): {mean_no_disease:.3f}')
print(f'Mean thalach (with disease): {mean_with_disease:.3f}')
print(f'Difference (no disease - with disease): {diff:.3f}')

# Degree-1 least-squares fit of thalach on age, using only rows where both are present.
clean_df = df[['age', 'thalach']].dropna()
x = clean_df['age'].to_numpy()
y = clean_df['thalach'].to_numpy()
slope, intercept = np.polyfit(x, y, 1)
print(f'Linear regression (thalach ~ age): slope = {slope:.4f}, intercept = {intercept:.4f}')

# Evaluate the fitted line across the observed age range.
x_line = np.linspace(x.min(), x.max(), 100)
y_line = intercept + slope * x_line

# Scatter of the data with the fitted line on top; the figure is saved before it is shown.
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x, y, alpha=0.6, label='data')
ax.plot(x_line, y_line, color='red', linewidth=2, label=f'fit: y={slope:.2f}x+{intercept:.1f}')
ax.set_xlabel('age')
ax.set_ylabel('thalach (max heart rate)')
ax.set_title('Scatter: thalach vs age with linear fit')
ax.legend()
fig.savefig('thalach_vs_age_fit.png')
plt.show()

In [None]:
# 1d 2 models
import matplotlib.pyplot as plt
import numpy as np


# Fit a separate degree-1 least-squares line (thalach ~ age) for each diagnosis group.
# Rows are split on num (== 0: no disease, > 0: with disease); rows missing age or
# thalach are dropped before fitting.
no_disease = df[df['num'] == 0][['age', 'thalach']].dropna()
with_disease = df[df['num'] > 0][['age', 'thalach']].dropna()

x_no = no_disease['age'].values
y_no = no_disease['thalach'].values
slope_no, intercept_no = np.polyfit(x_no, y_no, 1)

x_yes = with_disease['age'].values
y_yes = with_disease['thalach'].values
slope_yes, intercept_yes = np.polyfit(x_yes, y_yes, 1)

# Print both fitted models.
print('No disease model: thalach =', f'{slope_no:.2f}', '* age +', f'{intercept_no:.2f}')
print('With disease model: thalach =', f'{slope_yes:.2f}', '* age +', f'{intercept_yes:.2f}')

plt.figure(figsize=(10, 6))
plt.scatter(x_no, y_no, alpha=0.5, label='No Disease', color='blue')
plt.scatter(x_yes, y_yes, alpha=0.5, label='With Disease', color='red')

# Draw the fitted lines across the age range actually present in the plotted data.
# A fixed 20-80 range would extend the lines past the observed ages whenever the
# data covers a narrower span.
age_min = min(x_no.min(), x_yes.min())
age_max = max(x_no.max(), x_yes.max())
ages = np.linspace(age_min, age_max, 100)

# Label each line so the legend identifies the fits as well as the point clouds.
plt.plot(ages, intercept_no + slope_no * ages, 'b-', linewidth=2,
         label=f'No Disease fit: y={slope_no:.2f}x{intercept_no:+.1f}')
plt.plot(ages, intercept_yes + slope_yes * ages, 'r-', linewidth=2,
         label=f'With Disease fit: y={slope_yes:.2f}x{intercept_yes:+.1f}')

plt.xlabel('Age')
plt.ylabel('Max Heart Rate (thalach)')
plt.title('Max Heart Rate vs Age by Disease Status')
plt.legend()
plt.show()