In [14]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Load framingham.csv from the working directory into the notebook-wide frame.
df = pd.read_csv("framingham.csv")

# Show the number of missing values in each column before any cleaning.
missing_counts = df.isna().sum()
print(missing_counts)


male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64


In [19]:
# Columns that contain missing values, grouped by the fill strategy they get.
numeric_cols = ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']  # continuous -> column mean
categorical_cols = ['education', 'BPMeds']  # categorical -> most frequent value

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its own column group and write the filled values back.
for _imputer, _cols in ((num_imputer, numeric_cols), (cat_imputer, categorical_cols)):
    df[_cols] = _imputer.fit_transform(df[_cols])

# Re-check the per-column missing-value counts after imputation.
print(df.isna().sum())

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64


In [None]:
# No need to handle outliers: we're using an XGBoost model, which can cope with them.

In [20]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize with StandardScaler.
# NOTE(review): 'cigsPerDay' is listed as continuous in the imputation cell but
# is not scaled here — confirm that this is intentional.
scale_cols = ['age', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']

# Learn the per-column statistics, then overwrite the columns with their
# standardized values.
scaler = StandardScaler()
scaler.fit(df[scale_cols])
df[scale_cols] = scaler.transform(df[scale_cols])

In [21]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# SMOTENC is the SMOTE variant for data that mixes continuous and categorical
# features; it is imported here so this cell brings its own dependency.
from imblearn.over_sampling import SMOTENC

# Separate features and target
X = df.drop('TenYearCHD', axis=1)
y = df['TenYearCHD']

# The binary flags and the education level are category codes, not measurements.
# Plain SMOTE interpolates every column, which would give synthetic rows invalid
# fractional values in these columns (e.g. male = 0.37). Declaring them as
# categorical makes SMOTENC assign each synthetic row an existing category.
smote_categorical_cols = [
    'male', 'education', 'currentSmoker', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes',
]
# Positional indices are used because they are the most widely supported way to
# specify categorical features.
categorical_idx = [X.columns.get_loc(col) for col in smote_categorical_cols]

# Oversample the full dataset.
# NOTE(review): resampling happens before any train/test split, so a split made
# later from resampled.csv would put synthetic rows in the evaluation set —
# confirm this is intended.
smote = SMOTENC(categorical_features=categorical_idx, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled features and target into one DataFrame
resampled_df = pd.concat([
    pd.DataFrame(X_resampled, columns=X.columns),
    pd.DataFrame(y_resampled, columns=['TenYearCHD'])
], axis=1)

# Save to CSV
resampled_df.to_csv('resampled.csv', index=False)
