In [235]:
# Standard library
import os
from pathlib import Path

# Scientific stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# scikit-learn: preprocessing, model selection, metrics
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Models
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [236]:
# Root folder that holds the raw CSV files. Set the HEART_DATA_DIR environment
# variable to point somewhere else; the default is the folder the notebook was
# originally written against, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get('HEART_DATA_DIR', '/Users/taijieshengwu/2024Research_SU/Data'))

# heart_2020
heart_2020_cleaned = pd.read_csv(DATA_DIR / '2020' / 'heart_2020_cleaned.csv')

# heart_2022
heart_2022 = pd.read_csv(DATA_DIR / '2022' / 'heart_2022_no_nans.csv')

# heart
heart = pd.read_csv(DATA_DIR / 'heart.csv')

# Give the label column the same name ("output") in the 2020 and 2022 frames
# so the helper functions below can rely on it.
heart_2020_cleaned = heart_2020_cleaned.rename(columns={"HeartDisease":"output"})
heart_2022 = heart_2022.rename(columns={"HadHeartAttack":"output"})

In [237]:
def create_pipeline(X):
    """
    Scale the numeric features and one-hot encode the categorical features of X.

    The 'output' column is treated as the label: it is never scaled or encoded
    and is appended unchanged as the last column of the result.

    Parameters:
    X (DataFrame): Dataset containing the feature columns and an 'output' column.
        Columns of dtype object/bool are one-hot encoded, columns of dtype
        int64/float64 are standardized. Columns of any other dtype are dropped
        (ColumnTransformer's default ``remainder='drop'``).

    Returns:
    X_transformed (DataFrame): Scaled numeric columns, then the one-hot encoded
        columns, then 'output'. The index is reset to 0..n-1.

    Raises:
    ValueError: If X has no 'output' column.
    """
    if 'output' not in X.columns:
        raise ValueError("The dataset must contain an 'output' column.")

    # Take the label out before looking at dtypes. This works whatever the
    # dtype of 'output' is, and keeps a numeric label from being scaled and
    # then duplicated in the result.
    features = X.drop(columns='output')

    # Determine feature types
    categorical_features = features.select_dtypes(include=['object', 'bool']).columns.tolist()
    numeric_features = features.select_dtypes(include=['int64', 'float64']).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ],
        # Always return a dense array. With the default threshold (0.3) a
        # sparse matrix can come back, which cannot be wrapped in a DataFrame
        # with named columns below.
        sparse_threshold=0,
    )
    # Fit the preprocessor and transform the features
    transformed = preprocessor.fit_transform(features)

    # Generate new column names for one-hot encoded features. When there are no
    # categorical columns the encoder is never fitted, so there is nothing to ask.
    if categorical_features:
        encoded_categorical_features = list(
            preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
        )
    else:
        encoded_categorical_features = []
    all_feature_names = numeric_features + encoded_categorical_features

    # Create a DataFrame with the new feature names and re-attach the label.
    # The label index is reset so it lines up with the fresh RangeIndex.
    X_transformed = pd.DataFrame(transformed, columns=all_feature_names)
    X_transformed = pd.concat([X_transformed, X["output"].reset_index(drop=True)], axis=1)

    return X_transformed

# Preprocess the 2020 dataset: scaled numeric features, one-hot encoded
# categorical features, and the untouched 'output' label as the last column.
heart_2020 = create_pipeline(heart_2020_cleaned)

In [238]:
import numpy as np
import pandas as pd

def over_sampling(X, random_state=42):
    """
    Oversamples the smaller classes in the 'output' column to balance the classes.

    Every class other than the most frequent one is topped up, by sampling its
    own rows with replacement, until it has as many rows as the most frequent
    class. All original rows are kept. Rows whose 'output' is missing are not
    part of any class and are left out of the result.

    For a two-class dataset with a unique index the result is identical to the
    previous implementation for the same random_state.

    Parameters:
    X (DataFrame): The dataset containing the 'output' column.
    random_state (int): Seed used by the random number generator.

    Returns:
    X_resampled (DataFrame): Shuffled dataset with balanced classes in the
        'output' column and a fresh 0..n-1 index.

    Raises:
    ValueError: If X has no 'output' column.
    """
    # Ensure 'output' column is present
    if 'output' not in X.columns:
        raise ValueError("The dataset must contain an 'output' column.")

    class_counts = X['output'].value_counts()
    if class_counts.empty:
        # No labelled rows at all: nothing to balance.
        return X.iloc[0:0].reset_index(drop=True)

    # Use a private generator instead of np.random.seed so the caller's global
    # NumPy random state is left untouched. RandomState draws the same numbers
    # the seeded global generator would.
    rng = np.random.RandomState(random_state)

    majority_class = class_counts.idxmax()
    n_majority = int(class_counts.max())

    pieces = [X[X['output'] == majority_class]]
    for label, count in class_counts.items():
        # Skip the majority class (already added) and unused categories.
        if label == majority_class or count == 0:
            continue

        X_class = X[X['output'] == label]
        n_to_add = n_majority - len(X_class)

        # Sample row *positions*, not index labels: label-based lookup returns
        # extra rows when the index contains duplicates.
        positions = rng.choice(len(X_class), size=n_to_add, replace=True)
        pieces.extend([X_class, X_class.iloc[positions]])

    X_resampled = pd.concat(pieces)

    # Shuffle so the classes are not grouped together, then renumber the rows.
    X_resampled = X_resampled.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return X_resampled

def _report_class_balance(stage, frame):
    """Print the shape of `frame` and the value counts of its 'output' column."""
    print(f"{stage} heart_2020 shape:", frame.shape)
    print(f"{stage} 'output' class distribution:")
    print(frame['output'].value_counts())


# Class balance before oversampling
_report_class_balance("Original", heart_2020)

heart_2020_resampled = over_sampling(heart_2020)

# Class balance after oversampling
_report_class_balance("Resampled", heart_2020_resampled)

Original heart_2020 shape: (319795, 51)
Original 'output' class distribution:
output
No     292422
Yes     27373
Name: count, dtype: int64
Resampled heart_2020 shape: (584844, 51)
Resampled 'output' class distribution:
output
Yes    292422
No     292422
Name: count, dtype: int64


In [239]:
# Mapping from the string labels to the integer targets used for modelling.
output_map = {
    'Yes':1,
    'No':0
}
ouput_map = output_map  # original (misspelled) name, kept in case other cells refer to it

# Encode only while the labels are still non-numeric. Mapping labels that are
# already 0/1 would turn every one of them into NaN, so without this guard the
# cell could not safely be run twice.
output_labels = heart_2020_resampled['output']
if not pd.api.types.is_numeric_dtype(output_labels):
    output_encoded = output_labels.map(output_map)

    # Fail loudly on labels missing from the mapping instead of silently
    # carrying NaN targets into training.
    unknown_labels = output_labels[output_encoded.isna()].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unexpected values in 'output': {list(unknown_labels)}")

    # Drop and re-add so 'output' ends up as the last column, as before.
    heart_2020_resampled = (
        heart_2020_resampled
        .drop(columns=['output'])
        .assign(output=output_encoded.astype('int64'))
    )
heart_2020_resampled


Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,Stroke_No,Stroke_Yes,...,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer_No,SkinCancer_Yes,output
0,0.779505,0.833659,-0.490039,-1.460354,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1
1,1.517380,-0.424070,-0.490039,-0.067601,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
2,-0.647473,-0.046751,0.012776,-0.067601,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0
3,0.147040,-0.424070,-0.490039,0.628776,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
4,-0.276176,-0.172524,-0.112928,0.628776,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
584839,1.045391,0.204795,2.024033,-0.763977,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0
584840,0.471139,-0.172524,-0.238631,-0.763977,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
584841,-1.383775,-0.424070,-0.490039,0.628776,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1
584842,-0.307642,2.217161,-0.490039,-0.763977,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0


In [242]:
# Separate the target from the features; the folds themselves are built later
# with StratifiedKFold inside train_validate_model.
# NOTE(review): heart_2020_resampled was oversampled before this split, so
# copies of the same row can land in both the training and the validation part
# of a fold, which makes the validation RMSE look better than it is.
y_2020 = heart_2020_resampled['output']
X_2020 = heart_2020_resampled.drop(columns=['output'])

# Supported model names mapped to a factory that takes the random seed.
# The lambdas keep construction lazy, so a brand-new estimator is built for
# every fold and nothing is instantiated for models that are never requested.
# 'SVM' (SVR) is not registered; it was disabled in the original code.
_MODEL_FACTORIES = {
    'random forest': lambda seed: RandomForestRegressor(random_state=seed),
    'decision tree': lambda seed: DecisionTreeRegressor(random_state=seed),
    'logistic regression': lambda seed: LogisticRegression(random_state=seed, max_iter=1000),
    'XGBoost': lambda seed: XGBRegressor(random_state=seed),
    'Gradient Boosting': lambda seed: GradientBoostingRegressor(random_state=seed),
    'AdaBoost': lambda seed: AdaBoostRegressor(random_state=seed),
    'LightGBM': lambda seed: LGBMRegressor(random_state=seed),
    'CatBoost': lambda seed: CatBoostRegressor(random_state=seed, verbose=0),
    'Ridge': lambda seed: Ridge(random_state=seed),
    'Lasso': lambda seed: Lasso(random_state=seed),
}


def train_validate_model(n_splits, X_train, y_train, model_name, random_state=45):
    """
    Cross-validate one model with stratified K-fold and report its RMSE.

    A fresh model is trained on each fold's training part and scored on the
    held-out part. The RMSE of every fold and the average are printed.

    Parameters:
    n_splits (int): Number of stratified folds.
    X_train (DataFrame): Feature matrix.
    y_train (Series): Target values; also used to stratify the folds.
    model_name (str): One of the keys of _MODEL_FACTORIES.
    random_state (int): Seed for the fold shuffling and for the model.

    Returns:
    average_rmse (float): Mean of the per-fold RMSE values.

    Raises:
    ValueError: If model_name is not a supported model.

    NOTE(review): 'logistic regression' is a classifier, so its predictions are
    hard class labels while the other models return continuous values; its
    RMSE is therefore not directly comparable with theirs.
    NOTE(review): if X_train contains duplicated rows (for example after
    oversampling), copies of one row can appear on both sides of a fold and
    the reported RMSE will be optimistic.
    """
    # Validate the name before doing any work, not inside the fold loop.
    if model_name not in _MODEL_FACTORIES:
        raise ValueError(f"Unknown model name: {model_name}")
    make_model = _MODEL_FACTORIES[model_name]

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state).split(X_train, y_train)
    rmse_list = []

    for train_idx, val_idx in folds:
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train a new model on this fold
        model = make_model(random_state)
        model.fit(X_tr, y_tr)

        # Predict on the validation set
        y_pred = model.predict(X_val)

        # Calculate RMSE. The square root is taken explicitly because the
        # `squared=False` argument of mean_squared_error is deprecated and
        # no longer exists in recent scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        rmse_list.append(rmse)

        print(f'Fold RMSE: {rmse}')

    average_rmse = np.mean(rmse_list)
    print(f'Average RMSE: {average_rmse}')

    return average_rmse

def plot_rmse(models, rmses):
    """Show a bar chart of the average RMSE per model, with each value written above its bar."""
    _, ax = plt.subplots(figsize=(12, 6))
    ax.bar(models, rmses, color='skyblue')
    ax.set_xlabel('Models')
    ax.set_ylabel('Average RMSE')
    ax.set_title('Average RMSE for Different Models')

    # Annotate every bar with its value, slightly above the bar top
    for position, value in enumerate(rmses):
        ax.text(position, value + 0.01, f'{value:.4f}', ha='center', va='bottom')

    # Tilt the model names
    plt.xticks(rotation=45)
    plt.show()

# Names of the models to benchmark ('SVM' is left out)
models = [
    'random forest',
    'decision tree',
    'logistic regression',
    'XGBoost',
    'Gradient Boosting',
    'AdaBoost',
    'LightGBM',
    'CatBoost',
    'Ridge',
    'Lasso',
]
average_rmses = []

# 5-fold cross-validation for every model, collecting the average RMSE of each
for model_name in models:
    print(f"Training and validating {model_name} model")
    average_rmse = train_validate_model(
        n_splits=5,
        X_train=X_2020,
        y_train=y_2020,
        model_name=model_name,
    )
    average_rmses.append(average_rmse)
    print(f'Final Average RMSE for {model_name}: {average_rmse}\n')

# Compare the models side by side
plot_rmse(models, average_rmses)


Training and validating random forest model


## question
1. The dataset is imbalanced, which leads to poorer performance on the "Yes" class. How can we solve this?

Raja's suggestions:
- Undersampling.
- SMOTE oversampling to balance the classes.
- Meanwhile, keep the sizes of the three datasets close to each other.


3. We need to try other models.
4. In previous studies, how did the authors compare the importance of personal behaviors with that of exact numeric measurements?


5. How should we measure the performance of the model?
-- For regression: RMSE

question 0807
我有一个想法，不用那个小数据集，直接研究2020和2022的大数据集，研究影响心脏病的因素和影响因素的变化趋势--问问助教

