In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

In [33]:
def build_pipeline(max_depth=5, random_state=42):
    '''
    Builds a pipeline for the Cardiovascular model

    The pipeline chains a ColumnTransformer (robust scaling of the
    non-object columns, ordinal encoding of the ordered categories and
    one-hot encoding of the remaining categorical columns) with a
    DecisionTreeClassifier.

    Parameters
    ----------
    max_depth : int, default 5
        Maximum depth of the decision tree.
    random_state : int or None, default 42
        Seed forwarded to the decision tree so that repeated fits on the
        same data give the same tree. Pass None for an unseeded tree.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted preprocessing + classifier pipeline.
    '''

    # Numerical transformer: applies to every column whose dtype is not `object`
    num_columns = make_column_selector(dtype_exclude=['object'])

    # Explicit category orderings (lowest to highest) for the ordinal columns
    General_Health_sorted = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
    age_category_sorted = ['young', 'adult', 'old']

    # Ordinal and nominal categorical columns
    ord_cols = ['General_Health', 'age_category']
    cat_cols = ['Smoking_History', 'Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex']

    # The one-hot encoder keeps its default (sparse) output and the
    # ColumnTransformer is told to always densify via `sparse_threshold=0`.
    # This avoids the OneHotEncoder `sparse=` keyword, which was renamed to
    # `sparse_output=` in scikit-learn 1.2 and later removed, so the same code
    # runs on either side of that rename.
    preprocessing_pipeline = ColumnTransformer(
        transformers=[
            ('numerical', RobustScaler(), num_columns),
            # Categories outside the declared orderings are encoded as -1
            ('ordinal', OrdinalEncoder(categories=[General_Health_sorted, age_category_sorted], handle_unknown='use_encoded_value', unknown_value=-1), ord_cols),
            # Binary columns collapse to a single 0/1 column; unseen categories encode as all zeros
            ('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), cat_cols),
        ],
        sparse_threshold=0,
    )

    clf = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=random_state,
    )

    pipeline = make_pipeline(preprocessing_pipeline, clf)

    return pipeline

In [34]:
# Relative location of the CVD_cleaned.csv dataset
path = '../../raw_data/CVD_cleaned.csv'

# Read the CSV into a DataFrame, then create the (still unfitted) model pipeline
data = pd.read_csv(filepath_or_buffer=path)
pipeline = build_pipeline()

In [35]:
def age_process(x):
    '''
    Collapse a raw age-bracket label into one of three coarse age groups.

    Returns 'young' for the 18-34 brackets, 'adult' for the 35-59 brackets
    and 'old' for every other value (including labels not listed here).
    '''
    young_brackets = ('18-24', '25-29', '30-34')
    adult_brackets = ('35-39', '40-44', '45-49', '50-54', '55-59')

    if x in young_brackets:
        return 'young'
    if x in adult_brackets:
        return 'adult'
    # Anything not matched above falls through to the oldest group
    return 'old'


# Clean the loaded frame once. The guard makes this cell safe to re-run:
# after the first run 'Age_Category' is gone, and repeating the in-place steps
# would de-duplicate the already reduced frame and then raise a KeyError.
if 'Age_Category' in data.columns:
    data = (
        data
        .drop_duplicates()
        # Collapse the raw age brackets into the coarse 'young'/'adult'/'old' groups
        .assign(age_category=lambda df: df['Age_Category'].map(age_process))
        .drop(columns=['Age_Category', 'Checkup', 'BMI'])
    )

# Features: every remaining column except the target
X = data.drop(columns=['Heart_Disease'])

# Target kept as a one-column DataFrame because the encoder expects 2-D input
y = data[['Heart_Disease']]

# One-hot encode the target; a binary target collapses to a single 0/1 column.
# `.toarray()` densifies the encoder's default sparse output and replaces the
# `sparse=False` keyword, which newer scikit-learn releases no longer accept.
y_encoded = OneHotEncoder(drop='if_binary', handle_unknown='ignore').fit_transform(y).toarray()

pipeline.fit(X, y_encoded)



In [37]:
# Re-encode the target as a dense array (a single 0/1 column when the target
# is binary). `.toarray()` on the encoder's default sparse output replaces the
# `sparse=False` keyword, which newer scikit-learn releases no longer accept.
y_encoded = OneHotEncoder(drop='if_binary', handle_unknown='ignore').fit_transform(y).toarray()



In [38]:
# Predict with the fitted pipeline on X, the same feature frame it was fitted on
# (in-sample predictions, not a held-out evaluation).
# NOTE(review): the saved output below shows string labels ('No'), whereas this
# notebook fits the pipeline on the numeric `y_encoded` — the output looks stale;
# re-run from a fresh kernel to confirm.
pipeline.predict(X)

array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)