In [5]:
data = 'https://github.com/matisolica/Proyecto-final-DS2/raw/refs/heads/main/framingham_heart_study.csv'
data = pd.read_csv(data, sep=',') # Importo el dataset

In [6]:
# Import essential libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical computations
import seaborn as sns  # For data visualizations
import matplotlib.pyplot as plt  # For plotting
from sklearn.utils import resample # For oversampling

# Import tools for train-test split
from sklearn.model_selection import train_test_split

# Import tools for data preprocessing
from sklearn.impute import KNNImputer  # For imputing missing values using K-Nearest Neighbors
from sklearn.preprocessing import PowerTransformer  # For power transformation (Box-Cox or Yeo-Johnson)
from sklearn.compose import ColumnTransformer  # For applying transformations to specific columns
from sklearn.pipeline import Pipeline  # For creating machine learning pipelines


# Import classification models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier

# Import metrics and PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA


In [8]:
# EDA
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [9]:
data.shape

(4240, 16)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB


In [11]:
data.isna().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [12]:
data['TenYearCHD'].value_counts()

0    3596
1     644
Name: TenYearCHD, dtype: int64

In [13]:
# clearly its unbalanced data, so oversample the minority


In [14]:
majority_data= data[ data['TenYearCHD'] == 0 ]
minority_data= data[ data['TenYearCHD'] == 1 ]

In [15]:
majority_data.shape, minority_data.shape

((3596, 16), (644, 16))

In [16]:
upsampled_minority_data= resample(minority_data, replace=True, n_samples=len(majority_data), random_state=42)

In [17]:
# Reset the index of the upsampled minority data
upsampled_minority_data = upsampled_minority_data.reset_index(drop=True)

# Reset index for majority data too (if necessary)
majority_data = majority_data.reset_index(drop=True)

# Combine the datasets into balanced_data
balanced_data = pd.concat([majority_data, upsampled_minority_data]).reset_index(drop=True)

In [18]:
balanced_data.shape

(7192, 16)

In [19]:
balanced_data['TenYearCHD'].value_counts()

0    3596
1    3596
Name: TenYearCHD, dtype: int64

In [20]:
balanced_data['TenYearCHD']

0       0
1       0
2       0
3       0
4       0
       ..
7187    1
7188    1
7189    1
7190    1
7191    1
Name: TenYearCHD, Length: 7192, dtype: int64

In [21]:
# Shuffle the balanced_data
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

In [22]:
# EDA on balanced_data

In [23]:
balanced_data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,59,1.0,0,0.0,0.0,0,0,0,281.0,130.5,86.0,25.11,80.0,83.0,0
1,0,60,3.0,0,0.0,0.0,0,0,0,328.0,127.0,70.0,22.36,75.0,63.0,1
2,0,50,4.0,0,0.0,0.0,0,0,0,243.0,131.0,80.0,23.24,110.0,,0
3,0,38,2.0,1,3.0,0.0,0,0,0,180.0,115.0,86.0,24.91,70.0,,0
4,1,69,1.0,1,1.0,0.0,0,0,0,245.0,123.0,77.0,26.58,70.0,81.0,1


In [24]:
balanced_data.shape

(7192, 16)

In [25]:
balanced_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7192 entries, 0 to 7191
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             7192 non-null   int64  
 1   age              7192 non-null   int64  
 2   education        7029 non-null   float64
 3   currentSmoker    7192 non-null   int64  
 4   cigsPerDay       7154 non-null   float64
 5   BPMeds           7068 non-null   float64
 6   prevalentStroke  7192 non-null   int64  
 7   prevalentHyp     7192 non-null   int64  
 8   diabetes         7192 non-null   int64  
 9   totChol          7116 non-null   float64
 10  sysBP            7192 non-null   float64
 11  diaBP            7192 non-null   float64
 12  BMI              7138 non-null   float64
 13  heartRate        7183 non-null   float64
 14  glucose          6602 non-null   float64
 15  TenYearCHD       7192 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 899.1 KB


In [26]:
balanced_data.isna().sum()

male                 0
age                  0
education          163
currentSmoker        0
cigsPerDay          38
BPMeds             124
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             76
sysBP                0
diaBP                0
BMI                 54
heartRate            9
glucose            590
TenYearCHD           0
dtype: int64

In [27]:
# Get the features and labels
X= balanced_data.drop('TenYearCHD', axis=1)
y= balanced_data['TenYearCHD']

In [28]:
# Identify continuous numeric columns
continuous_numeric_features = [col for col in X.columns if X[col].nunique() >= 10]

# Identify the rest of the numeric features
other_numeric_features = [col for col in X.columns if col not in continuous_numeric_features]

In [29]:
# See the value counts of other_numeric_features
for col in other_numeric_features:
    print(f"{balanced_data[col].value_counts()}\n")

0    3788
1    3404
Name: male, dtype: int64

1.0    3214
2.0    1924
3.0    1106
4.0     785
Name: education, dtype: int64

1    3685
0    3507
Name: currentSmoker, dtype: int64

0.0    6782
1.0     286
Name: BPMeds, dtype: int64

0    7124
1      68
Name: prevalentStroke, dtype: int64

0    4393
1    2799
Name: prevalentHyp, dtype: int64

0    6882
1     310
Name: diabetes, dtype: int64



In [30]:
# Train test split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)

In [31]:
# Define a pipeline for continuous numeric transformations
continuous_numeric_transformer = Pipeline([
    ('knn_imputer', KNNImputer(weights='distance')),  # Impute missing values using KNN with distance weighting
    ('power_transformer', PowerTransformer())  # Apply power transformation for normalization
])


In [32]:
# Define a pipeline for other numeric transformations
other_numeric_transformer = Pipeline([
    ('knn_imputer', KNNImputer(weights='distance'))  # Impute missing values using KNN with distance weighting
])


In [33]:
# Define a ColumnTransformer for preprocessing different numeric feature groups
preprocessor = ColumnTransformer([
    ('continuous_numeric_transformer', continuous_numeric_transformer, continuous_numeric_features),  # Apply continuous numeric transformations
    ('other_numeric_transformer', other_numeric_transformer, other_numeric_features)  # Apply other numeric transformations
])


In [34]:
# Define a list of classification models to evaluate
models = [
    LogisticRegression(), SGDClassifier(), SVC(), KNeighborsClassifier(),
    DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier(),
    AdaBoostClassifier(), LGBMClassifier(), XGBClassifier(), CatBoostClassifier()
]

# Initialize variables to track the best model and F1 score
best_model = None
best_f1 = 0

# Iterate through each model
for model in models:
    # Create a pipeline with the preprocessor and the current model
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Fit the pipeline on training data
    pipe.fit(X_train, y_train)

    # Make predictions on test data
    y_preds = pipe.predict(X_test)

    # Calculate the F1 score
    f1_sc = f1_score(y_test, y_preds)

    # Update the best model if the current F1 score is better
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_model = model

    # Print the model and its F1 score
    print(f"{model} := {f1_sc}\n")


LogisticRegression() := 0.6574202496532594

SGDClassifier() := 0.6512928022361985

SVC() := 0.7088096839273705

KNeighborsClassifier() := 0.8133911965282083

DecisionTreeClassifier() := 0.9047923322683706

RandomForestClassifier() := 0.9725274725274725

GradientBoostingClassifier() := 0.7367003367003367





AdaBoostClassifier() := 0.672627235213205

[LightGBM] [Info] Number of positive: 2883, number of negative: 2870
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1191
[LightGBM] [Info] Number of data points in the train set: 5753, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501130 -> initscore=0.004519
[LightGBM] [Info] Start training from score 0.004519
LGBMClassifier() := 0.881578947368421

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interac

In [35]:
best_model, best_f1

(RandomForestClassifier(), 0.9725274725274725)

In [36]:
# Find the best number of components for PCA based on F1 score
max_f1 = 0
best_comps = 0

# Iterate over possible number of components for PCA (1 to 15)
for i in range(1, 16):
    # Create a pipeline with preprocessor, PCA, and the best model
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('PCA', PCA(n_components=i)),  # Apply PCA with i components
        ('model', best_model),
    ])

    # Fit the pipeline on training data
    pipe.fit(X_train, y_train)
    
    # Make predictions on test data
    y_preds = pipe.predict(X_test)
    
    # Calculate the F1 score
    f1_sc = f1_score(y_test, y_preds)

    # Update best components if the current F1 score is higher
    if f1_sc > max_f1:
        max_f1 = f1_sc
        best_comps = i

    # Print the F1 score for the current number of components
    print(f"{i} comps := {f1_sc}")

# Print the best number of components and the corresponding F1 score
print(f"\n\nBest comps: {best_comps}, F1: {max_f1}")


1 comps := 0.9111969111969112
2 comps := 0.9365918097754293
3 comps := 0.9465954606141522
4 comps := 0.9554655870445344
5 comps := 0.9619565217391305
6 comps := 0.9679180887372013
7 comps := 0.9672578444747613
8 comps := 0.971898560657985
9 comps := 0.9718599862731641
10 comps := 0.9779310344827586
11 comps := 0.9752407152682255
12 comps := 0.9799585348997927
13 comps := 0.9779310344827586
14 comps := 0.9752066115702479
15 comps := 0.9772570640937285


Best comps: 12, F1: 0.9799585348997927


In [37]:
# Create a pipeline with preprocessor, PCA (with best components), and the best model
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('PCA', PCA(n_components=best_comps)),  # Apply PCA with the best number of components
    ('model', best_model)
])

# Fit the pipeline on training data
pipe.fit(X_train, y_train)

# Make predictions on test data
y_preds = pipe.predict(X_test)

# Calculate various performance metrics
acc_sc = accuracy_score(y_test, y_preds)
prc_sc = precision_score(y_test, y_preds)
rcl_sc = recall_score(y_test, y_preds)
f1_sc = f1_score(y_test, y_preds)

# Print the performance metrics
print(f"Accuracy: {acc_sc}\nPrecision: {prc_sc}\nRecall: {rcl_sc}\nF1: {f1_sc}")


Accuracy: 0.9756775538568451
Precision: 0.959349593495935
Recall: 0.9929873772791024
F1: 0.9758787043418332


# **The journey of a thousand lines of code begins with a single keystroke. 🚀✨**