# Type
* Bagging: </br>
  Bagging involves training multiple models in parallel on different subsets of the data and averaging their predictions.
* Boosting: </br>
  Boosting trains models sequentially, each trying to correct the errors of the previous one.

# Example
* Bagging: random forest
* Boosting: gradient boosting classifier

## Load Library

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Load Data

In [2]:
!gdown --id 132GTYyJnbc9vNvDV_yLmQbk4G_BE4iPP -O /content/heart_disease.csv

Downloading...
From: https://drive.google.com/uc?id=132GTYyJnbc9vNvDV_yLmQbk4G_BE4iPP
To: /content/heart_disease.csv
100% 79.3k/79.3k [00:00<00:00, 6.28MB/s]


In [3]:
df = pd.read_csv('heart_disease.csv')

In [4]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [5]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

## Preprocessing

In [6]:
# split dependent and independent variables
X = df.drop('num', axis=1)
y = df['num']

In [7]:
# handling missing value (imputation methods)
# Numerical features
num_features = X.select_dtypes(include=['int64', 'float64']).columns
num_transformer = SimpleImputer(strategy='mean')

# Categorical features
cat_features = X.select_dtypes(include=['object']).columns
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # categorical to digital data
])

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

In [8]:
X = preprocessor.fit_transform(X)

In [9]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model

In [11]:
# Random Forest for Bagging
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Gradient Boosting for Boosting
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)
gradient_boosting.fit(X_train, y_train)

## Prediction

In [12]:
# Predictions
rf_predictions = random_forest.predict(X_test)
gb_predictions = gradient_boosting.predict(X_test)

## Evaluation

In [15]:
# Evaluating Random Forest Model
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average="macro")
rf_recall = recall_score(y_test, rf_predictions, average="macro")
rf_f1 = f1_score(y_test, rf_predictions, average="macro")

# Evaluating Gradient Boosting Model
gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_precision = precision_score(y_test, gb_predictions, average="macro")
gb_recall = recall_score(y_test, gb_predictions, average="macro")
gb_f1 = f1_score(y_test, gb_predictions, average="macro")

# Print the evaluation metrics for both models
print("Random Forest Performance:")
print(f'Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1 Score: {rf_f1}\n')

print("Gradient Boosting Performance:")
print(f'Accuracy: {gb_accuracy}, Precision: {gb_precision}, Recall: {gb_recall}, F1 Score: {gb_f1}')

Random Forest Performance:
Accuracy: 0.6195652173913043, Precision: 0.4044900146293335, Recall: 0.40068376068376066, F1 Score: 0.397376171352075

Gradient Boosting Performance:
Accuracy: 0.6195652173913043, Precision: 0.5250287797089721, Recall: 0.4516980056980057, F1 Score: 0.46958346815234114
