In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the stroke dataset; the relative path is resolved against the
# notebook's current working directory.
url = "healthcare-dataset-stroke-data.csv"
data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the first five rows of the raw data (five is head()'s default).
print(data.head(n=5))

      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None


In [None]:
# Count the missing values of every column in one vectorised pass, then
# report them column by column.

for column_name, n_missing in data.isnull().sum().items():
    print(f'There are {n_missing} null values in the {column_name} column')

There are 0 null values in the id column
There are 0 null values in the gender column
There are 0 null values in the age column
There are 0 null values in the hypertension column
There are 0 null values in the heart_disease column
There are 0 null values in the ever_married column
There are 0 null values in the work_type column
There are 0 null values in the Residence_type column
There are 0 null values in the avg_glucose_level column
There are 201 null values in the bmi column
There are 0 null values in the smoking_status column
There are 0 null values in the stroke column


In [None]:
# Drop the rows with missing values (only `bmi` has nulls). Re-binding the name
# instead of using inplace=True avoids mutating the loaded frame in place.
data = data.dropna()
# One-hot encode the categorical columns. dtype=float keeps every column
# numeric: pandas >= 2.0 emits bool dummies by default, and mixing bool with
# numeric columns turns `X.values` into a slow object-dtype array.
data = pd.get_dummies(data, drop_first=True, dtype=float)

# `id` is only a row identifier and carries no predictive signal, so it is
# excluded from the features together with the target column.
X = data.drop(columns=['stroke', 'id'])
y = data['stroke']
# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

loading and cleaning data
- we load the dataset and remove rows with missing values using dropna
- we use pd.get_dummies to convert categorical columns into numerical representations for easier processing

feature and target Selection
- we define X as the features and y as the target (stroke)

Data Splitting
- using train_test_split, we create training and testing sets with 80-20 split

In [None]:
class DecisionTreeCustom:
    """Small regression tree used as the weak learner for gradient boosting.

    The tree is grown greedily: at every node the (feature, threshold) pair
    that minimises the squared error of predicting each side by its own mean
    is chosen. Samples with ``x[feature] > threshold`` go to the right child,
    all others to the left child. Growth stops at ``maxDepth`` levels, or
    earlier when no split reduces the error.

    Attributes:
        maxDepth: maximum number of split levels (0 gives a single leaf).
        tree: flat list of node dicts; index 0 is the root after ``fit``.
            Each node has the keys ``feature``, ``threshold``, ``left``,
            ``right`` (child indices into this list) and ``value`` (mean of
            the targets that reached the node). Leaves have ``feature=None``.
        featureIndex: feature used by the root split, or None if the root is
            a leaf.
        threshold: threshold used by the root split, or None if the root is
            a leaf.
    """

    def __init__(self, maxDepth=3):
        self.maxDepth = maxDepth
        self.tree = []
        self.featureIndex = None
        self.threshold = None

    def fit(self, X, y, depth=0):
        """Grow the tree on the given samples.

        Args:
            X: 2-D array-like of shape (n_samples, n_features). It is cast to
                float, so object/bool arrays are accepted. It is expected to
                contain no NaN.
            y: 1-D array-like of n_samples numeric targets (the residuals when
                used inside gradient boosting).
            depth: depth assigned to the root node; the tree may grow
                ``maxDepth - depth`` further levels.

        Raises:
            ValueError: if X is not 2-D or X and y disagree on the number of
                samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2 or X.shape[0] != y.shape[0]:
            raise ValueError("X must be 2-D with one row per element of y")

        # Start from a clean slate so that re-fitting does not accumulate nodes.
        self.tree = []
        self._grow(X, y, depth)
        root = self.tree[0]
        self.featureIndex = root["feature"]
        self.threshold = root["threshold"]

    def _grow(self, X, y, depth):
        """Append the subtree for (X, y) to ``self.tree``; return its root index."""
        node = {
            "feature": None,
            "threshold": None,
            "left": None,
            "right": None,
            "value": float(y.mean()) if y.size else 0.0,
        }
        index = len(self.tree)
        self.tree.append(node)

        if depth >= self.maxDepth or y.size < 2:
            return index
        split = self._best_split(X, y)
        if split is None:
            # No split lowers the squared error (e.g. constant targets).
            return index

        feature, threshold = split
        goes_right = X[:, feature] > threshold
        node["feature"] = feature
        node["threshold"] = threshold
        node["left"] = self._grow(X[~goes_right], y[~goes_right], depth + 1)
        node["right"] = self._grow(X[goes_right], y[goes_right], depth + 1)
        return index

    @staticmethod
    def _best_split(X, y):
        """Return the (feature, threshold) with the lowest squared error, or None.

        For a split into L and R, SSE = sum(y^2) - sum(L)^2/|L| - sum(R)^2/|R|,
        so minimising the SSE equals maximising the last two terms. Sorting a
        feature once and taking cumulative sums scores every threshold of that
        feature in a single vectorised pass.
        """
        n_samples, n_features = X.shape
        total = y.sum()
        left_counts = np.arange(1, n_samples, dtype=float)
        right_counts = n_samples - left_counts
        baseline = total ** 2 / n_samples  # score of not splitting at all
        # A split must beat the baseline by more than floating-point noise.
        best_gain = 1e-12 * float(np.dot(y, y))
        best = None

        for feature in range(n_features):
            order = np.argsort(X[:, feature], kind="stable")
            xs = X[order, feature]
            # Only positions between two distinct values separate any samples.
            can_split = xs[:-1] < xs[1:]
            if not can_split.any():
                continue
            left_sums = np.cumsum(y[order])[:-1]
            right_sums = total - left_sums
            score = left_sums ** 2 / left_counts + right_sums ** 2 / right_counts
            score = np.where(can_split, score, -np.inf)
            pos = int(np.argmax(score))
            gain = score[pos] - baseline
            if gain > best_gain:
                best_gain = gain
                best = (feature, float(xs[pos]))
        return best

    def predict(self, X):
        """Return the leaf value (mean training target) for every row of X.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            1-D float array with one prediction per row.

        Raises:
            AttributeError: if called before ``fit`` (same exception type the
                unfitted object raised before).
        """
        if not self.tree:
            raise AttributeError("DecisionTreeCustom is not fitted yet; call fit() first")
        X = np.asarray(X, dtype=float)
        out = np.empty(X.shape[0], dtype=float)

        # Route row indices down the tree with an explicit stack.
        stack = [(0, np.arange(X.shape[0]))]
        while stack:
            index, rows = stack.pop()
            node = self.tree[index]
            if node["feature"] is None:
                out[rows] = node["value"]
                continue
            goes_right = X[rows, node["feature"]] > node["threshold"]
            stack.append((node["left"], rows[~goes_right]))
            stack.append((node["right"], rows[goes_right]))
        return out

Decision Stump
- this custom decision tree serves as the "weak learner" in Gradient Boosting

maxDepth
- Limits the depth of the decision tree to prevent overfitting

fit() method
- it trains the decision tree by trying every feature and every candidate threshold and keeping the split with the lowest mean squared error (MSE) against the targets it is given (in gradient boosting these targets are the residuals)

prediction
- given input data, predict checks if each value is above the threshold and returns a binary prediction (1 or 0)

In [None]:
class GradientBoostingClassifier:
    """Binary classifier built by gradient boosting with squared-error loss.

    The model starts from the mean of the 0/1 targets and repeatedly fits a
    ``DecisionTreeCustom`` to the current residuals, adding its scaled output
    to the running prediction. The final score is thresholded at 0.5.

    Args:
        numEstimators: number of boosting rounds (weak learners).
        learningRate: factor applied to each weak learner's output.
        maxDepth: depth limit passed to every weak learner.
        verbose: when True (default), print the first five residuals and
            predictions of every boosting round.
    """

    def __init__(self, numEstimators=5, learningRate=0.1, maxDepth=3, verbose=True):
        self.numEstimators = numEstimators
        self.learningRate = learningRate
        self.maxDepth = maxDepth
        self.verbose = verbose
        self.trees = []
        # Baseline F_0; 0.0 until fit() replaces it with the target mean.
        self.initialPrediction = 0.0

    def fit(self, X, y):
        """Fit the ensemble.

        Args:
            X: 2-D array of shape (n_samples, n_features).
            y: 1-D array of n_samples binary (0/1) targets.

        Raises:
            ValueError: if y is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        if y.size == 0:
            raise ValueError("cannot fit on an empty target array")

        # Reset state so that calling fit() again does not stack new trees on
        # top of the ones from the previous fit.
        self.trees = []
        self.initialPrediction = float(y.mean())
        preds = np.full(y.shape, self.initialPrediction)

        for i in range(self.numEstimators):
            residuals = y - preds
            if self.verbose:
                print(f"Iteration {i + 1}")
                print("Residuals:", residuals[:5])

            tree = DecisionTreeCustom(maxDepth=self.maxDepth)
            tree.fit(X, residuals)
            self.trees.append(tree)

            preds = preds + self.learningRate * tree.predict(X)

            if self.verbose:
                print("Updated Predictions:", preds[:5])
                print("-" * 40)

    def predict(self, X):
        """Return the predicted class (0 or 1) for every row of X.

        The raw score is the baseline stored by ``fit`` plus the scaled output
        of every tree -- the same quantity ``fit`` tracks while training.
        """
        X = np.asarray(X)
        y_pred = np.full(X.shape[0], self.initialPrediction, dtype=float)
        for tree in self.trees:
            y_pred = y_pred + self.learningRate * tree.predict(X)
        return np.where(y_pred > 0.5, 1, 0)

numEstimators (n_estimators)
- the number of weak learners (decision trees) to use

learningRate (learning_rate)
- controls the contribution of each tree to the final prediction

trees (the models)
- the list that stores the trained weak learners

fit method
- the algorithm starts with the mean of the target values as the initial prediction for every sample
- for each iteration, the residuals (actual values minus current predictions) are computed and a weak learner (decision tree) is trained on them
- the predictions from this learner are scaled by the learning rate and added to the current predictions, which changes the residuals the next learner sees
- the trained weak learner is also appended to the list of models

In [None]:
# Build the ensemble with only a few estimators so a debugging run stays quick,
# then train it on the NumPy arrays behind the training split.
model = GradientBoostingClassifier(
    numEstimators=10,
    learningRate=0.1,
    maxDepth=3,
)
model.fit(X_train.values, y_train.values)

Iteration 1
Residuals: [-0.03972498 -0.03972498 -0.03972498 -0.03972498 -0.03972498]
Updated Predictions: [0.13972498 0.13972498 0.13972498 0.13972498 0.13972498]
----------------------------------------
Iteration 2
Residuals: [-0.13972498 -0.13972498 -0.13972498 -0.13972498 -0.13972498]
Updated Predictions: [0.23972498 0.23972498 0.23972498 0.23972498 0.23972498]
----------------------------------------
Iteration 3
Residuals: [-0.23972498 -0.23972498 -0.23972498 -0.23972498 -0.23972498]
Updated Predictions: [0.33972498 0.33972498 0.33972498 0.33972498 0.33972498]
----------------------------------------
Iteration 4
Residuals: [-0.33972498 -0.33972498 -0.33972498 -0.33972498 -0.33972498]
Updated Predictions: [0.43972498 0.43972498 0.43972498 0.43972498 0.43972498]
----------------------------------------
Iteration 5
Residuals: [-0.43972498 -0.43972498 -0.43972498 -0.43972498 -0.43972498]
Updated Predictions: [0.53972498 0.53972498 0.53972498 0.53972498 0.53972498]
---------------------

Residuals
- Each iteration outputs the first five values of the residuals to illustrate how much error remains after the current prediction.

Updated Predictions
- After updating with the stump’s prediction, it shows the first five updated predictions to help track learning progress over the boosting iterations.

Iteration Information
- The print statement at the beginning of each iteration clarifies which boosting round is currently running.

# DESCRIPTION

gradient boosting is an ensemble technique that iteratively constructs a number of small models, frequently decision stumps, each of which corrects the mistakes made by the models before it. by incorporating new models that concentrate on residual errors, the objective is to reduce the discrepancy between predicted and actual results.

at each iteration the algorithm does as follows:

- determines the residuals, or the discrepancies between the current predictions and the actual target values.
- fits a weak learner (such as a decision stump) to these residuals.
- adds the new learner's predictions to the existing ones, scaling them according to the learning rate, a hyperparameter that regulates each learner's contribution.


key concepts:

weak learner
- a simple model (e.g., a shallow decision tree) that performs slightly better than random guessing

residuals
- the difference between the actual values and the predicted values.

learning rate
- a scaling factor that controls how much each new model contributes to the overall prediction.

additive training
- models are trained sequentially, with each new model added to the ensemble to correct previous errors.

# Pseudocode for Gradient Boosting



```
Input: Training data (X, y), number of estimators (n_estimators), learning rate (learning_rate)
Output: A trained Gradient Boosting model

1. Initialize predictions as the mean of target values:
   F_0(x) = mean(y)

2. For each estimator (m = 1 to n_estimators):
   a. Compute the residuals for each data point:
      residual = y - F_(m-1)(x)

   b. Train a weak learner (decision tree) on the residuals:
      tree_m = TrainDecisionTree(X, residual)

   c. Update predictions with the new weak learner's output scaled by the learning rate:
      F_m(x) = F_(m-1)(x) + learning_rate * tree_m.predict(X)

   d. Store the trained tree_m

3. Final prediction for a new data point x':
   F(x') = F_0(x') + sum of (learning_rate * tree_m.predict(x')) for all m estimators

```



initialization
- to provide a baseline, begin with a straightforward prediction, such as the target variable's mean

residual calculation
- determine the difference between the current predictions and the actual target values for each iteration

weak learner training
- train a basic model, or weak learner, to predict these residuals

update predictions
- add the new learner's predictions, weighted by the learning rate, to the current predictions

repeat
- keep going until all n_estimators weak learners have been added

# Differences Between Gradient Boosting and Random Forests

1. model combination

- with gradient boosting, models are constructed one after the other, with each new model concentrating on the residual errors of the ones that came before it. we refer to this sequential process as "boosting."
- in contrast, random Forests construct each tree separately before averaging their predictions to produce a final outcome. We refer to this parallel strategy as "bagging."

2. prediction strategy

- gradient Boosting generates predictions by adding together all of the weak learners' predictions, each of which is weighted by a learning rate.
- random forests use the majority vote (for classification) or average (for regression) across all trees to generate predictions.

3. complexity and interpretability

- because each weak learner concentrates on fixing certain mistakes from the previous stage, gradient boosting frequently results in a more complex final model. although it needs to be carefully tuned (e.g. the number of iterations and learning rate), it typically performs better on more difficult problems.
- because Random Forests average independent trees, they are easier to understand and more resilient to overfitting, which allows them to be applied to a greater range of situations without requiring a lot of fine-tuning.

Source:
GeeksforGeeks. (2024, March 6). Gradient Boosting vs Random Forest. GeeksforGeeks. https://www.geeksforgeeks.org/gradient-boosting-vs-random-forest/

‌

in short, random forests are parallel and concentrate on averaging results from independent models, whereas gradient boosting is sequential and modifies each model based on the errors of the previous one.