1. Use any binary classification dataset
2. Define validation strategy and use it for all next steps without changes
3. Train decision tree model and estimate performance on validation
4. Train bagging model with decision tree as a base model and estimate performance on validation
5. Write your own bagging implementation:
  <br>5.1. Define init for our CustomBaggingClassifier
  <br>5.2. Write fit as described in lecture: divide train data on n parts (`n_estimators` in CustomBaggingClassifier), train `base_estimator` on each part and save these models inside class
  <br>5.3. For predictions we should use all saved models and combine their predictions (as voting)
6. Compare performance of sklearn bagging model with your own implementation

In [14]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.base import clone
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

train_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\train.csv')
test_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\test.csv')


# load train data
# reuse the preprocessing approach from the previous homework


def preprocess_data(train_df, test_df):
    # Зберігаємо статистичні значення з тренувального набору даних
    age_mean = train_df['Age'].mean()
    embarked_mode = train_df['Embarked'].mode()[0]

    # Заповнюємо пропуски у тренувальному та тестовому наборі однаковими значеннями
    train_df['Age'] = train_df['Age'].fillna(age_mean)
    test_df['Age'] = test_df['Age'].fillna(age_mean)

    train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
    test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)

    # Drop unnecessary columns that are not useful for modeling
    train_df = train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    test_df = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Застосовуємо one-hot encoding до обох наборів даних
    train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)
    test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'], drop_first=True)

    return train_df, test_df

# Call this function before splitting data
train_df, test_df = preprocess_data(train_df, test_df)

X_train = train_df.drop('Survived', axis=1)  # Features
y_train = train_df['Survived']  # Target
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, val_index in stratified_kfold.split(train_df, train_df['Survived']):
    X_train_fold, X_val_fold = train_df.iloc[train_index], train_df.iloc[val_index]
    y_train_fold, y_val_fold = train_df['Survived'].iloc[train_index], train_df['Survived'].iloc[val_index]



In [21]:
# define the bagging model (from sklearn)
# define the hyperparameters grid
# define the grid search with cross validation using previously defined validation method
# train the model
# print the best hyperparameters
# print the best score on train and validation data, estimate the generalization error

# Define the base estimator (DecisionTree)
base_estimator = DecisionTreeClassifier(random_state=42)

# Define the BaggingClassifier model using the 'estimator' parameter
bagging_model = BaggingClassifier(estimator=base_estimator, random_state=42)

# Define the hyperparameters grid for tuning
param_grid = {
    'n_estimators': [100],
    'max_samples': [0.5],
    'max_features': [0.75],
    'estimator__max_depth': [10],
    'estimator__min_samples_split': [5],
    'estimator__min_samples_leaf': [2]
}

# Perform GridSearchCV with StratifiedKFold
grid_search = GridSearchCV(estimator=bagging_model, 
                           param_grid=param_grid, 
                           scoring='accuracy', 
                           cv=stratified_kfold,  # From earlier
                           n_jobs=-1, 
                           verbose=1)

# Train the model with the grid search on the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Get the best model from the grid search
best_bagging_model = grid_search.best_estimator_

# Evaluate the model on training data (just for reference)
train_predictions = best_bagging_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Training Accuracy: {train_accuracy}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Hyperparameters: {'estimator__max_depth': 10, 'estimator__min_samples_leaf': 2, 'estimator__min_samples_split': 5, 'max_features': 0.75, 'max_samples': 0.5, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.837043238451689
Training Accuracy: 0.8960674157303371


In [16]:
# implement the custom bagging model

class CustomBaggingClassifier:
    def __init__(self, base_estimator, n_estimators=10, max_samples=1.0, max_features=1.0):
        # Initialize with the base estimator, number of estimators, sample size, and feature size
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.estimators_ = []  # Will hold the trained base estimators
        self.sample_indices_ = []  # Store indices of samples used for training

    def fit(self, X_train, y_train):
        n_samples, n_features = X_train.shape
        
        # Calculate the number of samples and features to use for each base estimator
        sample_size = int(self.max_samples * n_samples)
        feature_size = int(self.max_features * n_features)
        
        # Train n_estimators base estimators
        self.estimators_ = []  # Reset list of trained estimators
        for i in range(self.n_estimators):
            # 1. Draw max_samples samples from X_train with replacement
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=True)
            # 2. Draw max_features features from X_train without replacement
            feature_indices = np.random.choice(n_features, size=feature_size, replace=False)
            
            # 3. Train the estimator on the drawn samples and features using .iloc
            X_sampled = X_train.iloc[sample_indices, feature_indices]
            y_sampled = y_train.iloc[sample_indices]
            
            estimator = clone(self.base_estimator)
            estimator.fit(X_sampled, y_sampled)
            
            # 4. Save the trained estimator and its corresponding features
            self.estimators_.append((estimator, feature_indices))

        return self

    def predict(self, X_test):
        # Predict the label for each estimator and take the majority vote
        predictions = np.array([estimator.predict(X_test.iloc[:, features]) for estimator, features in self.estimators_])
        
        # Perform majority voting across all base estimators
        majority_votes = [Counter(preds).most_common(1)[0][0] for preds in predictions.T]
        
        return np.array(majority_votes)

    def predict_proba(self, X_test):
        # Predict the probabilities for each estimator and average the results
        probas = np.array([estimator.predict_proba(X_test.iloc[:, features]) for estimator, features in self.estimators_])
        return np.mean(probas, axis=0)

    def get_params(self, deep=True):
        # Return the parameters of the CustomBaggingClassifier for grid search
        return {
            'base_estimator': self.base_estimator,
            'n_estimators': self.n_estimators,
            'max_samples': self.max_samples,
            'max_features': self.max_features
        }

    def set_params(self, **params):
        # Set the parameters of the CustomBaggingClassifier from a dictionary
        for param, value in params.items():
            setattr(self, param, value)
        return self


# Instantiate CustomBaggingClassifier
custom_bagging = CustomBaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, max_samples=0.8, max_features=0.8)


# Train the custom bagging classifier
custom_bagging.fit(X_train, y_train)

# Make predictions
y_pred = custom_bagging.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.8156424581005587


In [17]:
# Implement the custom bagging model
class CustomBaggingClassifier:
    def __init__(self, base_estimator, n_estimators=10, max_samples=1.0, max_features=1.0):
        # Initialize with the base estimator, number of estimators, sample size, and feature size
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.estimators_ = []  # Will hold the trained base estimators

    def fit(self, X_train, y_train):
        n_samples, n_features = X_train.shape
        
        # Calculate the number of samples and features to use for each base estimator
        sample_size = int(self.max_samples * n_samples)
        feature_size = int(self.max_features * n_features)
        
        # Train n_estimators base estimators
        self.estimators_ = []  # Reset list of trained estimators
        for i in range(self.n_estimators):
            # 1. Draw max_samples samples from X_train with replacement
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=True)
            # 2. Draw max_features features from X_train without replacement
            feature_indices = np.random.choice(n_features, size=feature_size, replace=False)
            
            # 3. Train the estimator on the drawn samples and features using .iloc
            X_sampled = X_train.iloc[sample_indices, feature_indices]
            y_sampled = y_train.iloc[sample_indices]
            
            estimator = clone(self.base_estimator)
            estimator.fit(X_sampled, y_sampled)
            
            # 4. Save the trained estimator and its corresponding features
            self.estimators_.append((estimator, feature_indices))

        return self

    def predict_proba(self, X_test):
        # Predict the probabilities for each estimator and average the results
        probas = np.array([estimator.predict_proba(X_test.iloc[:, features]) for estimator, features in self.estimators_])
        return np.mean(probas, axis=0)

    def predict(self, X_test):
        # Use predict_proba to get probabilities for class 0 and class 1
        avg_probas = self.predict_proba(X_test)
        
        # Return 1 if the probability of class 1 is greater than or equal to 0.5, otherwise return 0
        return (avg_probas[:, 1] >= 0.5).astype(int)

    def get_params(self, deep=True):
        # Return the parameters of the CustomBaggingClassifier for grid search
        return {
            'base_estimator': self.base_estimator,
            'n_estimators': self.n_estimators,
            'max_samples': self.max_samples,
            'max_features': self.max_features
        }

    def set_params(self, **params):
        # Set the parameters of the CustomBaggingClassifier from a dictionary
        for param, value in params.items():
            setattr(self, param, value)
        return self


# Instantiate CustomBaggingClassifier
custom_bagging = CustomBaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, max_samples=0.8, max_features=0.8)

# Train the custom bagging classifier
custom_bagging.fit(X_train, y_train)

# Make predictions
y_pred = custom_bagging.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.7988826815642458


In [18]:
# define the random forest model 
# define the hyperparameters grid
# define the grid search with cross validation using previously defined validation method
# train the model
# print the best hyperparameters
# print the best score on train and validation data, estimate the generalization error

# Load your dataset and preprocess it (assuming preprocess_data function exists)
train_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\train.csv')
test_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\test.csv')
train_df, test_df = preprocess_data(train_df, test_df)

# Define X and y (Features and Target)
X = train_df.drop('Survived', axis=1)  # Features
y = train_df['Survived']  # Target

# Define StratifiedKFold Cross-Validation
n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameters grid for tuning
param_grid = {
    'n_estimators': [100],
    'max_depth': [None, 10],
    'min_samples_split': [10],
    'min_samples_leaf': [1],
    'bootstrap': [True]
}

# Define the GridSearchCV with cross-validation using StratifiedKFold
grid_search = GridSearchCV(estimator=rf_model, 
                           param_grid=param_grid, 
                           scoring='accuracy', 
                           cv=stratified_kfold, 
                           n_jobs=-1, 
                           verbose=1)

# Train the model with Grid Search on the training data
grid_search.fit(X, y)

# Get the best hyperparameters and print the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Hyperparameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score}")

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Hyperparameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.8451007469713137


In [19]:
# compare the results of the three models from this homework and with DT from the previous homework
# make a conclusion on which model is better and why
# if your custom implementation is much worse than the sklearn one, try to improve it
print(f"DT Accuracy: {train_accuracy}")

print(f"Custom Bagging Accuracy: {accuracy}")

print(f"Best RF Accuracy: {best_score}")

DT Accuracy: 0.8960674157303371
Custom Bagging Accuracy: 0.7988826815642458
Best RF Accuracy: 0.8451007469713137


In [20]:
# load test data
# do the same preprocessing as for train data

# using retrained models make predictions on the test data for all new three models
# save the predictions to a file
# upload the predictions to Kaggle and make a submission
# report the score you got and compare it with the score you got on the validation data
# make a conclusion on how well the models generalizes