In [35]:
import random
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Load the generated dataset
df = pd.read_csv("MyData_updated.csv") 

# Separate features and target
X = df.drop(columns=['smoking','hearing(left)', 'Cholesterol', 'ALT', 'eyesight(left)','hearing(right)', 'dental caries'])  # drop all except(waist, hemoglobin, weight, serum creatinine)
y = df['smoking']



In [36]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [37]:
# Convert X and y to numpy arrays for clarity
X_scaled = np.array(X_scaled)
y = np.array(y)


In [38]:
# Split into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.4, random_state=42, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Verify shapes
print("Training set size:", X_scaled.shape)
print("Validation set size:", X_valid.shape)
print("Test set size:", X_test.shape)


Training set size: (159256, 4)
Validation set size: (31851, 4)
Test set size: (31852, 4)


In [39]:
# Display the first few rows of the processed datasets
print("First few rows of the training set:")
print(X_train[:3])

print("\nFirst few rows of the validation set:")
print(X_valid[:3])

print("\nFirst few rows of the test set:")
print(X_test[:3])

First few rows of the training set:
[[-1.45145365e+00 -6.26718756e-01 -9.64842581e-01 -5.17239273e-01]
 [-2.22136427e-04 -9.76073844e-01 -1.36210438e+00 -1.63240593e+00]
 [ 7.81210215e-01 -9.76073844e-01  2.26942811e-01  1.15551071e+00]]

First few rows of the validation set:
[[-0.80398113  1.6091538  -0.17031899  0.59792738]
 [-0.10069201  0.63095956  0.22694281  1.15551071]
 [-1.17237066 -0.27736367 -0.96484258 -0.51723927]]

First few rows of the test set:
[[-0.22348852  1.88863787 -0.17031899  0.04034405]
 [ 0.33467744  0.56108854  0.62420461  1.15551071]
 [-0.67002129  0.77070159 -0.17031899  0.59792738]]


In [40]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

class BaggingClassifier:
    def __init__(self, base_estimators=None, n_estimators=100, random_state=None):
        """
        Bagging ensemble classifier.

        Parameters:
        - base_estimators: List of base models to use for bagging (default: [DecisionTreeClassifier()]).
        - n_estimators: Number of estimators/models in the ensemble.
        - random_state: Random seed for reproducibility.
        """
        self.base_estimators = base_estimators or [DecisionTreeClassifier()]
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.models = []

    def fit(self, X, y):
        """
        Train the bagging classifier by fitting multiple base estimators on bootstrapped samples.
        """
        np.random.seed(self.random_state)
        self.models = []
        n_estimators_per_model = self.n_estimators // len(self.base_estimators)
        
        for base_estimator in self.base_estimators:
            for _ in range(n_estimators_per_model):
                # Create a bootstrap sample
                indices = np.random.choice(len(X), size=len(X), replace=True)
                X_bootstrap = X[indices]
                y_bootstrap = y[indices]
                
                # Train a new base model on the bootstrap sample
                model = clone(base_estimator)
                model.fit(X_bootstrap, y_bootstrap)
                self.models.append(model)

    def predict(self, X):
        """
        Predict class labels for the input data by majority voting.
        """
        # Collect predictions from each model
        predictions = np.array([model.predict(X) for model in self.models])
        
        # Perform majority voting
        majority_vote = mode(predictions, axis=0).mode.flatten()
        return majority_vote, predictions


# Example usage:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate synthetic data for testing
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

# Define a list of diverse base classifiers
base_estimators = [
    DecisionTreeClassifier(),
    LogisticRegression(),
    SVC(probability=True),  # Ensure SVC supports probability estimation
    KNeighborsClassifier(),
    RandomForestClassifier()
]

# Initialize and train the Bagging ensemble
bagging_model = BaggingClassifier(base_estimators=base_estimators, n_estimators=100, random_state=42)
bagging_model.fit(X_train, y_train)

# Make predictions
majority_vote, predictions = bagging_model.predict(X_valid)
print("Predictions shape (n_estimators, n_samples):", predictions.shape)
print("Majority vote shape (n_samples,):", majority_vote.shape)

# Evaluate the model
accuracy = accuracy_score(y_valid, majority_vote)
print("Bagging Accuracy:", accuracy)


Predictions shape (n_estimators, n_samples): (100, 300)
Majority vote shape (n_samples,): (300,)
Bagging Accuracy: 0.8566666666666667


In [43]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

class AdaBoostClassifier:
    def __init__(self, base_estimators=None, n_estimators=100, random_state=None):
        """
        AdaBoost ensemble classifier.

        Parameters:
        - base_estimators: List of base models to use for boosting.
        - n_estimators: Total number of models in the ensemble.
        - random_state: Random seed for reproducibility.
        """
        self.base_estimators = base_estimators or [DecisionTreeClassifier(max_depth=1, random_state=random_state)]
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.models = []
        self.alphas = []

    def fit(self, X, y):
        """
        Train the AdaBoost classifier using weighted training samples.
        """
        np.random.seed(self.random_state)
        n_samples = len(X)
        weights = np.ones(n_samples) / n_samples
        self.models = []
        self.alphas = []

        n_estimators_per_model = self.n_estimators // len(self.base_estimators)

        for base_estimator in self.base_estimators:
            for _ in range(n_estimators_per_model):
                # Clone the base estimator
                model = clone(base_estimator)
                
                # Check if the model supports `sample_weight`
                if hasattr(model, "fit") and "sample_weight" in model.fit.__code__.co_varnames:
                    model.fit(X, y, sample_weight=weights)
                else:
                    # Resample the dataset based on the weights manually
                    indices = np.random.choice(n_samples, size=n_samples, replace=True, p=weights)
                    X_resampled, y_resampled = X[indices], y[indices]
                    model.fit(X_resampled, y_resampled)
                
                # Predict on the full dataset
                y_pred = model.predict(X)

                # Calculate error and alpha
                incorrect = (y_pred != y)
                error = np.dot(weights, incorrect) / np.sum(weights)

                # Avoid division by zero or invalid alpha calculation
                if error >= 1.0:
                    continue
                alpha = 0.5 * np.log((1 - error) / error) if error > 0 else 1.0
                self.alphas.append(alpha)

                # Update weights
                weights *= np.exp(-alpha * y * y_pred)
                weights /= np.sum(weights)

                # Store the model
                self.models.append(model)

    def predict(self, X):
        """
        Predict class labels using weighted voting.
        """
        model_preds = np.array([model.predict(X) for model in self.models])
        weighted_preds = np.zeros(model_preds.shape[1])

        for i in range(len(self.models)):
            weighted_preds += self.alphas[i] * model_preds[i]

        return np.sign(weighted_preds)


# Example usage:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Generate synthetic data
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
y = np.where(y == 0, -1, 1)  # Convert labels to -1 and 1 for AdaBoost compatibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define diverse base classifiers
base_estimators = [
    DecisionTreeClassifier(max_depth=1, random_state=42),
    LogisticRegression(max_iter=500, random_state=42),
    SVC(kernel="linear", probability=True, random_state=42),
    KNeighborsClassifier(n_neighbors=5),
]

# Train the AdaBoost ensemble
adaboost_model = AdaBoostClassifier(base_estimators=base_estimators, n_estimators=50, random_state=42)
adaboost_model.fit(X_train, y_train)

# Make predictions
y_pred_adaboost = adaboost_model.predict(X_test)
print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_adaboost))


AdaBoost Accuracy: 0.7666666666666667


In [44]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

class RandomForestClassifier:
    def __init__(self, base_estimators=None, n_estimators=100, max_features='sqrt', random_state=None):
        """
        Random Forest classifier that can use multiple base estimators.
        
        Parameters:
        - base_estimators: List of base models to use for ensemble (e.g., [DecisionTree, LogisticRegression]).
        - n_estimators: Total number of models to train.
        - max_features: The number of features to use for each model. Options: 'sqrt', 'log2', or an integer.
        - random_state: Random seed for reproducibility.
        """
        self.base_estimators = base_estimators or [DecisionTreeClassifier(random_state=random_state)]
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state
        self.models = []

    def fit(self, X, y):
        """
        Train the RandomForest classifier using bootstrap sampling and feature selection.
        """
        np.random.seed(self.random_state)
        self.models = []
        
        n_samples, n_features = X.shape
        n_estimators_per_model = self.n_estimators // len(self.base_estimators)

        for base_estimator in self.base_estimators:
            for _ in range(n_estimators_per_model):
                # Bootstrap sampling
                indices = np.random.choice(n_samples, size=n_samples, replace=True)
                X_bootstrap = X[indices]
                y_bootstrap = y[indices]
                
                # Feature subset selection
                if self.max_features == 'sqrt':
                    max_features = int(np.sqrt(n_features))
                elif self.max_features == 'log2':
                    max_features = int(np.log2(n_features))
                elif isinstance(self.max_features, int):
                    max_features = self.max_features
                else:
                    max_features = n_features

                features = np.random.choice(n_features, size=max_features, replace=False)
                X_bootstrap = X_bootstrap[:, features]
                
                # Train a model on the bootstrap sample with a random subset of features
                model = clone(base_estimator)
                model.fit(X_bootstrap, y_bootstrap)
                self.models.append((model, features))

    def predict(self, X):
        """
        Predict class labels using majority voting.
        """
        predictions = np.zeros((len(self.models), len(X)))
        for i, (model, features) in enumerate(self.models):
            X_subset = X[:, features]
            predictions[i, :] = model.predict(X_subset)
        
        # Majority vote (for classification)
        return np.round(np.mean(predictions, axis=0)).astype(int)

# Example usage:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Generate synthetic data
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define multiple base classifiers
base_estimators = [
    DecisionTreeClassifier(max_depth=1, random_state=42),
    LogisticRegression(max_iter=500, random_state=42),
    KNeighborsClassifier(n_neighbors=5)
]

# Train the Random Forest ensemble
rf_model = RandomForestClassifier(base_estimators=base_estimators, n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.82


In [45]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 15]
}

# Grid Search for Random Forest
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters from Grid Search:", grid_search.best_params_)

# Randomized Search
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 20]
}

random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42), param_distributions=param_dist, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

print("Best Parameters from Randomized Search:", random_search.best_params_)


Best Parameters from Grid Search: {'max_depth': 10, 'min_samples_split': 15, 'n_estimators': 150}
Best Parameters from Randomized Search: {'n_estimators': 150, 'min_samples_split': 2, 'max_depth': 30}


In [46]:
# Final model evaluation on the test set
best_model = grid_search.best_estimator_  # or random_search.best_estimator_
y_pred_final = best_model.predict(X_test)

# Final accuracy and classification report
from sklearn.metrics import classification_report

print("Final Model Accuracy:", accuracy_score(y_test, y_pred_final))
print("Classification Report:\n", classification_report(y_test, y_pred_final))


Final Model Accuracy: 0.8566666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86       145
           1       0.89      0.82      0.86       155

    accuracy                           0.86       300
   macro avg       0.86      0.86      0.86       300
weighted avg       0.86      0.86      0.86       300

