In [62]:
import numpy as np
class CustomDecisionTree:
  # used to initialize the decision tree object
  def __init__(self, max_depth=None):
    """
    Initializes the decision tree with the specified maximum depth.
    Parameters:
    max_depth (int, optional): The maximum depth of the tree. If None, the tree is expanded until all
    leaves are pure or contain fewer than the minimum samples required to split.
    """
    self.max_depth = max_depth
    self.tree = None
  #Trains the decision tree on the given feature matrix X and target labels y
  def fit(self, X, y):
    """
    Trains the decision tree model using the provided training data.
    Parameters:
    X (array-like): Feature matrix (n_samples, n_features) for training the model.
    y (array-like): Target labels (n_samples,) for training the model.
    """
    self.tree = self._build_tree(X, y)
  # Constructs the decision tree recursively by splitting the data at the best feature and threshold
  def _build_tree(self, X, y, depth=0):
    """
    Recursively builds the decision tree by splitting the data based on the best feature and threshold
    .
    Parameters:
    X (array-like): Feature matrix (n_samples, n_features) for splitting.
    y (array-like): Target labels (n_samples,) for splitting.
    depth (int, optional): Current depth of the tree during recursion.
    Returns:
    dict: A dictionary representing the structure of the tree, containing the best feature index,
    threshold, and recursive tree nodes.
    """
    num_samples, num_features = X.shape
    unique_classes = np.unique(y)

    # Stopping conditions: pure node or reached max depth
    if len(unique_classes) == 1:
      return {'class': unique_classes[0]}
    if num_samples == 0 or (self.max_depth and depth >= self.max_depth):
      return {'class': np.bincount(y).argmax()}

    # Find the best split based on Information Gain (using Entropy)
    best_info_gain = -float('inf')
    best_split = None
    for feature_idx in range(num_features):
      thresholds = np.unique(X[:, feature_idx])
      for threshold in thresholds:
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        left_y = y[left_mask]
        right_y = y[right_mask]

        info_gain = self._information_gain(y, left_y, right_y)

        if info_gain > best_info_gain:
          best_info_gain = info_gain
          best_split = {
          'feature_idx': feature_idx,
          'threshold': threshold,
          'left_y': left_y,
          'right_y': right_y,
          }

    if best_split is None:
      return {'class': np.bincount(y).argmax()}

    # Recursively build the left and right subtrees
    left_tree = self._build_tree(X[best_split['left_y']], best_split['left_y'], depth + 1)
    right_tree = self._build_tree(X[best_split['right_y']], best_split['right_y'], depth + 1)
    return {'feature_idx': best_split['feature_idx'], 'threshold': best_split['threshold'],'left_tree': left_tree, 'right_tree': right_tree}
  # Calculates the Information Gain for a split
  def _information_gain(self, parent, left, right):
    """
    Computes the Information Gain between the parent node and the left/right child nodes.
    Parameters:
    parent (array-like): The labels of the parent node.
    left (array-like): The labels of the left child node.
    right (array-like): The labels of the right child node.
    Returns:
    float: The Information Gain of the split.
    """
    parent_entropy = self._entropy(parent)
    left_entropy = self._entropy(left)
    right_entropy = self._entropy(right)

    # Information Gain = Entropy(parent) - (weighted average of left and right entropies)
    weighted_avg_entropy = (len(left) / len(parent)) * left_entropy + (len(right) / len(parent)) * right_entropy
    return parent_entropy - weighted_avg_entropy
  # Calculates the entropy(the impurity & disorder) of a set of labels
  def _entropy(self, y):
    """
    Computes the entropy of a set of labels.
    Parameters:
    y (array-like): The labels for which entropy is calculated.
    Returns:
    float: The entropy of the labels.
    """
    # Calculate the probability of each class
    class_probs = np.bincount(y) / len(y)

    # Compute the entropy using the formula: -sum(p * log2(p))
    return -np.sum(class_probs * np.log2(class_probs + 1e-9)) # Added small epsilon to avoid log(0)
  # Predicts the target labels for a given feature matrix X using the trained decision tree
  def predict(self, X):
    """
    Predicts the target labels for the given test data based on the trained decision tree.
    Parameters:
    X (array-like): Feature matrix (n_samples, n_features) for prediction.
    Returns:
    list: A list of predicted target labels (n_samples,).
    """
    return [self._predict_single(x, self.tree) for x in X]
  # Predicts the target label for a single sample by traversing the tree
  def _predict_single(self, x, tree):
    """
    Recursively predicts the target label for a single sample by traversing the tree.
    Parameters:
    x (array-like): A single feature vector for prediction.
    tree (dict): The current subtree or node to evaluate.
    Returns:
    int: The predicted class label for the sample.
    """
    if 'class' in tree:
      return tree['class']
    feature_val = x[tree['feature_idx']]
    if feature_val <= tree['threshold']:
      return self._predict_single(x, tree['left_tree'])
    else:
      return self._predict_single(x, tree['right_tree'])

Step -2- Load and Split the IRIS Dataset:

In [63]:
# Necessary Imports
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Load the Iris dataset
data = load_iris()
X = data.data
y = data.target
# Split into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Step -3- Train and Evaluate a Custom Decision Tree:

In [64]:
# Train the custom decision tree
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)
# Predict on the test set
y_pred_custom = custom_tree.predict(X_test)
# Calculate accuracy
accuracy_custom = accuracy_score(y_test, y_pred_custom)
print(f"Custom Decision Tree Accuracy: {accuracy_custom:.4f}")

Custom Decision Tree Accuracy: 0.8000


Step -4- Train and Evaluate a Scikit Learn Decision Tree:

In [65]:
# Train the Scikit-learn decision tree
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sklearn_tree.fit(X_train, y_train)
# Predict on the test set
y_pred_sklearn = sklearn_tree.predict(X_test)
# Calculate accuracy
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sklearn:.4f}")

Scikit-learn Decision Tree Accuracy: 1.0000


Step -5- Result Comparison:

In [66]:
print(f"Accuracy Comparison:")
print(f"Custom Decision Tree: {accuracy_custom:.4f}")
print(f"Scikit-learn Decision Tree: {accuracy_sklearn:.4f}")

Accuracy Comparison:
Custom Decision Tree: 0.8000
Scikit-learn Decision Tree: 1.0000


3 Ensemble Methods and Hyperparameter Tuning.
1. Implement Classification Models:

In [67]:
# necessary imports
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [68]:
# loading the Iris dataset
winedata = load_wine()
X = winedata.data
y = winedata.target

# splitting into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
# trainig the Scikit-learn decision tree
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sklearn_tree.fit(X_train, y_train)

# predicting on the test set
y_pred_decision_tree = sklearn_tree.predict(X_test)

In [70]:
# training the Scikit-learn random forest
sklearn_forest = RandomForestClassifier(n_estimators=100, random_state=42)
sklearn_forest.fit(X_train, y_train)

# predicting on the test set
y_pred_random_forest = sklearn_forest.predict(X_test)

In [71]:
# calculating f1 score of decision tree
f1_score_decision_tree = f1_score(y_test, y_pred_decision_tree,average="weighted")

In [72]:
# calculating f1 score of random forest
f1_score_random_forest = f1_score(y_test, y_pred_random_forest,average="weighted")

In [73]:
# comparing f1 score
print(f"F1-score Comparison:")
print(f"Decision Tree Classifier: {f1_score_decision_tree:.4f}")
print(f"Random Forest Classifier: {f1_score_random_forest:.4f}")

F1-score Comparison:
Decision Tree Classifier: 0.9449
Random Forest Classifier: 1.0000


2. Hyperparameter Tuning:

 • Identify three hyperparameters of the Random Forest Classifier.

    1) n_estimators: refers to number of trees in the forest(eg: 50,100,200)

    2) max_depth: Indicates maximum depth of each tree to control complexity(eg: None, 10,20)

    3) min_samples_split: Minimum samples required to split a node (eg: 2,5,10)

 • Perform hyperparameter tuning using GridSearchCV to optimize these parameters.


In [74]:
# import GridSearchCV and RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [75]:
# defining parameter grid for tuning
parameter_grid= {
    'n_estimators': [50, 100, 200],  # number of trees in the forest
    'max_depth': [None, 10, 20, 30], # maximum depth of the trees
    'min_samples_split': [2, 5, 10], # minimum number of samples to split a node
}

In [76]:
# configure GridSearchCV for the Random Forest model
grid_search = GridSearchCV(
    estimator=sklearn_forest,
    param_grid=parameter_grid,
    scoring='f1', # using F1-score to evaluate performance
    cv=5,  # performing  5-fold cross-validation
    verbose=2, #provides intermediate updates during the search
    n_jobs=-1  # uses all processors for faster execution
)

In [77]:
# fitting the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [78]:
 # displaying the best parameters and the corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best F1-Score: nan


In [79]:
# selecting the best set of parameters and model estimator
best_hyperparams = grid_search.best_params_
best_model = grid_search.best_estimator_

In [80]:
# printing the best hyperparameters
print(f"Best Hyperparameters: {best_hyperparams}")

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}


In [81]:
# evaluating the best model on the test set
y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score with Optimized Hyperparameters: {f1:.2f}")

F1 Score with Optimized Hyperparameters: 1.00


3. Implement Regression Model:

• Train a Decision Tree Regressor and a Random Forest Regressor using scikit-learn.


In [82]:
# necessary imports
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [83]:
# initialize Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(random_state=42)
# training the model
dt_regressor.fit(X_train, y_train)

In [84]:
# making predictions on the test set
dt_predictions = dt_regressor.predict(X_test)

In [85]:
# calculating the Mean Squared Error (MSE)
dt_mse = mean_squared_error(y_test, dt_predictions)
print(f"Decision Tree MSE: {dt_mse:.4f}")

Decision Tree MSE: 0.1667


In [86]:
# initialize Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)
# training the model
rf_regressor.fit(X_train, y_train)

In [87]:
# maing predictions on the test set
rf_predictions = rf_regressor.predict(X_test)

In [88]:
# calculating the Mean Squared Error (MSE)
rf_mse = mean_squared_error(y_test, rf_predictions)
print(f"Random Forest MSE: {rf_mse:.4f}")

Random Forest MSE: 0.0648


In [89]:
# Hyperparameter Tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [90]:
# defining the hyperparameter options
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 8, 10]
}

In [91]:
# performing RandomizedSearchCV
rf_random_search = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [92]:
# fitting the search to the training data
rf_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [93]:
# selecting the best parameters and model
best_params = rf_random_search.best_params_
best_rf_model = rf_random_search.best_estimator_

print(f"Optimal Parameters: {best_params}")

Optimal Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'max_depth': None}


Evaluating the Optimized Model

In [94]:
# making predictions using the optimized model
optimized_predictions = best_rf_model.predict(X_test)

In [95]:
# calculating the Mean Squared Error (MSE)
optimized_mse = mean_squared_error(y_test, optimized_predictions)
print(f"MSE after Hyperparameter Tuning: {optimized_mse:.4f}")

MSE after Hyperparameter Tuning: 0.0625
