In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Fetch the Iris bunch and pull out the feature matrix and the label vector
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training samples:", len(X_train))
print("Test samples:", len(X_test))


Training samples: 120
Test samples: 30


In [5]:
class CustomDecisionTree:
    """Binary decision-tree classifier that splits on entropy-based information gain.

    The fitted tree is stored in ``self.tree`` as nested dicts. A leaf is
    ``{'class': label}``; an internal node is
    ``{'feature_idx', 'threshold', 'left_tree', 'right_tree'}``, where samples
    with ``x[feature_idx] <= threshold`` are routed to ``left_tree`` and all
    others to ``right_tree``.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree (the root is depth 0). ``None`` keeps
        splitting until every leaf is pure or no valid split remains.
        ``0`` yields a single majority-class leaf.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.tree``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like of shape (n_samples,)
            Class labels. They must be non-negative integers because the
            majority vote and the entropy are computed with ``np.bincount``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.tree = self._build_tree(X, y)

    def _majority_leaf(self, y):
        """Return a leaf node predicting the most frequent label in ``y``."""
        return {'class': np.bincount(y).argmax()}

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)`` at ``depth``.

        Returns a leaf dict when the node is pure, the depth limit is reached,
        or no threshold separates the samples; otherwise returns an internal
        node dict whose children are built from the best split.
        """
        num_features = X.shape[1]
        unique_classes = np.unique(y)

        if len(unique_classes) == 1:
            return {'class': unique_classes[0]}
        # `is not None` so that max_depth=0 is honoured instead of being
        # treated as "no limit" by a truthiness test.
        if self.max_depth is not None and depth >= self.max_depth:
            return self._majority_leaf(y)

        best_info_gain = -float('inf')
        best_split = None

        for feature_idx in range(num_features):
            column = X[:, feature_idx]
            # Drop the largest value: using it as a threshold would send every
            # sample left and leave the right child empty, so the child would
            # be identical to its parent and the recursion would never shrink.
            for threshold in np.unique(column)[:-1]:
                left_mask = column <= threshold
                info_gain = self._information_gain(y, y[left_mask], y[~left_mask])
                # Strict '>' keeps the first split found among equal gains.
                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_split = {'feature_idx': feature_idx,
                                  'threshold': threshold,
                                  'left_mask': left_mask}

        # Every feature is constant across these samples: nothing can be split.
        if best_split is None:
            return self._majority_leaf(y)

        left_mask = best_split['left_mask']
        right_mask = ~left_mask
        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {'feature_idx': best_split['feature_idx'],
                'threshold': best_split['threshold'],
                'left_tree': left_tree,
                'right_tree': right_tree}

    def _information_gain(self, parent, left, right):
        """Return the entropy of ``parent`` minus the size-weighted entropy of its children."""
        total = len(parent)
        if total == 0:
            return 0.0
        weighted_avg_entropy = ((len(left) / total) * self._entropy(left)
                                + (len(right) / total) * self._entropy(right))
        return self._entropy(parent) - weighted_avg_entropy

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the integer label array ``y``.

        An empty array has entropy 0. Only classes that actually occur are
        included, so no epsilon is needed inside the logarithm and a pure
        node evaluates to exactly zero.
        """
        if len(y) == 0:
            return 0.0
        counts = np.bincount(y)
        class_probs = counts[counts > 0] / len(y)
        return -np.sum(class_probs * np.log2(class_probs))

    def predict(self, X):
        """Return a list with the predicted class label for each row of ``X``."""
        return [self._predict_single(x, self.tree) for x in X]

    def _predict_single(self, x, tree):
        """Walk ``tree`` from the given node down to a leaf for sample ``x``."""
        node = tree
        while 'class' not in node:
            if x[node['feature_idx']] <= node['threshold']:
                node = node['left_tree']
            else:
                node = node['right_tree']
        return node['class']


In [6]:
# Fit the from-scratch tree on the Iris training split, capping its depth at 3
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)

# Score its predictions on the held-out test split; accuracy_custom is reused
# by the comparison cell further down
y_pred_custom = custom_tree.predict(X_test)
accuracy_custom = accuracy_score(y_test, y_pred_custom)
print(f"Custom Decision Tree Accuracy: {accuracy_custom:.4f}")


Custom Decision Tree Accuracy: 1.0000


In [7]:
# Reference model: scikit-learn's tree with the same depth cap (3) as the custom one.
# NOTE(review): no `criterion` is passed here, so scikit-learn's default split
# criterion applies (Gini impurity per its docs — confirm), whereas
# CustomDecisionTree splits on entropy; the two trees are not guaranteed to match.
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sklearn_tree.fit(X_train, y_train)

# Score on the same held-out test split used for the custom tree
y_pred_sklearn = sklearn_tree.predict(X_test)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sklearn:.4f}")


Scikit-learn Decision Tree Accuracy: 1.0000


In [8]:
# Side-by-side test accuracy of the two Iris trees fitted above.
# The heading has no placeholders, so it is a plain string rather than an f-string.
print("Accuracy Comparison:")
print(f"Custom Decision Tree: {accuracy_custom:.4f}")
print(f"Scikit-learn Decision Tree: {accuracy_sklearn:.4f}")


Accuracy Comparison:
Custom Decision Tree: 1.0000
Scikit-learn Decision Tree: 1.0000


**PART 2**

In [9]:
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Fetch the Wine bunch and pull out the feature matrix and the class labels
wine = load_wine()
X_wine, y_wine = wine.data, wine.target

# 80/20 train/test partition with a fixed seed for repeatability
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_wine,
    y_wine,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Baseline 1: a single decision tree with default hyperparameters (seeded)
dtc = DecisionTreeClassifier(random_state=42)
y_pred_dtc = dtc.fit(X_train_w, y_train_w).predict(X_test_w)
f1_dtc = f1_score(y_test_w, y_pred_dtc, average='weighted')
print(f"Decision Tree F1 Score: {f1_dtc:.4f}")

# Baseline 2: a random forest with default hyperparameters (seeded)
rfc = RandomForestClassifier(random_state=42)
y_pred_rfc = rfc.fit(X_train_w, y_train_w).predict(X_test_w)
f1_rfc = f1_score(y_test_w, y_pred_rfc, average='weighted')
print(f"Random Forest F1 Score: {f1_rfc:.4f}")


Decision Tree F1 Score: 0.9440
Random Forest F1 Score: 1.0000


In [11]:
# Hyperparameter grid for the random-forest classifier:
# 3 x 3 x 3 = 27 combinations are evaluated exhaustively.
param_grid_rfc = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Each combination is scored with 5-fold cross-validation on the Wine
# training split, using weighted F1 as the selection metric.
grid_rfc = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rfc, cv=5, scoring='f1_weighted')
grid_rfc.fit(X_train_w, y_train_w)

print("Best parameters for Random Forest Classifier:", grid_rfc.best_params_)

# Evaluate best model on the held-out test split.
# NOTE(review): predicting through the search object presumably uses the best
# estimator refit on the full training split (default `refit`) — confirm.
y_pred_best_rfc = grid_rfc.predict(X_test_w)
f1_best_rfc = f1_score(y_test_w, y_pred_best_rfc, average='weighted')
print(f"Tuned Random Forest F1 Score: {f1_best_rfc:.4f}")


Best parameters for Random Forest Classifier: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Tuned Random Forest F1 Score: 1.0000


In [12]:
# Demo regression task: predict the first Wine feature column from the remaining ones
y_wine_reg, X_wine_reg = X_wine[:, 0], X_wine[:, 1:]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_wine_reg,
    y_wine_reg,
    test_size=0.2,
    random_state=42,
)

# Baseline 1: a single regression tree with default hyperparameters (seeded)
dtr = DecisionTreeRegressor(random_state=42)
y_pred_dtr = dtr.fit(X_train_r, y_train_r).predict(X_test_r)
mse_dtr = mean_squared_error(y_test_r, y_pred_dtr)
print(f"Decision Tree Regressor MSE: {mse_dtr:.4f}")

# Baseline 2: a random-forest regressor with default hyperparameters (seeded)
rfr = RandomForestRegressor(random_state=42)
y_pred_rfr = rfr.fit(X_train_r, y_train_r).predict(X_test_r)
mse_rfr = mean_squared_error(y_test_r, y_pred_rfr)
print(f"Random Forest Regressor MSE: {mse_rfr:.4f}")


Decision Tree Regressor MSE: 0.3120
Random Forest Regressor MSE: 0.1543


In [13]:
# Candidate hyperparameter values for the random-forest regressor
# (3 x 3 x 3 = 27 possible combinations).
param_grid_rfr = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Despite the `grid_` / `param_grid_` names this is a *randomized* search:
# n_iter=10 samples 10 of the 27 combinations rather than trying them all.
# Candidates are scored with 5-fold cross-validation on negated MSE, and the
# search's random_state makes the sampled combinations repeatable.
grid_rfr = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_distributions=param_grid_rfr,
                              cv=5, n_iter=10, scoring='neg_mean_squared_error', random_state=42)
grid_rfr.fit(X_train_r, y_train_r)

print("Best parameters for Random Forest Regressor:", grid_rfr.best_params_)

# Evaluate best model on the held-out regression test split.
# NOTE(review): predicting through the search object presumably uses the best
# estimator refit on the full training split (default `refit`) — confirm.
y_pred_best_rfr = grid_rfr.predict(X_test_r)
mse_best_rfr = mean_squared_error(y_test_r, y_pred_best_rfr)
print(f"Tuned Random Forest Regressor MSE: {mse_best_rfr:.4f}")


Best parameters for Random Forest Regressor: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 5}
Tuned Random Forest Regressor MSE: 0.1481
