# 1. Implement Decision tree

In [1]:
import numpy as np

In [2]:
class Node():
    """
    A node in the decision tree
    """
    def __init__(self, feature = None, threshold = None, left = None, right = None, gain = None, value = None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.gain = gain
        self.value = value
        

In [3]:
class DecisionTree():
    """
    A binary decision tree classifier
    """
    def __init__(self, min_samples = 2, max_depth = 2):
        self.min_samples = min_samples
        self.max_depth = max_depth
        self.selected_feature = []
        
    def set_params(self, **params):
        """
        Set the parameters for the Decision Tree
        """
        for param, value in params.items():
            setattr(self, param, value)

    def split_data(self, dataset, feature, threshold):
        """
        Split the data based on the feature and threshold
        """
        left_dataset, right_dataset = [], []
        for row in dataset:
            if row[feature] <= threshold:
                left_dataset.append(row)
            else:
                right_dataset.append(row)
        left_dataset = np.array(left_dataset)
        right_dataset = np.array(right_dataset)
        return left_dataset, right_dataset

    def entropy(self, y):
        """
        Calculate the entropy of a dataset
        """
        entropy = 0
        labels = np.unique(y)
        for label in labels:
            p_i = len(y[y == label]) / len(y) 
            entropy += -p_i * np.log2(p_i)
        return entropy
    
    def information_gain(self, parent, left, right):
        """
        Calculate the information gain
        """
        information_gain = 0
        parent_entropy = self.entropy(parent)
        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        entropy_left, entropy_right = self.entropy(left), self.entropy(right)
        information_gain = parent_entropy - (weight_left * entropy_left + weight_right * entropy_right)
        return information_gain

    def best_split(self, dataset, num_samples, num_features):
        """
        Find the best split
        """
        #dictionary to store the best split
        best_split = {'gain': 0, 'feature': None, 'threshold': None}
        #loop for all features
        for feature_index in range(num_features):
            feature_value = dataset[:, feature_index]
            unique_values = np.unique(feature_value)
            # Iterate through all unique values of feature column i and
            # calculate the information gain
            for threshold in unique_values:
                # Divide dataset depending on if the feature value of X at index feature_index
                # meets the threshold
                left_dataset, right_dataset = self.split_data(dataset, feature_index, threshold)
                if len(left_dataset) and len(right_dataset):
                    y, left_y, right_y = dataset[:, -1], left_dataset[:, -1], right_dataset[:, -1]
                    information_gain = self.information_gain(y, left_y, right_y)
                    if information_gain > best_split['gain']:
                        best_split['gain'] = information_gain
                        best_split['feature'] = feature_index
                        best_split['threshold'] = threshold
                        best_split['left_dataset'] = left_dataset
                        best_split['right_dataset'] = right_dataset
        return best_split
    
    def calculate_leaf_value(self, y):
        """
        Calculate the value of a leaf node
        """
        y = list(y)
        #priority to the majority class
        most_occuring_value = max(y, key=y.count)
        return most_occuring_value
    
    def build_tree(self, dataset, current_depth = 0):
        """
        Recursively build the tree
        """
        X, y = dataset[:, :-1], dataset[:, -1]
        n_samples, n_features = X.shape
        if n_samples >= self.min_samples and current_depth <= self.max_depth:
            best_split = self.best_split(dataset, n_samples, n_features)
            if best_split['gain']:
                left_node = self.build_tree(best_split['left_dataset'], current_depth + 1)
                right_node = self.build_tree(best_split['right_dataset'], current_depth + 1)
                return Node(best_split['feature'], best_split['threshold'], left_node, right_node, best_split['gain'])
        #compute leaf node value
        leaf_value = self.calculate_leaf_value(y)
        return Node(value=leaf_value)
    
    def fit(self, X, y):
        """
        Fit the decision tree to the training data
        """
        y = y.to_numpy().reshape(-1, 1)
        dataset = np.concatenate((X, y), axis = 1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        """
        Predict the class labels for each instance in X
        """
        predictions = []
        for index, row in X.iterrows():
            prediction = self.make_prediction(row, self.root)
            predictions.append(prediction)
        return np.array(predictions)
    
    def make_prediction(self, x, node):
        """
        Predict the target value for the given feature vector
        """
        if node.value != None:
            return node.value
        else:
            feature = x.iloc[node.feature]
            if feature <= node.threshold:
                return self.make_prediction(x, node.left)
            else:
                return self.make_prediction(x, node.right)

In [4]:
#Evaluation
def accuracy(y_true, y_pred):
    """
    Function to calculate accuracy
    """
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# 2. Random forest

In [5]:
class RandomForest:
    """
    A Random Forest classifier
    """
    def __init__(self, n_trees = 100, min_samples = 2, max_depth = 2, max_features = 'sqrt', bootstraps = True):
        self.bootstraps = bootstraps
        self.n_trees = n_trees
        self.min_samples = min_samples
        self.max_depth = max_depth
        self.trees = []
        self.max_features = max_features
    
    def set_params(self, **params):
        """
        Set the parameters for the Random Forest
        """
        for param, value in params.items():
            setattr(self, param, value)

    def bootstrapping(self, X, y, n_samples):
        """
        Create a bootstrap sample
        """
        bootstrap_indices = np.random.choice(np.arange(n_samples), n_samples, replace = True)
        return X.iloc[bootstrap_indices], y.iloc[bootstrap_indices]
    
    def fit(self, X, y):
        """
        Fit the Random Forest to the training data
        """
        n_features = X.shape[1]
        max_features_mapping = {
            None: lambda x: int(np.sqrt(x)),
            'log2': lambda x: int(np.log2(x)),
            'sqrt': lambda x: int(np.sqrt(x)),
            'number': lambda x, f: int(f * x)
        }
        if self.max_features != None and np.isreal(self.max_features):
            self.max_features = max_features_mapping['number'](n_features, float(self.max_features))
        else:
            self.max_features = max_features_mapping[self.max_features](n_features)

        for i in range(self.n_trees):
            if self.bootstraps:
                X_bootstrapped, y_bootstrapped = self.bootstrapping(X, y, len(X))
            else:
                X_bootstrapped, y_bootstrapped = X, y
            #Randomly select features
            selected_features = np.random.choice(np.arange(n_features), self.max_features, replace = False)
            X_bootstrapped = X_bootstrapped.iloc[:, selected_features]
            tree = DecisionTree(min_samples = self.min_samples, max_depth = self.max_depth)
            tree.fit(X_bootstrapped, y_bootstrapped)
            tree.selected_features = selected_features #Assign the selected features to the root node
            self.trees.append(tree)

    def predict(self, X):
        """
        Predict the class labels for each instance in X
        """
        predictions = []
        for tree in self.trees:
            #Use only the selected features for prediction
            selected_features = tree.selected_features
            X_subset = X.iloc[:, selected_features]
            predictions.append(tree.predict(X_subset))
        predictions = np.array(predictions)
        predictions = predictions.astype(int)
        majority_votes = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis = 0, arr = predictions)
        return majority_votes

# 3. Hyperparameters tuning

In [6]:
# Use wandb to compare model with different hyperparameters
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmc0c0z[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
class GridSearchCV:
    """
    Grid search to find the best hyperparameters
    """
    def __init__(self, estimator, param_grid, cv = 3):
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv
    
    def split_data(self, X, y, i):
        """
        Split the data into training and validating sets
        """
        n = len(X)
        test_indices = list(range(i * n // self.cv, (i + 1) * n // self.cv))
        train_indices = list(set(range(n)) - set(test_indices))
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
        return X_train, y_train, X_test, y_test
    
    def accuracy(self, y_true, y_pred):
        """
        Function to calculate accuracy
        """
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

    # Hàm đệ quy để tạo ra tất cả các tổ hợp
    def generate_combinations(self, param_values, index=0, combination=[]):
        if index == len(param_values):
            yield tuple(combination)
        else:
            for value in param_values[index]:
                new_combination = combination + [value]
                yield from self.generate_combinations(param_values, index + 1, new_combination)

    def fit(self, X, y):
        """
        Fit the grid search to the training data
        """
        best_score = 0
        self.entries = []
        
        # Generate all combinations of hyperparameters
        combinations = self.generate_combinations(list(self.param_grid.values()))
        
        for combination in combinations:
            print(combination)
            params = {key: value for key, value in zip(self.param_grid.keys(), combination)}
            # Initialize wandb run
            run = wandb.init(
                project=self.estimator.__class__.__name__,
                config={**params}
            )

            scores = []
            for i in range(self.cv):
                X_train, y_train, X_test, y_test = self.split_data(X, y, i)
                # Set hyperparameters for the estimator
                self.estimator.set_params(**params)
                self.estimator.fit(X_train, y_train)
                y_pred = self.estimator.predict(X_test)
                score = self.accuracy(y_test, y_pred)
                scores.append(score)

            mean_score = np.mean(scores)
            if mean_score > best_score:
                best_score = mean_score
                best_params = {**params}

            # Log mean accuracy for current hyperparameters
            wandb.log({"accuracy": mean_score})
            self.entries.append({"estimator_name": self.estimator.__class__.__name__,
                                **params,
                                "accuracy": mean_score
                            })
                            
        # Finish the current run
        wandb.finish()
        self.best_params = best_params
        self.best_score = best_score
        self.estimator.set_params(**best_params)
        self.estimator.fit(X, y)
        return self.estimator


# 4. Conduct experiment on winequality dataset

In [8]:
#Conduct experiments on the dataset
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('winequality-red.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Decision Tree Hyperparameters Tuning

In [10]:
dt_base_model = DecisionTree()
dt_base_model.fit(X_train, y_train)
dt_base_model_pred = dt_base_model.predict(X_test)

dt_model = DecisionTree()
param_grid = {
    'min_samples': [2, 5, 10],
    'max_depth': [1, 2, 5, 10, 50],
}
grid_search = GridSearchCV(dt_model, param_grid, cv = 5)
dt_best_model = grid_search.fit(X_train, y_train)
dt_best_model_pred = dt_best_model.predict(X_test)

result_dt = pd.DataFrame(grid_search.entries)

base_accuracy = accuracy(dt_base_model_pred, y_test)
best_accuracy = accuracy(dt_best_model_pred, y_test)
print(f'Base model accuracy: {base_accuracy:.2f}')
print(f'Best model accuracy: {best_accuracy:.2f}')
print(f'Improvement of {100 * (best_accuracy - base_accuracy) / base_accuracy:.2f}%')

(2, 1)


(2, 2)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.559


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(2, 5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.5707


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(2, 10)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.59652


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

(2, 50)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.58399


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(5, 1)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.59338


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(5, 2)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.559


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(5, 5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.5707


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(5, 10)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.59652


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(5, 50)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.57383


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

(10, 1)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.58477


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(10, 2)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.559


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(10, 5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.5707


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(10, 10)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.59496


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(10, 50)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.55429


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

VBox(children=(Label(value='0.001 MB of 0.008 MB uploaded\r'), FloatProgress(value=0.13118841267956785, max=1.…

0,1
accuracy,▁

0,1
accuracy,0.55508


Base model accuracy: 0.55
Best model accuracy: 0.61
Improvement of 10.86%


## Random Forest Hyperparameters Tuning

In [11]:
rfr_base_model = RandomForest()
rfr_base_model.fit(X_train, y_train)
rfr_base_model_pred = rfr_base_model.predict(X_test)

rfr_model = RandomForest()
param_grid = {
    'bootstrap': [True, False],
    'n_trees': [2, 100],
    'min_samples': [2, 5],
    'max_depth': [2, 10],
    'max_features': ['log2', 'sqrt', 0.5]
}
grid_search = GridSearchCV(rfr_model, param_grid, cv=5)
rfr_best_model = grid_search.fit(X_train, y_train)
rfr_best_model_pred = rfr_best_model.predict(X_test)

result_rfs = pd.DataFrame(grid_search.entries)

base_accuracy = accuracy(rfr_base_model_pred, y_test)
best_accuracy = accuracy(rfr_best_model_pred, y_test)
print(f'Base model accuracy: {base_accuracy:.2f}')
print(f'Best model accuracy: {best_accuracy:.2f}')
print(f'Improvement of {100 * (best_accuracy - base_accuracy) / base_accuracy:.2f}%')

(True, 2, 2, 2, 'log2')


(True, 2, 2, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.58478


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 2, 2, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.58949


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(True, 2, 2, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.6059


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 2, 2, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.64028


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 2, 2, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.69423


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 2, 5, 2, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.73335


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

(True, 2, 5, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.73727


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 2, 5, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.72554


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 2, 5, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.70598


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 2, 5, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.71457


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(True, 2, 5, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.73179


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(True, 100, 2, 2, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75056


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 100, 2, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.63877


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 100, 2, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.62312


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 100, 2, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.61765


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(True, 100, 2, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.64733


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(True, 100, 2, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.7091


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

(True, 100, 5, 2, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75055


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(True, 100, 5, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75525


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(True, 100, 5, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.73492


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(True, 100, 5, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.71539


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(True, 100, 5, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.72475


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(True, 100, 5, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.73882


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 2, 2, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75524


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 2, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 2, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(False, 2, 2, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 2, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 2, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 5, 2, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76306


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(False, 2, 5, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 5, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 2, 5, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

(False, 2, 5, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(False, 2, 5, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(False, 100, 2, 2, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.76228


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(False, 100, 2, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75759


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

(False, 100, 2, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.74274


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 100, 2, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.73414


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 100, 2, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.73492


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 100, 2, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.74899


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888278356, max=1.0…

(False, 100, 5, 2, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75681


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

(False, 100, 5, 2, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75915


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

(False, 100, 5, 2, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.7529


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 100, 5, 10, 'log2')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.74196


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 100, 5, 10, 'sqrt')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.74742


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

(False, 100, 5, 10, 0.5)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75368


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888957186, max=1.0)…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁

0,1
accuracy,0.75837


Base model accuracy: 0.53
Best model accuracy: 0.61
Improvement of 15.48%


# 5. Experimental results of compared models with different hyper-parameters

In [12]:
result_dt.sort_values(by='accuracy', ascending=False)

Unnamed: 0,estimator_name,min_samples,max_depth,accuracy
2,DecisionTree,2,5,0.596523
7,DecisionTree,5,5,0.596523
12,DecisionTree,10,5,0.594957
4,DecisionTree,2,50,0.593376
9,DecisionTree,5,50,0.584767
3,DecisionTree,2,10,0.583992
8,DecisionTree,5,10,0.573827
1,DecisionTree,2,2,0.570695
6,DecisionTree,5,2,0.570695
11,DecisionTree,10,2,0.570695


In [13]:
result_rfs.sort_values(by='accuracy', ascending=False)

Unnamed: 0,estimator_name,bootstrap,n_trees,min_samples,max_depth,max_features,accuracy
29,RandomForest,False,2,2,10,0.5,0.763061
24,RandomForest,False,2,2,2,log2,0.762279
25,RandomForest,False,2,2,2,sqrt,0.762279
35,RandomForest,False,2,5,10,0.5,0.762279
34,RandomForest,False,2,5,10,sqrt,0.762279
33,RandomForest,False,2,5,10,log2,0.762279
32,RandomForest,False,2,5,2,0.5,0.762279
31,RandomForest,False,2,5,2,sqrt,0.762279
30,RandomForest,False,2,5,2,log2,0.762279
28,RandomForest,False,2,2,10,sqrt,0.762279
