In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# Load the iris dataset and keep only the last two feature columns
# (petal length and petal width) as the model inputs.
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

# Fix random_state so the fitted tree is identical on every run: scikit-learn
# permutes the features at each split, so when two candidate splits are equally
# good the chosen one can otherwise differ between runs.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

In [2]:
from sklearn.tree import export_graphviz

# Destination of the Graphviz description of the fitted tree
# (a relative path, so it is written to the current working directory).
output_file_path = "iris_tree.dot"

# Label the nodes with the two petal feature names and the class names, and
# draw them as filled boxes with rounded corners.
petal_feature_names = iris.feature_names[2:]
export_graphviz(
    tree_clf,
    out_file=output_file_path,
    filled=True,
    rounded=True,
    class_names=iris.target_names,
    feature_names=petal_feature_names,
)


In [4]:
# Estimating class probabilities
# Query the fitted tree with a single sample [5, 1.5]; the two values follow
# the training column order (petal length, petal width). The result has one
# row per sample and one probability per class.
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [5]:
# Making predictions
# Predict the class label for the sample [5, 1.5] (petal length, petal width).
tree_clf.predict([[5, 1.5]])

array([1])

In [10]:
#Using decision trees for regression:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
# Synthetic 1-D regression data: 100 points with x uniform on [0, 2) and
# y = 4 + 3x plus standard-normal noise.
# Draw from a locally seeded generator so the data (and therefore the fitted
# tree) are the same on every run, without touching NumPy's global random state.
rng = np.random.RandomState(42)
X_r = 2 * rng.rand(100, 1)
y_r = 4 + 3 * X_r + rng.randn(100, 1)

# Shallow regression tree fitted to the noisy linear data.
tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X_r, y_r)

In [11]:
#Exercise one
from sklearn.datasets import make_moons

# Build the moons dataset: 10,000 noisy samples, seeded for reproducibility.
# Note that this rebinds X and y, replacing the iris arrays defined earlier.
X, y = make_moons(
    n_samples=10000,
    random_state=42,
    noise=0.4,
)


In [12]:
#Splitting the dataset:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [19]:
#Running a grid search to find an appropriate model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Local import: clone() builds an unfitted copy of an estimator with the same
# hyperparameters.
from sklearn.base import clone

# Hyperparameter grid explored by the search (every combination is tried).
param_grid = {
    'criterion': ['entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50]
}

dt_classifier = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best configuration found by the search, evaluated on the held-out test set.
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)

# Train a *separate* copy on all of the data. Calling best_model.fit(X, y)
# directly would refit best_model in place (fit returns the same object), so
# best_model would silently end up trained on the test rows as well.
best_model_full = clone(best_model).fit(X, y)

# NOTE: X contains X_test, so this score is measured on rows the model was
# trained on; it is optimistic and not an estimate of generalization.
full_accuracy = best_model_full.score(X_test, y_test)
print("Test accuracy:", test_accuracy, "------- Full accuracy:", full_accuracy)

Test accuracy: 0.8635 ------- Full accuracy: 0.869


In [29]:
#Growing a forest:


#first obtain some subsets
from sklearn.model_selection import ShuffleSplit
# Local import: clone() builds an unfitted copy of an estimator with the same
# hyperparameters.
from sklearn.base import clone

# Draw 1,000 random subsets of 100 training instances each.
shuffle_split = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)
subsets = []
trees_accuracy = []
trees = []
# The second element of each split (the remaining indices) is not needed; it
# gets a descriptive name rather than `_`, which IPython uses for the last output.
for train_indices, _rest_indices in shuffle_split.split(X_train):
    x_subset = X_train[train_indices]
    y_subset = y_train[train_indices]
    subsets.append(x_subset)
    # Fit a fresh copy for every subset. Re-fitting best_model itself would
    # return the same object each time, leaving `trees` with 1,000 references
    # to a single estimator trained only on the last subset.
    subset_model = clone(best_model).fit(x_subset, y_subset)
    # Score on the held-out test set; X and y also contain the training rows.
    trees_accuracy.append(subset_model.score(X_test, y_test))
    trees.append(subset_model)

In [56]:
# One array of test-set predictions per tree, in the same order as `trees`.
predictions = [fitted_tree.predict(X_test) for fitted_tree in trees]

In [57]:
import numpy as np
from scipy.stats import mode
# Majority vote: the most frequent predicted label for each test sample,
# taken across trees (axis 0 indexes the trees).
# Read the named result field instead of unpacking into `_`, which in IPython
# is the "last output" variable and stops being updated once it is reassigned.
majority_vote_predictions = mode(predictions, axis=0).mode

In [58]:
# Flatten the vote result to a 1-D array with one label per test sample.
# NOTE(review): presumably needed because some SciPy versions return the mode
# with a leading length-1 axis — confirm against the installed version.
majority_vote_predictions = majority_vote_predictions.ravel()

In [59]:
# Display the ensemble's majority-vote labels (NumPy abbreviates long arrays).
# NOTE(review): the visible cells never compare these labels with y_test, so
# the ensemble's accuracy is not reported here.
majority_vote_predictions

array([1, 0, 0, ..., 0, 0, 1], dtype=int64)