In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the .npy
# files read further down (under /content/drive/MyDrive/...) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
import graphviz

In [None]:
# Single place to edit when the data set lives somewhere else.
DATA_DIR = "/content/drive/MyDrive/mlinmb/2"

# Each split is stored as a feature matrix plus a matching label array.
feature_matrix_train = np.load(f"{DATA_DIR}/feature_matrix_train.npy")
label_array_train = np.load(f"{DATA_DIR}/label_array_train.npy")

feature_matrix_valid = np.load(f"{DATA_DIR}/feature_matrix_valid.npy")
label_array_valid = np.load(f"{DATA_DIR}/label_array_valid.npy")

feature_matrix_test = np.load(f"{DATA_DIR}/feature_matrix_test.npy")
label_array_test = np.load(f"{DATA_DIR}/label_array_test.npy")

# Fail fast on a truncated or mismatched file instead of deep inside a later fit().
for _split, _features, _labels in [
    ("train", feature_matrix_train, label_array_train),
    ("valid", feature_matrix_valid, label_array_valid),
    ("test", feature_matrix_test, label_array_test),
]:
    assert _features.shape[0] == _labels.shape[0], (
        f"{_split}: {_features.shape[0]} feature rows but {_labels.shape[0]} labels"
    )
    assert _features.shape[1] == feature_matrix_train.shape[1], (
        f"{_split}: feature count differs from the training split"
    )


1. Use all of the following techniques to train a classifier on the exon-intron data:
a. Decision tree (you may use the one you trained in HW2).
b. Random forest.
c. Extra trees.

In [None]:
# Baseline models with default hyper-parameters, scored on the validation split.

# Placeholder feature names, shared by the tree export and the importance listing.
feature_names = [f"Feature {i}" for i in range(feature_matrix_train.shape[1])]

# Decision Tree
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(feature_matrix_train, label_array_train)
dt_predictions = dt_clf.predict(feature_matrix_valid)
print("Decision tree accuracy:", accuracy_score(label_array_valid, dt_predictions))
print(classification_report(label_array_valid, dt_predictions))

dot_data = export_graphviz(
    dt_clf,
    out_file=None,
    feature_names=feature_names,
    # export_graphviz expects the class names in ascending class order, which
    # is exactly the order of classes_. Iterating a set gives no ordering
    # guarantee, so the labels could end up attached to the wrong class.
    class_names=[str(label) for label in dt_clf.classes_],
    filled=True,
    rounded=True
)
graph = graphviz.Source(dot_data)
# NOTE(review): view() renders the file and then tries to launch a viewer;
# on a headless runtime render() may be the better call -- confirm.
graph.view("decision_tree")

# Random Forest
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(feature_matrix_train, label_array_train)
rf_predictions = rf_clf.predict(feature_matrix_valid)
print("Random forest accuracy:", accuracy_score(label_array_valid, rf_predictions))
print(classification_report(label_array_valid, rf_predictions))

print("\nFeature importance:")
for score, name in zip(rf_clf.feature_importances_, feature_names):
    print(f"{name}: {round(score, 2)}")

# Extra Trees
et_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)
et_clf.fit(feature_matrix_train, label_array_train)
et_predictions = et_clf.predict(feature_matrix_valid)
print("Extra trees accuracy:", accuracy_score(label_array_valid, et_predictions))
print(classification_report(label_array_valid, et_predictions))


Decision tree accuracy: 0.8632352216141597
              precision    recall  f1-score   support

         0.0       0.82      0.83      0.83      7912
         1.0       0.89      0.89      0.89     12371

    accuracy                           0.86     20283
   macro avg       0.86      0.86      0.86     20283
weighted avg       0.86      0.86      0.86     20283

Random forest accuracy: 0.916185968545087
              precision    recall  f1-score   support

         0.0       0.90      0.89      0.89      7912
         1.0       0.93      0.94      0.93     12371

    accuracy                           0.92     20283
   macro avg       0.91      0.91      0.91     20283
weighted avg       0.92      0.92      0.92     20283


Feature importance:
Feature 0: 0.01
Feature 1: 0.01
Feature 2: 0.01
Feature 3: 0.01
Feature 4: 0.01
Feature 5: 0.0
Feature 6: 0.04
Feature 7: 0.0
Feature 8: 0.01
Feature 9: 0.03
Feature 10: 0.01
Feature 11: 0.0
Feature 12: 0.01
Feature 13: 0.0
Feature 14: 0.01

2. Experiment with at least three hyper-parameters controlling an ensemble algorithm (e.g.
a random forest) and at least three hyper-parameters controlling component estimators
within an ensemble (decision trees).
a. For decision trees (standing alone or as a component of an ensemble), experiment
with (i) maximum depth, (ii) minimum number of samples for a split, and (iii)
minimum number of samples in a leaf.
b. For random forests and extra-trees, experiment with (i) number of estimators, (ii)
maximum number of features, and (iii) maximum number of samples.
c. Use the validation data to evaluate different hyperparameter combinations.
d. Select the combination of hyperparameters that results in the highest accuracy on
the validation set (this accuracy score should be comparable to the one obtained
on the training set).

In [None]:
def _select_on_validation(estimator_cls, param_grid, **fixed_params):
    """Pick the grid point with the highest validation-set accuracy.

    One model per combination in ``param_grid`` is fitted on the training
    split and scored on the validation split, as the task asks. This replaces
    the earlier k-fold cross-validation on the training data alone.

    Parameters
    ----------
    estimator_cls : scikit-learn classifier class accepting ``random_state``.
    param_grid : dict mapping parameter name -> list of values to try.
    **fixed_params : keyword arguments passed unchanged to every candidate.

    Returns
    -------
    (best_model, best_params) : the fitted winner and its parameter dict.
    """
    best_model, best_params, best_score = None, None, -1.0
    for params in ParameterGrid(param_grid):
        candidate = estimator_cls(random_state=42, **fixed_params, **params)
        candidate.fit(feature_matrix_train, label_array_train)
        score = accuracy_score(label_array_valid, candidate.predict(feature_matrix_valid))
        # Strict ">" keeps the first of several tied candidates; only the
        # current best model is retained to limit memory use.
        if score > best_score:
            best_model, best_params, best_score = candidate, params, score
    return best_model, best_params


# Decision tree
dt_param_grid = {
    "max_depth": [5, 10, None],
    "min_samples_split": [20, 100, 200],
    "min_samples_leaf": [10, 50, 100]
}
dt_best, dt_best_params = _select_on_validation(DecisionTreeClassifier, dt_param_grid)
dt_predictions = dt_best.predict(feature_matrix_valid)
print("Best decision tree params:", dt_best_params)
print("Decision tree validation accuracy:", accuracy_score(label_array_valid, dt_predictions))
# Task 2d: the validation score should be comparable to the training score.
print("Decision tree training accuracy:", accuracy_score(label_array_train, dt_best.predict(feature_matrix_train)))

# Random forest
rf_param_grid = {
    "n_estimators": [50, 100, 200],
    "max_features": ["sqrt", "log2", None],
    "max_samples": [0.8, 0.9, None],
    "bootstrap": [True]  # max_samples is only meaningful with bootstrap sampling
}
# n_jobs=-1 only parallelises tree building across the available cores.
rf_best, rf_best_params = _select_on_validation(RandomForestClassifier, rf_param_grid, n_jobs=-1)
rf_predictions = rf_best.predict(feature_matrix_valid)
print("Best Random Forest Params:", rf_best_params)
print("Random Forest Validation Accuracy:", accuracy_score(label_array_valid, rf_predictions))
print("Random Forest Training Accuracy:", accuracy_score(label_array_train, rf_best.predict(feature_matrix_train)))

# Extra trees
et_param_grid = {
    "n_estimators": [50, 100, 200],
    "max_features": ["sqrt", "log2", None],
    "max_samples": [0.8, 0.9, None],
    "bootstrap": [True]
}
et_best, et_best_params = _select_on_validation(ExtraTreesClassifier, et_param_grid, n_jobs=-1)
et_predictions = et_best.predict(feature_matrix_valid)
print("Best extra trees params:", et_best_params)
print("Extra trees validation accuracy:", accuracy_score(label_array_valid, et_predictions))
print("Extra trees training accuracy:", accuracy_score(label_array_train, et_best.predict(feature_matrix_train)))

# Evaluation on test
for name, best_clf in [("Decision tree", dt_best), ("Random forest", rf_best), ("Extra trees", et_best)]:
    test_predictions = best_clf.predict(feature_matrix_test)
    print(f"{name} Test accuracy:", accuracy_score(label_array_test, test_predictions))


Best decision tree params: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 20}
Decision tree validation accuracy: 0.8860622195927624
Best Random Forest Params: {'bootstrap': True, 'max_features': None, 'max_samples': 0.8, 'n_estimators': 200}
Random Forest Validation Accuracy: 0.9174185278311887
Best extra trees params: {'bootstrap': True, 'max_features': None, 'max_samples': None, 'n_estimators': 200}
Extra trees validation accuracy: 0.9180101562885175
Decision tree Test accuracy: 0.889568132518241
Random forest Test accuracy: 0.9205284953658056
Extra trees Test accuracy: 0.9230920922894893


3. Merge the training and the validation data, train and optimize a random forest and extra
trees using the out-of-bag error. Repeat Step 4, except you do not have a separate
validation set. You will use the OOB error to find the best hyper parameter combination.

In [None]:
import time
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

# With the out-of-bag estimate no held-out split is needed, so the training
# and validation data are stacked into one fitting set.
feature_matrix_train_merge = np.vstack((feature_matrix_train, feature_matrix_valid))
label_array_train_merge = np.hstack((label_array_train, label_array_valid))

# NOTE(review): the task statement asks for a random forest and extra trees,
# while this search tunes a BaggingClassifier over decision trees. Bagging's
# max_features draws one feature subset per estimator, not per split.
bagging_param_grid = {
    "n_estimators": [100, 200, 500],
    "max_samples": [0.5, 0.8, 1.0],
    "max_features": [0.5, 0.8, 1.0],
    "bootstrap": [True],
    "bootstrap_features": [False, True]
}

# Running winner of the search (highest OOB accuracy seen so far).
best_oob_score = 0
best_params = None
best_model = None

# Track progress
start_time = time.time()
candidate_grid = list(ParameterGrid(bagging_param_grid))
total_combinations = len(candidate_grid)

for step, params in enumerate(candidate_grid, start=1):
    print(f"Training model {step}/{total_combinations} with params: {params}")
    model = BaggingClassifier(
        DecisionTreeClassifier(),
        n_jobs=-1,
        oob_score=True,
        random_state=42,
        **params
    )
    model.fit(feature_matrix_train_merge, label_array_train_merge)

    current_oob = model.oob_score_
    # Strictly greater: on a tie the earlier combination is kept.
    if current_oob > best_oob_score:
        best_oob_score, best_params, best_model = current_oob, params, model

    print(f"Completed model {step}/{total_combinations}, OOB Score: {current_oob}")

print("Best OOB params:", best_params)
print("Best OOB score:", best_oob_score)

# Score the OOB-selected ensemble on the test split.
y_pred = best_model.predict(feature_matrix_test)
test_accuracy = accuracy_score(label_array_test, y_pred)
print("OOB test accuracy:", test_accuracy)
print(f"Total execution time: {time.time() - start_time:.2f} seconds")


Training model 1/54 with params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100}
Completed model 1/54, OOB Score: 0.9192541139864584
Training model 2/54 with params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 200}
Completed model 2/54, OOB Score: 0.9203168481714399
Training model 3/54 with params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 500}
Completed model 3/54, OOB Score: 0.9206455288472073
Training model 4/54 with params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.8, 'n_estimators': 100}
Completed model 4/54, OOB Score: 0.9193088940990862
Training model 5/54 with params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.8, 'n_estimators': 200}
Completed model 5/54, OOB Score: 0.9203825843065934
Training model 6/54 

4. List the 10 most important k-mers using the random-forest classifier and the extra-trees
classifier. Are they similar or completely different? Provide the reason(s) why they
should be similar or different.

In [None]:
# Fit both ensembles on the merged train + validation data and compare which
# features each one ranks as most important.
rf_clf = RandomForestClassifier(random_state=42, n_estimators=200)
rf_clf.fit(feature_matrix_train_merge, label_array_train_merge)

et_clf = ExtraTreesClassifier(random_state=42, n_estimators=200)
et_clf.fit(feature_matrix_train_merge, label_array_train_merge)

# Feature importances: argsort is ascending, so reverse it and keep the first ten.
rf_top10 = np.argsort(rf_clf.feature_importances_)[::-1][:10]
et_top10 = np.argsort(et_clf.feature_importances_)[::-1][:10]

print("Random forest top 10 feature indices:", rf_top10)
print("Extra trees top 10 feature indices:", et_top10)

# Compare overlap. Converting to plain Python ints and printing them sorted
# keeps the output stable: a set of NumPy scalars prints in arbitrary order
# and, on NumPy >= 2, as np.int64(...) reprs.
overlap = {int(i) for i in rf_top10} & {int(i) for i in et_top10}
print(f"Overlap: {len(overlap)}")
print("In common:", sorted(overlap))


Random forest top 10 feature indices: [40 26 38 41 24 37  6 58  9 30]
Extra trees top 10 feature indices: [40 26 38 24  6 37 58 41 30 54]
Overlap: 9
In common: {26, 37, 38, 6, 40, 41, 24, 58, 30}


5. Evaluate the final five models (a decision tree, a random forest, and extra trees trained on the
training set, and a random forest and extra trees trained on the training and the
validation sets combined) on the test set.

In [None]:
def _fit_and_report(clf, train_features, train_labels, label):
    """Fit ``clf`` on the given data, print its test-set accuracy and return the test predictions."""
    clf.fit(train_features, train_labels)
    predictions = clf.predict(feature_matrix_test)
    print(f"{label} test accuracy:", accuracy_score(label_array_test, predictions))
    return predictions


# Three models trained on the training split only ...
dt_clf = DecisionTreeClassifier(random_state=42)
dt_test_predictions = _fit_and_report(
    dt_clf, feature_matrix_train, label_array_train, "Decision tree"
)

rf_clf_train = RandomForestClassifier(random_state=42, n_estimators=100)
rf_test_predictions_train = _fit_and_report(
    rf_clf_train, feature_matrix_train, label_array_train, "Random forest (train)"
)

et_clf_train = ExtraTreesClassifier(random_state=42, n_estimators=100)
et_test_predictions_train = _fit_and_report(
    et_clf_train, feature_matrix_train, label_array_train, "Extra trees (train)"
)

# ... and two ensembles trained on the merged train + validation data.
rf_clf_combined = RandomForestClassifier(random_state=42, n_estimators=100)
rf_test_predictions_combined = _fit_and_report(
    rf_clf_combined, feature_matrix_train_merge, label_array_train_merge,
    "Random forest (train + validation)"
)

et_clf_combined = ExtraTreesClassifier(random_state=42, n_estimators=100)
et_test_predictions_combined = _fit_and_report(
    et_clf_combined, feature_matrix_train_merge, label_array_train_merge,
    "Extra trees (train + validation)"
)


Decision tree test accuracy: 0.8671859593768487
Random forest (train) test accuracy: 0.9217116939459673
Extra trees (train) test accuracy: 0.9197396963123644
Random forest (train + validation) test accuracy: 0.9230920922894893
Extra trees (train + validation) test accuracy: 0.919443896667324



*   First, I loaded the data and trained a basic decision tree, random forest and extra trees classifier, and printed their metrics, plus the feature importances for the random forest, as shown in an example in the textbook.

*   Then, using grid search, I experimented with hyperparameters for each model. Extra trees achieved the best accuracy, and the best hyperparameters chosen for random forest and extra trees were similar; the only difference was the value of max_samples.

*  Then, after merging the train and validation sets, I ran BaggingClassifier to obtain the OOB error for the different combinations of parameters. Every run achieved an accuracy of more than 90%, and the differences were insignificant.


*   Then I compared top 10 most important features and found out that 9/10 are the same for random forest and extra trees.


*   Finally, I trained and evaluated 5 models on the train or train + validation sets. The best accuracy was achieved by the random forest trained on the merged sets, and the visibly lowest by the decision tree. That confirms that ensemble learning is beneficial for prediction accuracy.







---



Knowledge:
1. Read the slides about ensemble methods up to feature importance.
2. Answer the following questions in the same Jupyter notebook that includes the
classifiers described above, each in one cell:

a. If you have trained five different models on the exact same training data, and
they all achieve 95% precision, is there any chance that you can combine these
models to get better results? If so, how? If not, why?

b. What is the difference between hard and soft voting classifiers?

c. Is it possible to speed up training of a bagging ensemble by distributing it across
multiple servers? What about pasting ensembles, boosting ensembles, or random
forests?

d. What is the benefit of out-of-bag evaluation?

e. What makes extra-trees ensembles more random than regular random forests?
How can this extra randomness help? Are extra-trees classifiers slower or faster
than regular random forests?

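a. Yes, by combining their predictions using stacking or voting it is possible to get better results, because of the wisdom-of-the-crowd effect. This works best when the models are different enough that they make different (weakly correlated) errors.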

b. Hard voting - the class that gets the most votes is the predicted one

Soft voting - the final prediction is made by averaging probabilities provided by each classifier

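c. Yes. Bagging ensembles, pasting ensembles and random forests can be trained in parallel, because each predictor is trained independently on its own sample of the training data. Boosting ensembles cannot be distributed in the same way, because each predictor is built on the results of the previous one, so training is sequential.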

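d. We don't need a separate validation set: each predictor is evaluated on the training instances it did not see during training (its out-of-bag instances), so all of the data can be used for training.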

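e. They use random thresholds for every feature rather than searching for the best one (the threshold that minimizes the impurity of the split), as regular trees do. This extra randomness acts as a form of regularization: it trades slightly higher bias for lower variance, which can reduce overfitting. I think they are faster, because they don't need to optimize the split as much as regular trees do.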