In [1]:
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the dataset
X_train, y_train = load_svmlight_file("a9a.txt")
X_test, y_test = load_svmlight_file("a9a.t")

In [5]:
# Convert to dense arrays if they are in sparse format
X_train = X_train.toarray()
X_test = X_test.toarray()

In [7]:
# Splitting the data to create a validation set
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [9]:
# Parameters and distributions to sample from
param_dist = {
    'n_estimators': [100, 300],
    'bootstrap': [True, False],
    'max_depth': [None, 20],
    'min_samples_leaf': [1, 4],
    'min_impurity_decrease': [0.0, 0.01]
}

In [11]:
# Randomized search on hyperparameters
rf_random = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=32, cv=5, verbose=2, random_state=42, n_jobs=-1,
    return_train_score=True
)

In [13]:
# Find the best hyperparameters
rf_random.fit(X_train_part, y_train_part)
best_rf = rf_random.best_estimator_

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [15]:
# Evaluation on the test set
y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test Error Rate: {1 - test_accuracy:.2f}")
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

Test Accuracy: 0.85
Test Error Rate: 0.15
Test Classification Report:
               precision    recall  f1-score   support

        -1.0       0.88      0.93      0.90     12435
         1.0       0.72      0.58      0.64      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.75      0.77     16281
weighted avg       0.84      0.85      0.84     16281



In [17]:
# Output the training and validation error for the best parameters
best_params = rf_random.best_params_
best_cv_accuracy = rf_random.best_score_
print(f"Best Parameters found: {best_params}")
print(f"Best CV Accuracy: {best_cv_accuracy:.2f}")
print(f"Best CV Error Rate: {1 - best_cv_accuracy:.2f}")

Best Parameters found: {'n_estimators': 100, 'min_samples_leaf': 4, 'min_impurity_decrease': 0.0, 'max_depth': None, 'bootstrap': False}
Best CV Accuracy: 0.84
Best CV Error Rate: 0.16


In [19]:
# Assuming rf_random is a fitted RandomizedSearchCV object
results = rf_random.cv_results_

In [21]:
# Iterate over the results to print the mean train and validation error
for i in range(len(results['params'])):
    # Extract mean training and validation scores, then calculate the errors
    mean_train_score = np.mean([results[f'split{j}_train_score'][i] for j in range(5)])
    mean_test_score = np.mean([results[f'split{j}_test_score'][i] for j in range(5)])
    mean_train_error = 1 - mean_train_score
    mean_val_error = 1 - mean_test_score

    # Print the errors
    print(f"Parameters: {results['params'][i]}")
    print(f"Mean Train Error: {mean_train_error:.2f}")
    print(f"Mean Validation Error: {mean_val_error:.2f}")

Parameters: {'n_estimators': 100, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_depth': None, 'bootstrap': True}
Mean Train Error: 0.04
Mean Validation Error: 0.17
Parameters: {'n_estimators': 300, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_depth': None, 'bootstrap': True}
Mean Train Error: 0.04
Mean Validation Error: 0.17
Parameters: {'n_estimators': 100, 'min_samples_leaf': 4, 'min_impurity_decrease': 0.0, 'max_depth': None, 'bootstrap': True}
Mean Train Error: 0.13
Mean Validation Error: 0.16
Parameters: {'n_estimators': 300, 'min_samples_leaf': 4, 'min_impurity_decrease': 0.0, 'max_depth': None, 'bootstrap': True}
Mean Train Error: 0.13
Mean Validation Error: 0.16
Parameters: {'n_estimators': 100, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.01, 'max_depth': None, 'bootstrap': True}
Mean Train Error: 0.24
Mean Validation Error: 0.24
Parameters: {'n_estimators': 300, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.01, 'max_depth': None, 'bootstrap':