## Boruta Feature Selection

In [None]:
# Work on an independent (deep) copy so the steps below do not modify
# `downsampled_data` itself.
data = downsampled_data.copy(deep=True)

In [None]:
# Target vector: the "encoded_type" column.
y = data["encoded_type"]

# Columns removed by name (the target itself plus six others).
# NOTE(review): presumably the other six describe the outcome too closely to
# be used as predictors — confirm.
named_exclusions = [
    "encoded_type",
    "Length",
    "Duration",
    "Severity",
    "TrafficJamNum",
    "HectometerStart",
    "HectometerEnd",
]

# Columns removed by dtype: object, datetime64 and category columns.
dtype_exclusions = data.select_dtypes(
    include=["object", "datetime64", "category"]
).columns.tolist()

# Feature matrix: every remaining column of `data`.
X = data.drop(columns=named_exclusions + dtype_exclusions)

In [None]:
# Per-column count of missing values in the feature matrix.
X.isnull().sum()

In [None]:
# Add one "shadow" column per original feature: the same values in a random
# row order.
shadow_prefix = "shadow_"

# Skip columns that are already shadows so that re-running this cell refreshes
# the existing shadow columns instead of adding "shadow_shadow_*" ones.
original_features = [
    col for col in X.columns if not str(col).startswith(shadow_prefix)
]

for col in original_features:
    # .to_numpy() drops the index so the permuted values are assigned by
    # position. Assigning a Series would be re-aligned on X's index labels;
    # if X does not have a 0..n-1 index, that leaves NaN in the shadow column
    # for every label the reset index does not contain.
    X[f"{shadow_prefix}{col}"] = X[col].sample(frac=1).to_numpy()
X.tail()

In [None]:
# patch_sklearn() must run BEFORE the scikit-learn names are imported: names
# bound by an earlier `from sklearn... import ...` keep pointing at the stock
# implementations, so the import order below is deliberate.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def get_important_features(X, y):
    """Return the features that beat every shadow feature in one forest fit.

    Fits a ``RandomForestClassifier(max_depth=20)`` on ``X`` and ``y`` and
    keeps the columns whose importance is strictly greater than the largest
    importance found among the shadow columns (columns whose name starts
    with ``"shadow_"``). No ``random_state`` is passed to the forest, so
    repeated calls may return different selections.

    Args:
        X: Feature table exposing ``columns``; it must contain the original
            features together with their ``shadow_``-prefixed copies.
        y: Target labels passed to ``fit`` alongside ``X``.

    Returns:
        list: Names of the non-shadow columns that passed the threshold, in
        the order they appear in ``X.columns``.

    Raises:
        ValueError: If ``X`` has no ``shadow_``-prefixed column, because
            there is then no threshold to compare against.
    """
    shadow_prefix = "shadow_"

    def is_shadow(name):
        # Prefix match rather than substring match, so a real feature whose
        # name merely contains "shadow" is not mistaken for a shadow copy.
        return str(name).startswith(shadow_prefix)

    # Fail fast (before the expensive fit) when there is nothing to compare to.
    if not any(is_shadow(name) for name in X.columns):
        raise ValueError(
            f"X has no '{shadow_prefix}*' columns; add shadow features before "
            "calling get_important_features"
        )

    # Initialize the random forest classifier and fit it on the provided data
    rf = RandomForestClassifier(max_depth=20)
    rf.fit(X, y)

    # Map every column name to its importance
    importances = dict(zip(X.columns, rf.feature_importances_))

    # Importance of the most important shadow feature = selection threshold
    highest_shadow_importance = max(
        value for name, value in importances.items() if is_shadow(name)
    )

    # Original features that fulfil the Boruta selection criterion
    return [
        name
        for name, value in importances.items()
        if value > highest_shadow_importance and not is_shadow(name)
    ]

In [None]:
import tqdm
patch_sklearn()

# Repeat the selection TRIALS times and tally how often each column is chosen.
TRIALS = 50

# NOTE(review): the tally is keyed by data.columns, so columns that were
# dropped when building X are listed too and always stay at 0.
feature_hits = dict.fromkeys(data.columns, 0)

for _ in tqdm.tqdm(range(TRIALS)):
    for selected in get_important_features(X, y):
        # Names that are not columns of `data` are ignored.
        if selected in feature_hits:
            feature_hits[selected] += 1

In [None]:
# Show, per column, in how many of the TRIALS runs it was selected.
feature_hits

### Re-running the grid search on the RF

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each random-forest hyper-parameter.
# Every list must be non-empty: an empty sequence makes the grid search raise
# a ValueError before any model is fitted.
# NOTE(review): these are starting values only — adjust them to the ranges
# you actually want to search (3 * 3 * 3 * 3 = 81 combinations, 5 folds each).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# NOTE(review): `random_forest`, `X_train`/`X_test` and `y_train`/`y_test` are
# not defined in this cell, and nothing here restricts them to the features
# tallied in `feature_hits` — confirm that is intended.

# Exhaustive search over the grid with 5-fold cross-validation on all cores
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy}")