In [1]:
import os
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-vectorized feature matrix that an earlier step persisted with joblib.
# NOTE(review): joblib.load unpickles the file, so it should only ever be pointed
# at artifacts produced by this project -- confirm the data directory is trusted.
x_path = os.path.join("..", "data", "X_tfidf_all.pkl")
X_tfidf = joblib.load(filename=x_path)

In [3]:
# Load the target labels stored alongside the feature matrix.
# NOTE(review): presumably row-aligned with X_tfidf -- verify against the step
# that wrote both files.
y_path = os.path.join("..", "data", "y_all.pkl")
y = joblib.load(filename=y_path)

In [4]:
# Hold out 20% of the samples as a test set. Stratifying on y keeps the class
# proportions equal in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=1311,
    stratify=y,
)

In [5]:
# Hyper-parameter search space for the random forest.
# 2 * 3 * 2 * 2 * 1 = 24 candidate combinations in total.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20, 50],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=['balanced'],
)

In [6]:
# Base estimator: a random forest with a fixed seed so repeated runs are comparable.
model = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid using 3-fold cross-validation, ranked by
# macro-averaged F1. n_jobs=-1 runs the fits on all available cores and
# verbose=2 logs progress for each fit.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the training split only; the held-out
# test split is not touched here.
grid_search.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:
# Report the winning hyper-parameter combination and its mean cross-validated
# score (macro F1, as configured on the search object).
for _heading, _value in (
    ("Best Parameters:\n", grid_search.best_params_),
    ("\nBest Score:", grid_search.best_score_),
):
    print(_heading, _value)