In [1]:
# 1. Imports
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import randint

# 2. Load processed data
data_path = "../data/processed/movie_metadata_processed.csv"
df = pd.read_csv(data_path, encoding='latin1')

# 3. Split features and target
X = df.drop(columns=['is_good'])
y = df['is_good']

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 5. Random Forest base
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# 6. Hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None] + list(range(5, 21)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# 7. Randomized search
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,          # number of random combinations to try
    cv=5,               # 5-fold cross-validation
    scoring='accuracy',
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# 8. Fit the search
random_search.fit(X_train, y_train)

# 9. Best model
best_rf = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)

# 10. Predict and evaluate
y_pred = best_rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 11. Feature importance
importances = pd.Series(best_rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop 10 Feature Importances:\n", importances.head(10))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=   4.9s
[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=   5.0s
[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=   4.4s
[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=   4.5s
[CV] END max_depth=14, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=199; total time=   1.0s
[CV] END max_depth=14, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=199; total time=   1.0s
[CV] END max_depth=14, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=199; total time=   1.3s
[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=6, n_estimators=20

40 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 1358, in wrapper
    estimator._validate_params()
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_c

Best parameters: {'max_depth': 11, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 408}
Accuracy: 0.819623389494549

Classification Report:
               precision    recall  f1-score   support

       False       0.84      0.90      0.87       653
        True       0.78      0.68      0.73       356

    accuracy                           0.82      1009
   macro avg       0.81      0.79      0.80      1009
weighted avg       0.82      0.82      0.82      1009


Confusion Matrix:
 [[586  67]
 [115 241]]

Top 10 Feature Importances:
 num_voted_users           0.300517
duration                  0.098199
title_year                0.094122
genre_Drama               0.091384
budget                    0.090419
gross                     0.049888
num_user_for_reviews      0.046883
genre_Documentary         0.046277
num_critic_for_reviews    0.042971
language_English          0.017172
dtype: float64
