In [9]:
import pandas as pd
import optuna


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score, confusion_matrix


In [6]:
X_train = pd.read_csv("data/x_train.csv", header=None)
X_test = pd.read_csv("data/x_text.csv", header=None)
y_train = pd.read_csv("data/y_train.csv", header=None)
y_test = pd.read_csv("data/y_test.csv", header=None)



print("X train ",X_train.shape)
print("X test",X_test.shape)
print("Y_Train", y_train.shape)
print("Y_test", y_test.shape)

X train  (3417, 1000)
X test (375, 1000)
Y_Train (3417, 1)
Y_test (375, 1)


In [7]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 100)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)

    return precision

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print(f"Best hyperparameters: {study.best_params}")
print(f"Best precision: {study.best_value}")

[I 2024-11-30 22:38:31,376] A new study created in memory with name: no-name-504d2e40-7954-47e1-a5fb-b21ed85d7cbf


  return fit_method(estimator, *args, **kwargs)
[I 2024-11-30 22:38:43,238] Trial 0 finished with value: 0.8986666666666666 and parameters: {'n_estimators': 73, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 15, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8986666666666666.
  return fit_method(estimator, *args, **kwargs)
[I 2024-11-30 22:38:46,711] Trial 1 finished with value: 0.8986666666666666 and parameters: {'n_estimators': 80, 'max_depth': 32, 'min_samples_split': 3, 'min_samples_leaf': 14, 'max_features': 'log2'}. Best is trial 0 with value: 0.8986666666666666.
  return fit_method(estimator, *args, **kwargs)
[I 2024-11-30 22:43:46,928] Trial 2 finished with value: 0.8986666666666666 and parameters: {'n_estimators': 55, 'max_depth': 27, 'min_samples_split': 15, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 0 with value: 0.8986666666666666.
  return fit_method(estimator, *args, **kwargs)
[I 2024-11-30 22:43:52,828] Trial 3 finished with value: 0

Best hyperparameters: {'n_estimators': 75, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Best precision: 0.9038461538461539


Best hyperparameters: {'n_estimators': 75, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Best precision: 0.9038461538461539

In [8]:
param = {'n_estimators': 75, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
model = RandomForestClassifier(
    **param,
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred)

  return fit_method(estimator, *args, **kwargs)


In [10]:

def classification_metrics(y_test, y_pred):
    print("="*50)
    print("The accuracy score", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 score:", f1_score(y_test, y_pred))
    print("="*50)
    print(classification_report(y_test, y_pred))
    print("="*50)
    print("confusion Matrix")
    print(confusion_matrix(y_test, y_pred))
    print("="*50)

classification_metrics(y_test, y_pred)

The accuracy score 0.8853333333333333
Precision: 0.9038461538461539
Recall: 0.9762611275964391
F1 score: 0.9386590584878743
              precision    recall  f1-score   support

          -1       0.27      0.08      0.12        38
           1       0.90      0.98      0.94       337

    accuracy                           0.89       375
   macro avg       0.59      0.53      0.53       375
weighted avg       0.84      0.89      0.86       375

confusion Matrix
[[  3  35]
 [  8 329]]
