In [27]:
import optuna
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score, cross_validate
import joblib
import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# One seed shared by the stdlib and NumPy global generators so reruns
# start from the same random state.
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)


In [28]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of a random forest.

    Samples one RandomForestClassifier configuration from the search space
    below and scores it on the module-level ``X_train`` / ``y_train``, which
    must be assigned before the study is run.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the 5 folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500, step=100),  # 100, 200, ..., 1500
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 10, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        # Single choice: effectively a constant, but still recorded with the trial's parameters
        'bootstrap': trial.suggest_categorical('bootstrap', [True]),
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.01, step=0.001),  # 0.000, 0.001, ..., 0.010
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        # Not sampled through `trial`, so it is NOT part of study.best_params;
        # pass it again when rebuilding the best model. Uses the notebook-wide
        # seed instead of a second hard-coded 42.
        'random_state': SEED,
    }

    # Score with cross-validation on the training split rather than a single hold-out
    clf = RandomForestClassifier(**params, n_jobs=-1)
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

    # Optuna optimizes the mean fold accuracy
    return scores.mean()

In [29]:
def Rescaling_experiments(data, numeric_cols, scaling_method):
    """Rescale ``numeric_cols`` of ``data`` with the scaler selected by code.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to rescale. The listed columns are overwritten in place, so
        pass a copy if the original values must be kept.
    numeric_cols : list of str
        Columns to replace with their rescaled values.
    scaling_method : int
        0 = no scaling, 1 = MaxAbsScaler, 2 = StandardScaler,
        3 = MinMaxScaler, 4 = RobustScaler,
        5 = QuantileTransformer (uniform), 6 = QuantileTransformer (normal).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    ValueError
        If ``scaling_method`` is not one of the codes 0-6. Previously an
        unknown code silently returned the data unscaled.

    Notes
    -----
    The scaler is fitted on every row of ``data``; rows that are later used
    as a test set therefore contribute to the fitted statistics.
    """
    if scaling_method == 0:  # No Scaling
        return data

    # Lambdas defer construction so only the requested scaler is ever built.
    scaler_factories = {
        1: lambda: MaxAbsScaler(),
        2: lambda: StandardScaler(),
        3: lambda: MinMaxScaler(),
        4: lambda: RobustScaler(),
        5: lambda: QuantileTransformer(output_distribution='uniform', random_state=SEED),
        6: lambda: QuantileTransformer(output_distribution='normal', random_state=SEED),
    }

    try:
        make_scaler = scaler_factories[scaling_method]
    except (KeyError, TypeError):
        # TypeError covers unhashable codes such as lists
        raise ValueError(
            f"Unknown scaling_method {scaling_method!r}; expected an integer from 0 to 6."
        ) from None

    data[numeric_cols] = make_scaler().fit_transform(data[numeric_cols])
    return data


In [25]:
# Load Dataset
data = pd.read_csv('dataset.csv')

# Specify numeric columns for scaling
numeric_cols = ['age', 'educational-num', 'hours-per-week']

# Apply rescaling (code 6 = QuantileTransformer with normal output).
# A copy is passed because Rescaling_experiments overwrites columns in place.
# NOTE(review): the scaler is fitted on all rows before the split below, so
# rows that end up in the test set influence the fitted transform.
scaling_method = 6
processed_data = Rescaling_experiments(data.copy(), numeric_cols, scaling_method)

# Split features and target
x = processed_data.drop(columns=['income'])
y = processed_data['income']

# Train-test split (70% train / 30% test, seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=SEED)

# Message completed: it previously ended at "...training and testing " with a trailing space
print("Data processed and split into training and testing sets.")


Data processed and split into training and testing sets.


In [26]:

# Display names for the scaling codes understood by Rescaling_experiments
scaling_methods = {
    0: "No Scaling",
    1: "MaxAbsScaler",
    2: "StandardScaler",
    3: "MinMaxScaler",
    4: "RobustScaler",
    5: "QuantileTransformer (Uniform)",
    6: "QuantileTransformer (Normal)"
}

# Results file; rewritten after every finished configuration (see below)
RESULTS_PATH = 'Dataset_3_run2.csv'
all_results = []

# Outer loop: Optuna trial budget (100, 200, ..., 600).
# Inner loop: scaling code 0-6. One study is run per (budget, scaling) pair.
for Trial in range(100, 700, 100):
    for scaling_method in range(0, 7):
        print(f"Testing Scaling Method: {scaling_methods[scaling_method]}")

        # Rescale a fresh copy so every configuration starts from the raw data
        processed_data = Rescaling_experiments(data.copy(), numeric_cols, scaling_method)
        x = processed_data.drop(columns=['income'])
        y = processed_data['income']

        # Single train-test split for final evaluation.
        # objective() reads X_train / y_train as globals, so they must be
        # reassigned here before the study is optimized.
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=SEED)

        # Hyperparameter optimization with cross-validation
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=SEED))
        study.optimize(objective, n_trials=Trial, show_progress_bar=True)

        # Train final model with best parameters.
        # best_params only holds the values sampled through the trial, so the
        # fixed random_state used during tuning has to be supplied again;
        # without it the evaluated forest is unseeded and not the tuned one.
        best_params = study.best_params
        best_model = RandomForestClassifier(**best_params, random_state=SEED, n_jobs=-1)

        # Perform k-fold cross-validation on the entire training set
        cv_scores = cross_validate(best_model, X_train, y_train, cv=5,
                                   scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'])

        # Final evaluation on test set
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred, average="weighted")
        test_f1 = f1_score(y_test, y_pred, average="weighted")
        test_recall = recall_score(y_test, y_pred, average="weighted")

        # Store both cross-validation and test results
        all_results.append({
            'Trial no': Trial,
            'Scaling_Method': scaling_methods[scaling_method],
            'CV_Accuracy_Mean': cv_scores['test_accuracy'].mean(),
            'CV_Accuracy_Std': cv_scores['test_accuracy'].std(),
            'Test_Accuracy': test_accuracy,
            'Test_Precision': test_precision,
            'Test_F1': test_f1,
            'Test_Recall': test_recall,
            'Best_Params': best_params
        })

        # Checkpoint: an interrupted run keeps every configuration finished so far
        pd.DataFrame(all_results).to_csv(RESULTS_PATH, index=False)

# Convert results to DataFrame
results_df = pd.DataFrame(all_results)
print("\nFinal Results Summary:")
results_df.to_csv(RESULTS_PATH, index=False)
print(results_df)


[I 2025-01-14 20:05:51,667] A new study created in memory with name: no-name-07b96ca7-e309-4703-967f-96d12d06223e


Testing Scaling Method: No Scaling


Best trial: 0. Best value: 0.765538:   1%|          | 1/100 [00:32<52:50, 32.03s/it]

[I 2025-01-14 20:06:23,697] Trial 0 finished with value: 0.7655384866927852 and parameters: {'n_estimators': 600, 'criterion': 'gini', 'max_depth': 64, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'log2', 'bootstrap': True, 'ccp_alpha': 0.006, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7655384866927852.


Best trial: 0. Best value: 0.765538:   2%|▏         | 2/100 [01:14<1:01:57, 37.93s/it]

[I 2025-01-14 20:07:05,766] Trial 1 finished with value: 0.7642224150435176 and parameters: {'n_estimators': 1500, 'criterion': 'gini', 'max_depth': 26, 'min_samples_split': 10, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'bootstrap': True, 'ccp_alpha': 0.003, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7655384866927852.


Best trial: 2. Best value: 0.768464:   3%|▎         | 3/100 [01:27<43:15, 26.76s/it]  

[I 2025-01-14 20:07:19,224] Trial 2 finished with value: 0.7684635024666424 and parameters: {'n_estimators': 500, 'criterion': 'entropy', 'max_depth': 81, 'min_samples_split': 11, 'min_samples_leaf': 26, 'max_features': 'sqrt', 'bootstrap': True, 'ccp_alpha': 0.006, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.7684635024666424.


Best trial: 3. Best value: 0.81807:   4%|▍         | 4/100 [02:01<47:08, 29.46s/it] 

[I 2025-01-14 20:07:52,839] Trial 3 finished with value: 0.8180701431738759 and parameters: {'n_estimators': 1500, 'criterion': 'gini', 'max_depth': 37, 'min_samples_split': 6, 'min_samples_leaf': 35, 'max_features': 'sqrt', 'bootstrap': True, 'ccp_alpha': 0.005, 'class_weight': None}. Best is trial 3 with value: 0.8180701431738759.


Best trial: 3. Best value: 0.81807:   5%|▌         | 5/100 [02:11<35:57, 22.71s/it]

[I 2025-01-14 20:08:03,583] Trial 4 finished with value: 0.7639589021130189 and parameters: {'n_estimators': 400, 'criterion': 'gini', 'max_depth': 57, 'min_samples_split': 28, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': True, 'ccp_alpha': 0.01, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.8180701431738759.


Best trial: 5. Best value: 0.829887:   6%|▌         | 6/100 [02:52<45:16, 28.90s/it]

[I 2025-01-14 20:08:44,489] Trial 5 finished with value: 0.8298867931373015 and parameters: {'n_estimators': 1400, 'criterion': 'entropy', 'max_depth': 14, 'min_samples_split': 17, 'min_samples_leaf': 20, 'max_features': 'log2', 'bootstrap': True, 'ccp_alpha': 0.003, 'class_weight': None}. Best is trial 5 with value: 0.8298867931373015.


Best trial: 5. Best value: 0.829887:   7%|▋         | 7/100 [03:01<34:40, 22.37s/it]

[I 2025-01-14 20:08:53,427] Trial 6 finished with value: 0.8085348106963884 and parameters: {'n_estimators': 300, 'criterion': 'gini', 'max_depth': 99, 'min_samples_split': 39, 'min_samples_leaf': 10, 'max_features': 'log2', 'bootstrap': True, 'ccp_alpha': 0.007, 'class_weight': None}. Best is trial 5 with value: 0.8298867931373015.


Best trial: 5. Best value: 0.829887:   8%|▊         | 8/100 [03:07<26:12, 17.09s/it]

[I 2025-01-14 20:08:59,211] Trial 7 finished with value: 0.7644855814603737 and parameters: {'n_estimators': 200, 'criterion': 'gini', 'max_depth': 88, 'min_samples_split': 32, 'min_samples_leaf': 17, 'max_features': 'log2', 'bootstrap': True, 'ccp_alpha': 0.003, 'class_weight': 'balanced'}. Best is trial 5 with value: 0.8298867931373015.


Best trial: 5. Best value: 0.829887:   9%|▉         | 9/100 [03:47<36:54, 24.33s/it]

[I 2025-01-14 20:09:39,459] Trial 8 finished with value: 0.7644563588098292 and parameters: {'n_estimators': 1400, 'criterion': 'gini', 'max_depth': 74, 'min_samples_split': 39, 'min_samples_leaf': 29, 'max_features': 'sqrt', 'bootstrap': True, 'ccp_alpha': 0.005, 'class_weight': 'balanced'}. Best is trial 5 with value: 0.8298867931373015.


Best trial: 5. Best value: 0.829887:   9%|▉         | 9/100 [03:53<39:23, 25.97s/it]


[W 2025-01-14 20:09:45,398] Trial 9 failed with parameters: {'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 38, 'min_samples_split': 26, 'min_samples_leaf': 46, 'max_features': 'log2', 'bootstrap': True, 'ccp_alpha': 0.008, 'class_weight': 'balanced'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\janbo\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\janbo\AppData\Local\Temp\ipykernel_23276\359978113.py", line 17, in objective
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\janbo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
   

KeyboardInterrupt: 

In [15]:
# Random forest built from a fixed, hand-entered hyperparameter set
DTC = RandomForestClassifier
final_params = dict(
    n_estimators=1400,
    criterion='entropy',
    max_depth=96,
    min_samples_split=50,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    ccp_alpha=0.0,
    class_weight=None,
    random_state=42,
)
best_model = DTC(**final_params)

# Fit on whichever X_train / y_train are currently defined in the kernel
best_model.fit(X_train, y_train)
print("Model trained successfully.")


Model trained successfully.


In [16]:
# Label the held-out split with the fitted forest
prediction = best_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true one
acc_score = accuracy_score(y_true=y_test, y_pred=prediction)
print(f"Accuracy on Test Set: {acc_score}")


Accuracy on Test Set: 0.8495871152664983


In [17]:
# Single source for the file name so the dump, the load and the message
# cannot drift apart.
MODEL_PATH = 'best_random_forest_model.joblib'

# Save the model
joblib.dump(best_model, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'.")

# Load the model
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load(MODEL_PATH)
print("Model loaded successfully.")


Model saved as 'best_random_forest_model.joblib'.
Model loaded successfully.


In [19]:
# Predict with new data
# One row of 22 feature values, assumed to follow the column order the model
# was fitted with.
# NOTE(review): confirm these values went through the same rescaling as the
# training data before relying on the prediction.
new_data = [[29.0, 32.0, 10.0, 1, 0.0, 0, 60.0, 0, 0.0, 0.0, 0.0, 
             1.0, -1.0684422957824236, 0.4608239854737111, 0.18642692922290538, 
             -0.35369893417798676, -0.057252251562758275, -0.12196223718457576, 
             0.003853627562193318, 0.7764769793931923, -0.05051009021640369, 
             0.13568970638338268]]

# Label the row with the feature names stored on the fitted model, so the
# names match at predict time and a wrong number of values fails in the
# DataFrame constructor. A bare list is matched purely by position and
# triggers a feature-name warning, which the global filter hides.
# Falls back to unnamed columns if the model stored no feature names.
new_frame = pd.DataFrame(new_data, columns=getattr(model, "feature_names_in_", None))

prediction = model.predict(new_frame)
print(f"Prediction for new data: {prediction[0]}")


Prediction for new data: 0
