In [1]:
import optuna
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score, cross_validate
import joblib
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# One shared seed drives both NumPy's and Python's global random number
# generators so that repeated runs draw the same random numbers.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def Rescaling_experiments(data, numeric_cols, scaling_method):
    """Rescale the selected columns of ``data`` using the scaler picked by code.

    Supported ``scaling_method`` codes:

        0 - no scaling (``data`` is returned untouched)
        1 - MaxAbsScaler
        2 - StandardScaler
        3 - MinMaxScaler
        4 - RobustScaler
        5 - QuantileTransformer with a uniform output distribution
        6 - QuantileTransformer with a normal output distribution

    Args:
        data: Table whose ``numeric_cols`` columns are rescaled. The columns
            are overwritten IN PLACE; pass ``data.copy()`` to keep the
            original values.
        numeric_cols: Column labels to rescale.
        scaling_method: Integer code from the list above.

    Returns:
        The same ``data`` object that was passed in, after rescaling.

    Raises:
        ValueError: If ``scaling_method`` is not one of the supported codes.
            (Previously an unknown code silently returned unscaled data.)
    """
    if scaling_method == 0:  # No scaling requested
        return data

    # Factories are wrapped in lambdas so a scaler is only built when selected.
    scaler_factories = {
        1: lambda: MaxAbsScaler(),
        2: lambda: StandardScaler(),
        3: lambda: MinMaxScaler(),
        4: lambda: RobustScaler(),
        5: lambda: QuantileTransformer(output_distribution='uniform', random_state=SEED),
        6: lambda: QuantileTransformer(output_distribution='normal', random_state=SEED),
    }

    if scaling_method not in scaler_factories:
        raise ValueError(
            f"Unknown scaling_method {scaling_method!r}; expected an integer from 0 to 6."
        )

    scaler = scaler_factories[scaling_method]()
    # The scaler is fit on exactly the rows supplied here and then discarded.
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
    return data


In [3]:
# Read the raw table from disk.
data = pd.read_csv('dataset.csv')

# Columns that receive numeric rescaling.
numeric_cols = ['age', 'educational-num', 'hours-per-week']

# Code 6 selects the QuantileTransformer with a normal output distribution.
scaling_method = 6

# Rescale a copy so that `data` keeps the raw values.
# NOTE(review): the scaler is fit on every row before the train/test split
# below, so test rows influence the fitted transformation (data leakage).
# Fitting on the training rows only would require keeping the fitted scaler.
processed_data = Rescaling_experiments(
    data.copy(),
    numeric_cols=numeric_cols,
    scaling_method=scaling_method,
)

# 'income' is the target; every other column is used as a feature.
y = processed_data['income']
x = processed_data.drop(columns=['income'])

# Hold out 30% of the rows for testing; the split is seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=SEED,
)

print("Data processed and split into training and testing ")


Data processed and split into training and testing 


In [5]:
# Define RandomForestClassifier with best parameters
# NOTE: the alias is named DTC but refers to a random forest, not a decision tree.
DTC = RandomForestClassifier
best_model = DTC(
    n_estimators=150,          # Number of trees in the forest
    criterion='gini',          # Split quality measure: Gini impurity
    max_depth=13,              # Maximum depth of each tree
    min_samples_split=4,       # Minimum number of samples required to split an internal node
    min_samples_leaf=2,        # Minimum number of samples required to be at a leaf node
    max_features='sqrt',       # Number of features to consider for splitting at each node ('sqrt' = square root of features)
    bootstrap=True,            # Whether bootstrap samples are used when building trees
    ccp_alpha=0.0,             # Complexity parameter for pruning (0.0 = no pruning)
    class_weight=None,         # Weights associated with classes (None = all classes are weighted equally)
    random_state=SEED,         # Seed for reproducibility (was missing: bootstrap/feature sampling was unseeded)
)

# Fit the forest on the training split.
best_model.fit(X_train, y_train)
print("Model trained successfully.")


Model trained successfully.


In [6]:
# Label the held-out rows with the fitted forest.
prediction = best_model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=prediction)
print(f"Accuracy on Test Set: {acc_score}")


Accuracy on Test Set: 0.8468572988466526


In [8]:
# Single place that names the file the model is persisted to.
MODEL_PATH = 'Random_forest.joblib'

# Write the fitted forest to disk.
joblib.dump(best_model, MODEL_PATH)
print("Model saved as 'Random_forest.joblib'.")

# Read it back into `model`.
# joblib files are pickle-based, so only load files from a trusted source.
model = joblib.load(MODEL_PATH)
print("Model loaded successfully.")


Model saved as 'Random_forest.joblib'.
Model loaded successfully.


In [9]:
# Predict with new data: a single row of feature values, listed in the same
# order as the training columns.
# NOTE(review): several entries (e.g. 29.0, 10.0, 60.0) look like raw,
# unscaled values, while the training features were rescaled before fitting.
# Confirm that new rows go through the same scaling as the training data.
new_data = [[29.0, 32.0, 10.0, 1, 0.0, 0, 60.0, 0, 0.0, 0.0, 0.0,
             1.0, -1.0684422957824236, 0.4608239854737111, 0.18642692922290538,
             -0.35369893417798676, -0.057252251562758275, -0.12196223718457576,
             0.003853627562193318, 0.7764769793931923, -0.05051009021640369,
             0.13568970638338268]]

# The model was fitted on a DataFrame with named columns, so attach the same
# column labels here. This makes the feature alignment explicit and fails
# loudly if the row length does not match the training columns.
new_frame = pd.DataFrame(new_data, columns=X_train.columns)

prediction = best_model.predict(new_frame)
print(f"Prediction for new data: {prediction[0]}")


Prediction for new data: 1
