In [1]:

import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import QuantileTransformer
import pickle
import joblib
import warnings

warnings.filterwarnings("ignore")

# Set global random seeds for reproducibility
SEED = 42
random.seed(SEED)

np.random.seed(SEED)
quantile_transform = QuantileTransformer(output_distribution='normal', random_state=SEED)



In [2]:
def Rescaling_experiments(data, numeric_cols):
    
    data[numeric_cols] =quantile_transform.fit_transform(data[numeric_cols])

    # Save the fitted QuantileTransformer
    joblib.dump(quantile_transform, 'quantile_transform.joblib')

    print("QuantileTransformer saved successfully.")

    return data,quantile_transform

In [3]:
data = pd.read_csv('dataset.csv')


with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)
data[['age', 'educational-num', 'hours-per-week']] = np.round(scaler.inverse_transform(data[['age', 'educational-num', 'hours-per-week']]))
#print(data.head())
print(data.head(1))
# Specify numeric columns for scaling
numeric_cols = ['age', 'educational-num', 'hours-per-week']

# Apply rescaling (using QuantileTransformer Normal in this example)

#print(data.head())
processed_data,quantile_transform = Rescaling_experiments(data.copy(), numeric_cols)

# Split features and target
x = processed_data.drop(columns=['income'])
y = processed_data['income']
#print(x.head())
# Train-test split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=SEED)

print("Data processed and split into training and testing ")

    age  educational-num  gender  capital-gain  capital-loss  hours-per-week  \
0  25.0              7.0       1           0.0             0            40.0   

   native-country  income  race_Amer-Indian-Eskimo  race_Asian-Pac-Islander  \
0               1       0                      0.0                      0.0   

   ...  pca_component_1  pca_component_2  pca_component_3  pca_component_4  \
0  ...         0.376345          0.05599        -0.039835        -0.044926   

   pca_component_5  pca_component_6  pca_component_7  pca_component_8  \
0        -0.038786         0.003345          0.68041        -0.488017   

   pca_component_9  pca_component_10  
0        -0.429933          0.098483  

[1 rows x 23 columns]
QuantileTransformer saved successfully.
Data processed and split into training and testing 


In [4]:
# Define RandomForestClassifier with best parameters
DTC=RandomForestClassifier
best_model = DTC(
    n_estimators=150,         # Number of trees in the forest
    criterion='gini',       # Split quality: 'entropy' for information gain
    max_depth=13,              # Maximum depth of each tree
    min_samples_split=4,      # Minimum number of samples required to split an internal node
    min_samples_leaf=2,        # Minimum number of samples required to be at a leaf node
    max_features='sqrt',       # Number of features to consider for splitting at each node ('sqrt' = square root of features)
    bootstrap=True,            # Whether bootstrap samples are used when building trees
    ccp_alpha=0.0,             # Complexity parameter for pruning (0.0 = no pruning)
    class_weight=None       # Weights associated with classes (None = all classes are weighted equally)
                # Seed for reproducibility
)


best_model.fit(X_train,y_train)
print("Model trained successfully.")


Model trained successfully.


In [5]:
# Predict test set
prediction = best_model.predict(X_test)
print(prediction)
# Evaluate accuracy
acc_score = accuracy_score(y_test, prediction)
print(f"Accuracy on Test Set: {acc_score}")


[0 0 1 ... 0 0 0]
Accuracy on Test Set: 0.847062035078141


In [6]:
# Save the model
joblib.dump(best_model, 'Random_forest.joblib')
print("Model saved as 'Random_forest.joblib'.")

# Load the model
model = joblib.load('Random_forest.joblib')
print("Model loaded successfully.")


Model saved as 'Random_forest.joblib'.
Model loaded successfully.


In [9]:
# Load model and transformer
model = joblib.load('Random_forest.joblib')
quantile_transform = joblib.load('quantile_transform.joblib')

print("Model and QuantileTransformer loaded successfully.")

# Define new data
new_data = [[
    10.0, 7.0, 1, 100.0, 0, 40.0, 1, 0.0, 0.0, 1.0, 0.0, 0.0, 
    0.3758855297799032, 0.05869566078343598, -0.036472425974281894, 
    -0.044404460532722775, -0.03831725383596621, 0.004434248190865603, 
    0.6667730285930741, -0.5058580616247472, -0.42944474786818143, 
    0.10518273928865046
]]

# Define column names
columns = [
    "age", "educational-num", "gender", "capital-gain", "capital-loss", 
    "hours-per-week", "native-country", "race_Amer-Indian-Eskimo", 
    "race_Asian-Pac-Islander", "race_Black", "race_Other", "race_White", 
    "pca_component_1", "pca_component_2", "pca_component_3", 
    "pca_component_4", "pca_component_5", "pca_component_6", 
    "pca_component_7", "pca_component_8", "pca_component_9", 
    "pca_component_10"
]

# Convert to DataFrame
new_data_df = pd.DataFrame(new_data, columns=columns)

# Apply the saved QuantileTransformer
numeric_cols = ['age', 'educational-num', 'hours-per-week']
new_data_df[numeric_cols] = quantile_transform.transform(new_data_df[numeric_cols])

print("New data after scaling:")

# Use processed_data instead of new_data_df
prediction = model.predict(new_data_df)
print(f"Prediction for new data: {prediction[0]}")

Model and QuantileTransformer loaded successfully.
New data after scaling:
Prediction for new data: 1
