In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Load the pre-processed dataset from disk and preview its first rows.
DATA_FILE = 'cleaned_labelEncoded_PCA_adult.csv'
data = pd.read_csv(DATA_FILE)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,income
0,0.585612,-0.708951,-0.394252,0.723986,-1.186807,-0.70648,-2.813949,-0.829748,1.108792,-0.571561,0
1,0.666157,-0.794009,0.128526,-0.026741,-1.777371,-1.184097,-1.460062,-0.316101,0.109311,-1.100819,0
2,1.003859,-0.362138,0.025417,-0.306432,0.475741,-0.694615,0.313164,0.254932,-0.710069,0.140799,0
3,0.759383,1.577882,1.518623,-0.400051,-0.745707,-1.069425,0.67821,-0.063624,0.217335,-2.149402,0
4,-2.779713,3.008856,3.935919,0.27565,0.486809,1.121522,-0.688891,0.22554,-0.897165,2.750224,0


In [None]:
# Tune a random forest with a cross-validated grid search, then evaluate the
# best configuration on a held-out test split.
# All imports used here live in the notebook's first (imports) cell.

# Single seed shared by the split and the model so the run is reproducible.
RANDOM_STATE = 42

# Target column.
y = data['income']

# Feature matrix. 'Unnamed: 0' is the row index that was saved into the CSV
# (see the data.head() preview); it carries no signal and must not be used as
# a feature. errors='ignore' keeps the cell working if that column is absent
# (e.g. the file was read with index_col=0).
X = data.drop(columns=['income', 'Unnamed: 0'], errors='ignore')

# No one-hot encoding step: the remaining columns shown in data.head() are
# already numeric components, so pd.get_dummies would leave them unchanged.

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Base estimator whose hyper-parameters are searched below.
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)

# 3 * 4 * 3 * 3 = 108 candidate configurations.
parameter_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive search with 3-fold cross-validation on the training split only.
# verbose=1 still reports the number of fits without printing a line per fit.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=parameter_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the full training split; no extra fit is needed here.
best_rf_model = grid_search.best_estimator_

# Evaluate on the held-out test split.
predictions = best_rf_model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

report = classification_report(y_test, predictions)
print("Classification Report:")
print(report)

# Impurity-based importance of each input column of the fitted forest.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_rf_model.feature_importances_,
})
print("Feature Importance:")
print(feature_importance)


Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [None]:
# NOTE: this cell only held a commented-out copy of the grid search (same
# parameter grid, GridSearchCV call and best-parameter printout) that the
# modelling cell above already runs. It contains no live code and can be
# deleted when the notebook is tidied.

# Confusion Matrix

In [None]:
# Confusion matrix of the test-set predictions, printed and drawn as a heatmap.
# confusion_matrix is imported in the notebook's first (imports) cell.

# Fix the class order explicitly (sorted labels seen in either array, which is
# also scikit-learn's default) so the matrix rows/columns and the heatmap tick
# labels are guaranteed to refer to the same classes.
class_labels = np.unique(
    np.concatenate([np.asarray(y_test), np.asarray(predictions)])
)

# Rows are actual classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, predictions, labels=class_labels)

print("Confusion Matrix:")
print(conf_matrix)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Random Forest Classifier')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()
