<a href="https://colab.research.google.com/github/nupur-sng/AdultIncomeClassifier/blob/master/AdultClassifierModelBuilder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from joblib import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
import os
from sklearn.metrics import classification_report, accuracy_score

# Read the raw Adult income CSV from GitHub.
data = pd.read_csv('https://raw.githubusercontent.com/nupur-sng/CentralData/main/adult.csv')

# Mapping used to replace the dots in the original column names with underscores.
dotted_to_snake = {
    'education.num': 'education_num',
    'marital.status': 'marital_status',
    'capital.gain': 'capital_gain',
    'capital.loss': 'capital_loss',
    'hours.per.week': 'hours_per_week',
    'native.country': 'native_country',
}

# '?' cells become the literal string 'NA' (a regular value, not NaN),
# then the columns are renamed; `data` itself is left untouched.
df = data.replace('?', 'NA').rename(columns=dotted_to_snake)

# Features: every column except the label.
# Target: 1 where income is exactly '>50K', 0 for anything else.
X = df.drop(columns='income')
y = (df['income'] == '>50K').astype('int64')

X.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States


In [2]:
# Integer-encode every object-typed feature column with its own LabelEncoder.
#
# X is rebuilt from `df` on every run instead of being overwritten in place:
# after one in-place pass X has no object columns left, so re-running the cell
# would silently reset `label_encoders` to an empty dict and lose the mappings.
X = df.drop('income', axis=1)
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# column name -> fitted LabelEncoder; `classes_` holds the original labels in
# the order of their integer codes.
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])
    print(label_encoders[col].classes_)

print(label_encoders)

# Hold out 20% of the rows for testing; `stratify=y` keeps the label ratio
# the same in both splits and `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

['Federal-gov' 'Local-gov' 'NA' 'Never-worked' 'Private' 'Self-emp-inc'
 'Self-emp-not-inc' 'State-gov' 'Without-pay']
['10th' '11th' '12th' '1st-4th' '5th-6th' '7th-8th' '9th' 'Assoc-acdm'
 'Assoc-voc' 'Bachelors' 'Doctorate' 'HS-grad' 'Masters' 'Preschool'
 'Prof-school' 'Some-college']
['Divorced' 'Married-AF-spouse' 'Married-civ-spouse'
 'Married-spouse-absent' 'Never-married' 'Separated' 'Widowed']
['Adm-clerical' 'Armed-Forces' 'Craft-repair' 'Exec-managerial'
 'Farming-fishing' 'Handlers-cleaners' 'Machine-op-inspct' 'NA'
 'Other-service' 'Priv-house-serv' 'Prof-specialty' 'Protective-serv'
 'Sales' 'Tech-support' 'Transport-moving']
['Husband' 'Not-in-family' 'Other-relative' 'Own-child' 'Unmarried' 'Wife']
['Amer-Indian-Eskimo' 'Asian-Pac-Islander' 'Black' 'Other' 'White']
['Female' 'Male']
['Cambodia' 'Canada' 'China' 'Columbia' 'Cuba' 'Dominican-Republic'
 'Ecuador' 'El-Salvador' 'England' 'France' 'Germany' 'Greece' 'Guatemala'
 'Haiti' 'Holand-Netherlands' 'Honduras' 'Hong

In [3]:
from sklearn.model_selection import GridSearchCV

# Two-step pipeline: standardise the features, then fit a random forest.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Hyper-parameter grid; the 'rf__' prefix routes each entry to the 'rf' step.
param_grid = dict(
    rf__n_estimators=[50, 100, 150],
    rf__max_depth=[None, 10, 30],
    rf__min_samples_split=[2, 5, 10],
    rf__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validation accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Pipeline configured with the winning parameters, evaluated on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Copy of the test features with the true label and the prediction side by side
# (`actual` is aligned on the index of y_test, `predicted` is positional).
X_test_with_preds = X_test.assign(actual=y_test, predicted=y_pred)

# Rows where the prediction disagrees with the true label.
misclassified = X_test_with_preds.loc[
    lambda frame: frame['actual'].ne(frame['predicted'])
]

print("Number of misclassifications:", misclassified.shape[0])
print("Misclassified samples:\n", misclassified.head())

# Per-column summary statistics of the misclassified rows only.
misclassified_summary = misclassified.describe(include='all')
print("Summary of misclassified samples:\n", misclassified_summary)

# Overall quality on the held-out split.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Best Parameters: {'rf__max_depth': None, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
Best Cross-Validation Score: 0.8652102425707169
Number of misclassifications: 903
Misclassified samples:
        age  workclass  fnlwgt  education  education_num  marital_status  \
25487   54          7  137065         10             16               4   
16449   57          5  195835         11              9               2   
24453   37          7  160910         10             16               2   
30140   54          6  156800          9             13               2   
28598   32          4  102986          4              3               2   

       occupation  relationship  race  sex  capital_gain  capital_loss  \
25487           3             1     4    0             0             0   
16449           3             0     4    1             0             0   
24453          10             0     4    1             0             0   
30140           2         

In [4]:
# Strip the pipeline step prefix (e.g. 'rf__') from the tuned parameter names
# so they can be passed straight to the RandomForestClassifier constructor.
# maxsplit=1 keeps everything after the FIRST '__': a nested name such as
# 'rf__a__b' becomes 'a__b' instead of being truncated to 'a'.
params = {key.split('__', 1)[1]: value for key, value in best_params.items()}
print(params)

# Train on the full dataset with the best parameters.
# Same two steps as the searched pipeline; the forest step is named 'brf' here.
best_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('brf', RandomForestClassifier(**params, random_state=42))
])

# Fit the pipeline on the full dataset (train and test rows together)
best_pipeline.fit(X, y)

{'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [5]:
# Serialise the fitted pipeline with joblib under a relative file name (so it
# lands in the current working directory) and report the absolute path.
# NOTE(review): only `best_pipeline` is written; the `label_encoders` fitted
# earlier are not saved here — confirm consumers can reproduce that encoding.
model_filename = 'adult_Income_Classifier.pkl'
joblib.dump(value=best_pipeline, filename=model_filename)
print(f"Pipeline saved to {os.path.abspath(model_filename)}")

Pipeline saved to /workspaces/AdultIncomeClassifier/adult_Income_Classifier.pkl
