<a href="https://colab.research.google.com/github/nupur-sng/AdultIncomeClassifier/blob/master/AdultClassifierModelBuilder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from joblib import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
import os
from sklearn.metrics import classification_report, accuracy_score

# Load the Adult income dataset from the shared data repository.
data = pd.read_csv('https://raw.githubusercontent.com/nupur-sng/CentralData/main/adult.csv')

# Source column names that contain dots; each gets its dots replaced by
# underscores in the rename below.
dotted_columns = [
    'education.num',
    'marital.status',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
    'native.country',
]

# Cells equal to '?' become the literal string 'NA' (an ordinary category
# value, not a pandas missing value), then the dotted names are normalised.
df = (
    data
    .replace('?', 'NA')
    .rename(columns={name: name.replace('.', '_') for name in dotted_columns})
)

# Features are every column except the target.
X = df.drop(columns='income')
# Target is 1 where income is exactly '>50K' and 0 for any other value.
y = df['income'].eq('>50K').astype('int64')

X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States


In [17]:
# Integer-encode every categorical (object-dtype) feature column of X.
#
# Column names and raw values are read from `df`, which this cell never
# modifies, instead of from `X`, which it overwrites. That keeps the cell
# idempotent: re-running it refits the same encoders rather than finding no
# object columns left in `X` and resetting `label_encoders` to an empty dict.
raw_features = df[X.columns]
categorical_cols = raw_features.select_dtypes(include=['object']).columns.tolist()

# One fitted LabelEncoder per categorical column, keyed by column name, so the
# string <-> integer mapping can be looked up (or inverted) later.
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(raw_features[col])
    # A value's position in classes_ is the integer code it was mapped to.
    print(label_encoders[col].classes_)

print(label_encoders)
X.head(10)

['Federal-gov' 'Local-gov' 'NA' 'Never-worked' 'Private' 'Self-emp-inc'
 'Self-emp-not-inc' 'State-gov' 'Without-pay']
['10th' '11th' '12th' '1st-4th' '5th-6th' '7th-8th' '9th' 'Assoc-acdm'
 'Assoc-voc' 'Bachelors' 'Doctorate' 'HS-grad' 'Masters' 'Preschool'
 'Prof-school' 'Some-college']
['Divorced' 'Married-AF-spouse' 'Married-civ-spouse'
 'Married-spouse-absent' 'Never-married' 'Separated' 'Widowed']
['Adm-clerical' 'Armed-Forces' 'Craft-repair' 'Exec-managerial'
 'Farming-fishing' 'Handlers-cleaners' 'Machine-op-inspct' 'NA'
 'Other-service' 'Priv-house-serv' 'Prof-specialty' 'Protective-serv'
 'Sales' 'Tech-support' 'Transport-moving']
['Husband' 'Not-in-family' 'Other-relative' 'Own-child' 'Unmarried' 'Wife']
['Amer-Indian-Eskimo' 'Asian-Pac-Islander' 'Black' 'Other' 'White']
['Female' 'Male']
['Cambodia' 'Canada' 'China' 'Columbia' 'Cuba' 'Dominican-Republic'
 'Ecuador' 'El-Salvador' 'England' 'France' 'Germany' 'Greece' 'Guatemala'
 'Haiti' 'Holand-Netherlands' 'Honduras' 'Hong

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,90,2,77053,11,9,6,7,1,4,0,0,4356,40,39
1,82,4,132870,11,9,6,3,1,4,0,0,4356,18,39
2,66,2,186061,15,10,6,7,4,2,0,0,4356,40,39
3,54,4,140359,5,4,0,6,4,4,0,0,3900,40,39
4,41,4,264663,15,10,5,10,3,4,0,0,3900,40,39
5,34,4,216864,11,9,0,8,4,4,0,0,3770,45,39
6,38,4,150601,0,6,5,0,4,4,1,0,3770,40,39
7,74,7,88638,10,16,4,10,2,4,0,0,3683,20,39
8,68,0,422013,11,9,0,10,1,4,0,0,3683,40,39
9,41,4,70037,15,10,4,2,4,4,1,0,3004,60,26


In [18]:
# Hold out 5% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in the train and test splits; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42, stratify=y)

# Standardise the (already integer-encoded) features, then fit a random forest.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Persist the fitted pipeline.
model_filename = 'adult_Income_Classifier.pkl'
joblib.dump(pipeline, model_filename)
print(f"Pipeline saved to {os.path.abspath(model_filename)}")

# The pipeline was trained on the integer codes produced by `label_encoders`,
# so raw categorical input must be encoded with those same fitted encoders
# before calling predict. Persist them next to the model so inference code
# does not have to rebuild the mapping by hand.
encoders_filename = 'adult_Income_Label_Encoders.pkl'
joblib.dump(label_encoders, encoders_filename)
print(f"Label encoders saved to {os.path.abspath(encoders_filename)}")

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1237
           1       0.76      0.58      0.66       392

    accuracy                           0.85      1629
   macro avg       0.82      0.76      0.78      1629
weighted avg       0.85      0.85      0.85      1629

Accuracy: 0.85451197053407
Pipeline saved to /content/adult_Income_Classifier.pkl
