In [7]:
#import packages
import json #save preprocessing details
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import opendatasets as od
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects


In [10]:
#import datasets from kaggle
od.download("https://www.kaggle.com/datasets/wenruliu/adult-income-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: pipingure
Your Kaggle Key: ········
Downloading adult-income-dataset.zip to ./adult-income-dataset


100%|█████████████████████████████████████████████████████████| 652k/652k [00:00<00:00, 1.09MB/s]







In [17]:
file = ("adult-income-dataset/\
adult.csv")

In [18]:
df = pd.read_csv(file)

In [19]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [21]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [22]:
#missing values in the data
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [23]:
df["workclass"].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [26]:
df["income"].values

array(['<=50K', '<=50K', '>50K', ..., '<=50K', '<=50K', '>50K'],
      dtype=object)

In [27]:
#set input matrix values
X = df[['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']]

In [28]:
#set the target variable
y = df["income"]

In [29]:
#split the set into training and testing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=4)

In [30]:
#Random forest doesn't handle missing values 
#fill missing values with mode

train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(train_mode)
print(train_mode)

{'age': 33.0, 'workclass': 'Private', 'fnlwgt': 125892, 'education': 'HS-grad', 'educational-num': 9.0, 'marital-status': 'Married-civ-spouse', 'occupation': 'Prof-specialty', 'relationship': 'Husband', 'race': 'White', 'gender': 'Male', 'capital-gain': 0.0, 'capital-loss': 0.0, 'hours-per-week': 40.0, 'native-country': 'United-States'}


In [36]:
#converting categorical columns to numerical

encoders = {}

for column in ['workclass', 'education', 'marital-status',
                'occupation', 'relationship', 'race',
                'gender','native-country']:
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = categorical_convert

In [37]:
#train the Random forest model
model = RandomForestClassifier(n_estimators=100)
model = model.fit(X_train,y_train)

In [38]:
#fit the extra trees model
extra = ExtraTreesClassifier(n_estimators = 100)
extra = extra.fit(X_train,y_train)

In [39]:
# save preprocessing objects and RF algorithm
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(model, "./random_forest.joblib", compress=True)
joblib.dump(extra, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']