In [1]:
import json  # will be needed for saving preprocessing details
import numpy as np  # for data manipulation
import pandas as pd  # for data manipulation
from sklearn.model_selection import train_test_split  # will be used for data split
from sklearn.preprocessing import LabelEncoder  # for preprocessing
from sklearn.ensemble import RandomForestClassifier  # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier  # for training the algorithm
import joblib  # for saving algorithm and preprocessing objects

In [2]:
# Load the dataset from GitHub. skipinitialspace=True makes the parser skip
# the space that follows each delimiter.
df = pd.read_csv(
    'https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
    skipinitialspace=True,
)

# Split the frame into the input matrix (every column except the target)
# and the target column 'income'.
x_cols = [name for name in df.columns if name != 'income']
X = df[x_cols]
y = df['income']

# Preview the last rows of the data (this is tail(), not head()).
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [3]:
# Hold out 10% of the rows as the test set. The fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0)

In [4]:
# Impute missing values with the most frequent value of each column.
# The modes are taken from the training split only (first row of mode())
# and kept in `train_mode`, keyed by column name.
train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(value=train_mode)
print(train_mode)

{'age': 36, 'workclass': 'Private', 'fnlwgt': 164190, 'education': 'HS-grad', 'education-num': 9, 'marital-status': 'Married-civ-spouse', 'occupation': 'Craft-repair', 'relationship': 'Husband', 'race': 'White', 'sex': 'Male', 'capital-gain': 0, 'capital-loss': 0, 'hours-per-week': 40, 'native-country': 'United-States'}


In [5]:
# Encode each categorical column as integer labels, keeping one fitted
# LabelEncoder per column in `encoders` (keyed by column name).
# NOTE: X_train is overwritten in place, so this cell is not safe to re-run
# on its own -- a second run would fit the encoders on the already-encoded
# integers instead of the original category strings.
encoders = {}
for column in (
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
):
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = categorical_convert

In [6]:
# Train the Random Forest classifier on the preprocessed training split.
# random_state is pinned (same seed as the train/test split) so that a fresh
# Restart & Run All rebuilds the same forest instead of a different model on
# every run. fit() returns the fitted estimator, so it can be chained.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

In [7]:
# Train the Extra Trees classifier on the preprocessed training split.
# random_state is pinned (same seed as the train/test split) so that a fresh
# Restart & Run All rebuilds the same ensemble instead of a different model
# on every run. fit() returns the fitted estimator, so it can be chained.
et = ExtraTreesClassifier(random_state=0).fit(X_train, y_train)

In [8]:
# Save the preprocessing objects (imputation values and the fitted label
# encoders) and both trained models (Random Forest and Extra Trees).
# Each artifact is written compressed to the current working directory.
# The return value of the final dump() call is what the notebook displays
# as this cell's output.
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']