In [142]:
import pandas as pd
import numpy as np
import os

df = pd.read_csv(r"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                 names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                         'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 
                         'hours_per_week', 'native_country', 'income_cat'])

In [130]:
# -- save (temporarily) so as to not hit up UCI website on each load
os.mkdir('tmp_data')

df.to_csv('tmp_data/Adult_data.csv')

# -- add to .GITIGNORE

In [131]:
feature_names = [col for col in df.columns if col != 'income_cat']


# -- split target from features (education_num is a recode of education)
X = df[feature_names].drop("education_num", axis=1)
y = df['income_cat']
del df

In [132]:
from sklearn.model_selection import train_test_split

# -- final_weight is similar for people with similar demographics and residing in the same state

# -- break FULL DATA into 10 quantiles (would group by state ... but don't have that data)
# -- ... using full data because this data is actually available prior to sampling/prediction
X['fnlwgt_cat10'] = pd.qcut(X['fnlwgt'], q=10, labels=[f"q{i}" for i in range(1, 11)])

# -- save object
final_weight_cats = X.groupby('fnlwgt_cat10')['fnlwgt'].agg(['min', 'max'])

# -- split train/test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=83458)

In [140]:
# -- fill all missing continuous fields with the median value within that quantile of final_weight

median_values = pd.DataFrame(index = X_train['fnlwgt_cat10'].unique())

for cat in X_train['fnlwgt_cat10'].unique():
    for col in ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']:
        value_to_replace = X_train.loc[X_train['fnlwgt_cat10']==cat][col].median()
        
        # -- save for testing/deployment
        median_values.loc[cat, col] = value_to_replace
        X_train.loc[(X_train[col].isnull()) & (X_train['fnlwgt_cat10']==cat), col] = value_to_replace
        
# -- combine final_weight catgories with median_values (save)
# .to_json(orient='index')
median_values = median_values.merge(final_weight_cats, left_index=True, right_index=True)
median_values = median_values.to_json(orient='index')

In [136]:
from sklearn.preprocessing import LabelEncoder

# -- convert categorical fields into numerical fields
encoders = {}
for col in ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 
            'race','sex','native_country', 'fnlwgt_cat10']:
    
    categorical_convert = LabelEncoder()
    X_train[col] = categorical_convert.fit_transform(X_train[col])
    encoders[col] = categorical_convert

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = categorical_convert.fit_transform(X_train[col])


In [137]:
from sklearn.ensemble import RandomForestClassifier

# -- train using random forest model
# -- (each node: take BOOTSTRAP sample and a random selection of features -> find BEST split, continue to depth)
rf = RandomForestClassifier(n_estimators = 100)
rf = rf.fit(X_train, y_train)

In [138]:
from sklearn.ensemble import ExtraTreesClassifier

# -- train using extra trees model
# -- (each node: use FULL sample and a random selection of features -> take RANDOM split, continue to depth)
et = ExtraTreesClassifier(n_estimators = 100)
et = et.fit(X_train, y_train)

In [141]:
import joblib

# -- save pre-processed data and trained models
joblib.dump(median_values, "./median_values.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./rf_model.joblib", compress=True)
joblib.dump(et, "./et_model.joblib", compress=True)

['./et_model.joblib']