In [1]:
import pandas as pd
import numpy as np

from IPython.display import display

from ml.data import process_data
from ml.model import train_model, compute_model_metrics, inference, calc_slice_performance
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw census CSV and preview its first rows.
df = pd.read_csv('./data/census.csv')
df.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Show each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlgt           32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,fnlgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
# Inspect the raw column labels; every label except 'age' has a leading space.
df.columns

Index(['age', ' workclass', ' fnlgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' salary'],
      dtype='object')

In [6]:
# The raw header pads labels with a leading space; strip it from every label
# so columns can be addressed by their plain names.
df.columns = df.columns.str.lstrip()
print(df.columns)

Index(['age', 'workclass', 'fnlgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')


In [7]:
# Names of the categorical (string-valued) feature columns in the census data.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

In [8]:
# Strip the leading space that the raw CSV leaves in front of every text value.
# The label column "salary" is cleaned together with the categorical features;
# otherwise its values stay as " <=50K" / " >50K" in the cleaned file.
# The vectorized .str accessor is used instead of apply(lambda ...) so that a
# missing value is passed through as NaN rather than raising AttributeError.
for col in [*cat_features, "salary"]:
    df[col] = df[col].str.lstrip()

In [9]:
# Show how many rows fall under each distinct value of every text (object) column.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    print('\n')
    counts = df.groupby([name])[name].count()
    display(counts)





workclass
?                    1836
Federal-gov           960
Local-gov            2093
Never-worked            7
Private             22696
Self-emp-inc         1116
Self-emp-not-inc     2541
State-gov            1298
Without-pay            14
Name: workclass, dtype: int64





education
10th              933
11th             1175
12th              433
1st-4th           168
5th-6th           333
7th-8th           646
9th               514
Assoc-acdm       1067
Assoc-voc        1382
Bachelors        5355
Doctorate         413
HS-grad         10501
Masters          1723
Preschool          51
Prof-school       576
Some-college     7291
Name: education, dtype: int64





marital-status
Divorced                  4443
Married-AF-spouse           23
Married-civ-spouse       14976
Married-spouse-absent      418
Never-married            10683
Separated                 1025
Widowed                    993
Name: marital-status, dtype: int64





occupation
?                    1843
Adm-clerical         3770
Armed-Forces            9
Craft-repair         4099
Exec-managerial      4066
Farming-fishing       994
Handlers-cleaners    1370
Machine-op-inspct    2002
Other-service        3295
Priv-house-serv       149
Prof-specialty       4140
Protective-serv       649
Sales                3650
Tech-support          928
Transport-moving     1597
Name: occupation, dtype: int64





relationship
Husband           13193
Not-in-family      8305
Other-relative      981
Own-child          5068
Unmarried          3446
Wife               1568
Name: relationship, dtype: int64





race
Amer-Indian-Eskimo      311
Asian-Pac-Islander     1039
Black                  3124
Other                   271
White                 27816
Name: race, dtype: int64





sex
Female    10771
Male      21790
Name: sex, dtype: int64





native-country
?                               583
Cambodia                         19
Canada                          121
China                            75
Columbia                         59
Cuba                             95
Dominican-Republic               70
Ecuador                          28
El-Salvador                     106
England                          90
France                           29
Germany                         137
Greece                           29
Guatemala                        64
Haiti                            44
Holand-Netherlands                1
Honduras                         13
Hong                             20
Hungary                          13
India                           100
Iran                             43
Ireland                          24
Italy                            73
Jamaica                          81
Japan                            62
Laos                             18
Mexico                          643
Nicaragua    





salary
 <=50K    24720
 >50K      7841
Name: salary, dtype: int64

In [10]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
df.to_csv('./data/clean_census.csv', index=False)

In [1]:
import pandas as pd
import numpy as np

from IPython.display import display

from ml.data import process_data
from ml.model import train_model, compute_model_metrics, inference, calc_slice_performance
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned census data from ./data/clean_census.csv.
df = pd.read_csv('./data/clean_census.csv')

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.20, random_state=42)

# Categorical feature columns handed to process_data for encoding.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Arguments shared by the training and test preprocessing calls.
process_kwargs = dict(categorical_features=cat_features, label="salary")

# Training pass: process_data returns the encoder and label binarizer it used.
x_train, y_train, encoder, lb = process_data(train, training=True, **process_kwargs)

# Test pass: reuse the encoder and label binarizer from the training pass.
x_test, y_test, _, _ = process_data(
    test, training=False, encoder=encoder, lb=lb, **process_kwargs
)

# Train a model on the processed training data (nothing is saved in this cell).
model = train_model(x_train, y_train)

In [8]:
# Check which estimator class train_model returned.
type(model)

sklearn.ensemble._forest.RandomForestClassifier

In [4]:
# Run the trained model on the processed test features.
preds = inference(model, x_test)

In [5]:
# Score the test predictions against the true labels; with display=True the
# f-beta, precision and recall values are shown in the cell output.
compute_model_metrics(y_test, preds, display=True)

f-beta: 0.6861313868613139
precision: 0.7557427258805512
recall: 0.6282622533418205


In [6]:
# Evaluate the model on slices of the test split, one slice per value of a
# categorical column; metrics for each slice appear in the cell output.
calc_slice_performance(
    data=test, model=model, cat_features=cat_features, encoder=encoder, lb=lb,
)


Categorical column: workclass	| Slice value: State-gov
f-beta: 0.7285714285714286
precision: 0.7611940298507462
recall: 0.6986301369863014

Categorical column: workclass	| Slice value: Self-emp-not-inc
f-beta: 0.5801526717557252
precision: 0.7238095238095238
recall: 0.4840764331210191

Categorical column: workclass	| Slice value: Federal-gov
f-beta: 0.7647058823529412
precision: 0.7878787878787878
recall: 0.7428571428571429

Categorical column: workclass	| Slice value: Self-emp-inc
f-beta: 0.7619047619047619
precision: 0.7787610619469026
recall: 0.7457627118644068

Categorical column: workclass	| Slice value: ?
f-beta: 0.46875000000000006
precision: 0.6818181818181818
recall: 0.35714285714285715

Categorical column: workclass	| Slice value: Without-pay
f-beta: 1.0
precision: 1.0
recall: 1.0

Categorical column: workclass	| Slice value: Local-gov
f-beta: 0.7298578199052131
precision: 0.7623762376237624
recall: 0.7

Categorical column: workclass	| Slice value: Private
f-beta: 0.68521549