# Adult Income Dataset

Predict whether a person's annual income exceeds $50K, based on census data.

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [39]:
# Load the census records from the CSV at the relative path ../data/adult.csv.
df = pd.read_csv(filepath_or_buffer='../data/adult.csv')
# Show the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [40]:
# Summarise column dtypes and non-null counts of the raw frame before any preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


**Feature Engineering**

In [41]:
# Columns treated as categorical features (later replaced by integer category codes).
cats = [
    'workclass',
    'education.num',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

# Columns treated as continuous numeric features.
conts = [
    'age',
    'fnlwgt',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]

# Name of the label column to predict.
target = 'income'

In [42]:
# NOTE(review): the frame has not been modified since the previous df.info() call,
# so this prints the same summary again; the cell looks redundant.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [43]:
# Fold the '?' and 'Never-worked' labels into 'Without-pay' so that workclass
# ends up with a single bucket for these three values.
df['workclass'] = df['workclass'].mask(
    df['workclass'].isin(['?', 'Without-pay', 'Never-worked']),
    'Without-pay',
)
print(df['workclass'].value_counts())

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
Without-pay          1857
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Name: workclass, dtype: int64


In [44]:
# Unite the categories 'Married-AF-spouse' and 'Married-spouse-absent':
# both end up labelled 'Married-spouse-absent'.
df['marital.status'] = df['marital.status'].mask(
    df['marital.status'] == 'Married-AF-spouse',
    'Married-spouse-absent',
)
print(df['marital.status'].value_counts())

Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      441
Name: marital.status, dtype: int64


In [45]:
# Keep 'United-States' as is and collapse every other country into 'other'.
df['native.country'] = df['native.country'].where(
    df['native.country'] == 'United-States',
    'other',
)
print(df['native.country'].value_counts())

United-States    29170
other             3391
Name: native.country, dtype: int64


In [46]:
# Convert the label column to pandas' category dtype.
df = df.astype({target: 'category'})

**Replace categorical values in the DataFrame with their category codes**

In [47]:
# Check dtypes after the target conversion: only the target column is category dtype
# at this point, the categorical feature columns are still object/int64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             32561 non-null  int64   
 1   workclass       32561 non-null  object  
 2   fnlwgt          32561 non-null  int64   
 3   education       32561 non-null  object  
 4   education.num   32561 non-null  int64   
 5   marital.status  32561 non-null  object  
 6   occupation      32561 non-null  object  
 7   relationship    32561 non-null  object  
 8   race            32561 non-null  object  
 9   sex             32561 non-null  object  
 10  capital.gain    32561 non-null  int64   
 11  capital.loss    32561 non-null  int64   
 12  hours.per.week  32561 non-null  int64   
 13  native.country  32561 non-null  object  
 14  income          32561 non-null  category
dtypes: category(1), int64(6), object(8)
memory usage: 3.5+ MB


In [48]:
# Replace every categorical feature with the integer code of its category.
for name in cats:
    df[name] = df[name].astype('category').cat.codes.to_numpy()

# The target column was already converted to category dtype in an earlier cell,
# so only its codes need to be extracted here.
df[target] = df[target].cat.codes.to_numpy()

In [49]:
# Drop the textual 'education' column; presumably 'education.num' (kept in `cats`)
# carries the same information -- TODO confirm.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: the original
# in-place drop raised KeyError on a second execution because the column was gone.
df = df.drop(columns='education', errors='ignore')
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,6,77053,8,5,0,1,4,0,0,4356,40,0,0
1,82,2,132870,8,5,4,1,4,0,0,4356,18,0,0
2,66,6,186061,9,5,0,4,2,0,0,4356,40,0,0
3,54,2,140359,3,0,7,4,4,0,0,3900,40,0,0
4,41,2,264663,9,4,10,3,4,0,0,3900,40,0,0
5,34,2,216864,8,0,8,4,4,0,0,3770,45,0,0
6,38,2,150601,5,4,1,4,4,1,0,3770,40,0,0
7,74,5,88638,15,3,10,2,4,0,0,3683,20,0,1
8,68,0,422013,8,0,10,1,4,0,0,3683,40,0,0
9,41,2,70037,9,3,3,4,4,1,0,3004,60,1,1


**Steps missing:**
1. Normalize continuous variables
2. Get train, test split dataloaders
3. Prepare embeddings
4. Define model
5. Train and choose hyperparameters
6. Test