### Determine if an individual makes >$50k based on Census data
Use the Census income dataset from the UC Irvine Machine Learning Repository to determine whether an individual makes more than $50k, based on a set of continuous and categorical features.

In [54]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

### Explore data

In [55]:
# Load the Census income CSV (the relative path is resolved against the current working directory)
df = pd.read_csv('../Data/income.csv')
# Preview the first rows to see which columns are available
df.head()

Unnamed: 0,age,sex,education,education-num,marital-status,workclass,occupation,hours-per-week,income,label
0,27,Male,HS-grad,9,Never-married,Private,Craft-repair,40,<=50K,0
1,47,Male,Masters,14,Married,Local-gov,Exec-managerial,50,>50K,1
2,59,Male,HS-grad,9,Divorced,Self-emp,Prof-specialty,20,<=50K,0
3,38,Female,Prof-school,15,Never-married,Federal-gov,Prof-specialty,57,>50K,1
4,64,Female,11th,7,Widowed,Private,Farming-fishing,40,<=50K,0


In [56]:
# How many rows fall into each target class?
df['label'].value_counts()
# The classes are imbalanced (roughly 72% <=50K vs 28% >50K), so plain accuracy should be
# compared against the ~72% majority-class baseline when evaluating the model

0    21700
1     8300
Name: label, dtype: int64

In [57]:
# Check whether 'education' and 'education-num' carry the same information
ed = df['education']
ed_num = df['education-num']
# Collect the distinct (label, number) pairs and order them by the numeric code
unique_combos = sorted(set(zip(ed, ed_num)), key=lambda pair: pair[1])
unique_combos
# Every education label pairs with exactly one education-num, so one of the two columns is redundant

[('5th-6th', 3),
 ('7th-8th', 4),
 ('9th', 5),
 ('10th', 6),
 ('11th', 7),
 ('12th', 8),
 ('HS-grad', 9),
 ('Some-college', 10),
 ('Assoc-voc', 11),
 ('Assoc-acdm', 12),
 ('Bachelors', 13),
 ('Masters', 14),
 ('Prof-school', 15),
 ('Doctorate', 16)]

In [58]:
# Which distinct occupations appear in the data?
df['occupation'].unique()
# Only a dozen values, so no consolidation or extra feature engineering is needed for this column

array(['Craft-repair', 'Exec-managerial', 'Prof-specialty',
       'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
       'Adm-clerical', 'Other-service', 'Sales', 'Tech-support',
       'Protective-serv', 'Transport-moving'], dtype=object)

In [59]:
# How many distinct ages are there?
df['age'].nunique()
# Many distinct values on a numeric scale, so age is treated as a continuous feature

72

In [60]:
# No real need to do feature engineering, so we can move on to separating categorical and continuous feats

### Split categorical and continuous cols

In [61]:
# Re-display the first rows as a reminder of the columns before splitting them by type
df.head()

Unnamed: 0,age,sex,education,education-num,marital-status,workclass,occupation,hours-per-week,income,label
0,27,Male,HS-grad,9,Never-married,Private,Craft-repair,40,<=50K,0
1,47,Male,Masters,14,Married,Local-gov,Exec-managerial,50,>50K,1
2,59,Male,HS-grad,9,Divorced,Self-emp,Prof-specialty,20,<=50K,0
3,38,Female,Prof-school,15,Never-married,Federal-gov,Prof-specialty,57,>50K,1
4,64,Female,11th,7,Widowed,Private,Farming-fishing,40,<=50K,0


In [62]:
# Column roles. The label column name gets its own variable so that `target`
# is only ever bound to the label tensor (never to a string).
target_col = 'label'
cat_cols = ['sex', 'education-num', 'marital-status', 'workclass', 'occupation']
cont_cols = ['age', 'hours-per-week']

# Change categorical cols to category dtype (the embedding-size cell reads df[col].cat.categories)
for col in cat_cols:
    df[col] = df[col].astype('category')

# Turn categorical cols into a numpy array of integer category codes, one column per feature
cats = np.stack([df[col].cat.codes.values for col in cat_cols], axis=1)

# pandas encodes missing values as code -1, which is not a valid nn.Embedding index
assert (cats >= 0).all(), 'categorical columns contain missing values (category code -1)'

# Convert to PyTorch tensor (int64 is the index dtype nn.Embedding expects)
cats = torch.tensor(cats, dtype=torch.int64)

# Do the same with the continuous cols
conts = np.stack([df[col].values for col in cont_cols], axis=1)
conts = torch.tensor(conts, dtype=torch.float)

# Reshape target col to be columnar: shape (n_rows, 1)
# NOTE(review): nn.CrossEntropyLoss expects 1-D class indices; flatten `target` before
# computing the loss if that criterion is used - confirm against the training cell
target = torch.tensor(df[target_col].values, dtype=torch.int64).reshape(-1,1)

### Create embeddings for categorical cols

In [63]:
# Count the distinct categories in each categorical feature
cat_szs = [len(df[name].cat.categories) for name in cat_cols]

# Embedding width per feature = half the category count, rounded up, capped at 50
emb_szs = list(zip(cat_szs, (min(50, (n_cats + 1) // 2) for n_cats in cat_szs)))
print(f'[(# of categories, # embeddings), ...] = {emb_szs}')

[(# of categories, # embeddings), ...] = [(2, 1), (14, 7), (6, 3), (5, 3), (12, 6)]


In [64]:
# Build one embedding table per categorical col from its (# of categories, # embeddings) pair
embeds = [nn.Embedding(*size_pair) for size_pair in emb_szs]
print(f'Embeddings = {embeds}')

Embeddings = [Embedding(2, 1), Embedding(14, 7), Embedding(6, 3), Embedding(5, 3), Embedding(12, 6)]


In [65]:
# Wrap the embeddings in a ModuleList so they are registered as submodules
embeds_mod_list = nn.ModuleList(embeds)
print(f'Embeddings ModuleList:\n{embeds_mod_list}')

# Each embedding holds a learnable weight vector per category. These weights are updated
# by the optimizer after .backward, allowing the model to learn a representation for each
# category. See the ny_taxis nb for an example of how the embeddings look for a subset of
# categorical col categories.

Embeddings ModuleList:
ModuleList(
  (0): Embedding(2, 1)
  (1): Embedding(14, 7)
  (2): Embedding(6, 3)
  (3): Embedding(5, 3)
  (4): Embedding(12, 6)
)


### Define Tabular model

In [89]:
class TabularModel(nn.Module):
    """Feed-forward network for tabular data with categorical and continuous features.

    Each categorical column is passed through its own embedding, the embeddings are
    concatenated and regularised with dropout, the continuous columns are batch-normalised,
    and the combined vector is fed through a stack of Linear -> ReLU -> BatchNorm -> Dropout
    blocks followed by a final linear output layer.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        """Initialize the tabular model

        :param emb_szs: List of tuples mapping # of categories and # of embeddings per categorical col
        :type emb_szs: list
        :param n_cont: # of continuous cols
        :type n_cont: int
        :param out_sz: Output size (number of units in the final linear layer)
        :type out_sz: int
        :param layers: Hidden layer size(s); may be empty, in which case the output layer is applied directly to the inputs
        :type layers: list
        :param p: Probability of dropout layer, defaults to 0.5
        :type p: float, optional
        """

        # Inherit from nn.Module class
        super().__init__()

        # Create embeddings for categorical cols
        self.embeds = nn.ModuleList([nn.Embedding(n_cats, n_embs) for n_cats,n_embs in emb_szs])
        # Create dropout layer applied to the concatenated embeddings
        self.emb_drop = nn.Dropout(p)
        # Normalize continuous data
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Establish sequence of NN layers. Each hidden block includes a linear fn, an
        # activation fn (ReLU), a normalization step and a dropout layer
        layerlist = []
        n_emb_total = sum(n_embs for _, n_embs in emb_szs)
        n_in = n_emb_total + n_cont

        # Iteratively build network based on layers param
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width

        # Define output linear layer. n_in is the width of the last hidden layer (or of the
        # raw input when `layers` is empty); the output width honours out_sz
        layerlist.append(nn.Linear(n_in, out_sz))

        # Combine list of layers
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run a forward pass

        :param x_cat: Categorical features as integer category codes, one column per embedding
        :type x_cat: torch.tensor
        :param x_cont: Continuous features, one column per continuous col
        :type x_cont: torch.tensor
        :return: Raw (un-normalised) outputs with out_sz values per row
        :rtype: torch.tensor
        """
        # Look up the embedding of each categorical column separately
        embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)]

        # Combine all embeddings into a single tensor per row, then apply embedding dropout
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        # Add continuous feats to the mix
        x_cont = self.bn_cont(x_cont)  # Normalize cont feats
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

### Create and train model

In [None]:
# Seed the RNG so weight initialisation (and dropout) is reproducible across runs
torch.manual_seed(33)

# TabularModel requires emb_szs, n_cont, out_sz and layers; calling it with no arguments
# raises TypeError. Two outputs = one logit per income class (<=50K, >50K).
# The hidden layer size and dropout probability are starting values to tune.
model = TabularModel(emb_szs, conts.shape[1], 2, [50], p=0.4)
model