# ANN for Classification

I want to use embedding model to handel categorical features for a binary classification problem. see <a href='https://www.kaggle.com/c/cat-in-the-dat/'>Kaggle competition</a>.

In [1]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../Data/cat.csv')
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [3]:
# Clean data
df = df.drop('id', axis=1)

cats = list(df.columns.values)
cats.remove('target')

for cat in cats:
    df[cat] = df[cat].astype('category')
    
# This will set embedding sizes for Hours, AMvsPM and Weekdays
cat_szs = [len(df[col].cat.categories) for col in cats]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
print(emb_szs)

[(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (3, 2), (6, 3), (6, 3), (6, 3), (4, 2), (222, 50), (522, 50), (1220, 50), (2215, 50), (11981, 50), (3, 2), (5, 3), (6, 3), (15, 8), (26, 13), (192, 50), (7, 4), (12, 6)]


## Select features with low cardiality

In [4]:
drops =[]
for col, emb in zip(cats, emb_szs):
    if emb[1]>10:
        drops.append(col)
print(drops)
cats = [x for x in cats if x not in drops]
print(cats)
        
# This will set embedding sizes for Hours, AMvsPM and Weekdays
cat_szs = [len(df[col].cat.categories) for col in cats]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
print(emb_szs)

['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_4', 'ord_5']
['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'ord_0', 'ord_1', 'ord_2', 'ord_3', 'day', 'month']
[(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (3, 2), (6, 3), (6, 3), (6, 3), (4, 2), (3, 2), (5, 3), (6, 3), (15, 8), (7, 4), (12, 6)]


In [5]:
for cat in cats:
    df[cat] = df[cat].cat.codes.values

In [6]:
np.mean(df.target)

0.30588

In [7]:
from torch.utils import data

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, dt, target, cats=None, IDs=None, size=None, seed=123):
        'Initialization'
        if IDs is None:
            np.random.seed(seed)
            self.ids = list(np.where(np.random.rand(dt.shape[0]) < size)[0])
            print('len of dataset: '+str(len(self.ids)))
        else:
            self.ids = IDs
        
        self.target = torch.from_numpy(dt[target][self.ids].values.astype(np.int64)).flatten() #.reshape(-1,1))
        dt = dt.drop(target, axis=1)
        if cats is not None:
            self.cats  = torch.from_numpy(dt[cats].iloc[self.ids].values.astype(np.int64))
            dt = dt.drop(cats, axis=1)
        if dt.shape[0] > 0:
            self.data  = torch.from_numpy(dt.iloc[self.ids].values.astype(np.float32))
        
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.ids)
    
    def __getitem__(self, index):
        'Generates one sample of data'
#        print(index, self.data[index])

        return self.data[index], self.cats[index], self.target[index]

In [8]:
class TabularModel(nn.Module):

    def __init__(self, emb_szs, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        
        layerlist = []
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_emb
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x = self.layers(x)
        return x

In [9]:
# Parameters
params = {'batch_size': 10000,
          'shuffle': False,
          'num_workers': 6}

np.random.seed(123)
size = 0.5
rand = np.random.rand(df.shape[0])
train_ids = list(np.where(rand < size)[0])
test_ids = list(np.where(rand >= size)[0])

print(len(train_ids), len(test_ids))

training_set = Dataset(df[cats+['target']], 'target', cats=cats, IDs=train_ids)
training_generator = data.DataLoader(training_set, **params)

testing_set = Dataset(df[cats+['target']], 'target', cats=cats, IDs=test_ids)
testing_generator = data.DataLoader(training_set, **params)

149760 150240


In [10]:
torch.manual_seed(33)
model = TabularModel(emb_szs, 2, [10], p=0.4)
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(2, 1)
    (1): Embedding(2, 1)
    (2): Embedding(2, 1)
    (3): Embedding(2, 1)
    (4): Embedding(2, 1)
    (5): Embedding(3, 2)
    (6): Embedding(6, 3)
    (7): Embedding(6, 3)
    (8): Embedding(6, 3)
    (9): Embedding(4, 2)
    (10): Embedding(3, 2)
    (11): Embedding(5, 3)
    (12): Embedding(6, 3)
    (13): Embedding(15, 8)
    (14): Embedding(7, 4)
    (15): Embedding(12, 6)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=44, out_features=10, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=10, out_features=2, bias=True)
  )
)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [12]:
import time
from sklearn import metrics
start_time = time.time()

epochs = 100
losses1 = []
losses2 = []

for i in range(epochs):
    i+=1
    
    for continuous, categoricals, target in training_generator:
        y_pred = model(categoricals)
        loss = torch.sqrt(criterion(y_pred, target))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    
    with torch.set_grad_enabled(False):
        y_pred = model(training_set.cats)
        loss = torch.sqrt(criterion(y_pred, training_set.target))
        losses1.append(loss)
        
        y_pred = model(testing_set.cats)
        loss = torch.sqrt(criterion(y_pred, testing_set.target))
        losses2.append(loss)
        
        y_pred = pred = torch.nn.functional.softmax(y_pred, dim=1).data.numpy()
        y_true = testing_set.target.data.numpy()
        auc = metrics.roc_auc_score(y_true, y_pred[:,1])
    
    # a neat trick to save screen space:
    if i%2 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}  auc: {auc:10.8f}')

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

epoch:   1  loss: 0.87303334  auc: 0.55625808
epoch:   3  loss: 0.84311676  auc: 0.58077575
epoch:   5  loss: 0.82545984  auc: 0.59362099
epoch:   7  loss: 0.81419414  auc: 0.59871633
epoch:   9  loss: 0.80374485  auc: 0.60800365
epoch:  11  loss: 0.79543352  auc: 0.61445643
epoch:  13  loss: 0.78801525  auc: 0.62219480
epoch:  15  loss: 0.78267097  auc: 0.62476562
epoch:  17  loss: 0.77754110  auc: 0.63200862
epoch:  19  loss: 0.77440000  auc: 0.63501082
epoch:  21  loss: 0.77129209  auc: 0.64194526
epoch:  23  loss: 0.76949745  auc: 0.64503570
epoch:  25  loss: 0.76781851  auc: 0.64877689
epoch:  27  loss: 0.76625645  auc: 0.65258500
epoch:  29  loss: 0.76502997  auc: 0.65629568
epoch:  31  loss: 0.76391059  auc: 0.65933735
epoch:  33  loss: 0.76343554  auc: 0.66017238
epoch:  35  loss: 0.76288426  auc: 0.66202923
epoch:  37  loss: 0.76204658  auc: 0.66511636
epoch:  39  loss: 0.76139408  auc: 0.66678512
epoch:  41  loss: 0.76104212  auc: 0.66809907
epoch:  43  loss: 0.76030177  auc:

### use one-hot encoding

In [76]:
from torch.utils import data

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, dt, target, cats=None, IDs=None, size=None, seed=123):
        'Initialization'
        if IDs is None:
            np.random.seed(seed)
            self.ids = list(np.where(np.random.rand(dt.shape[0]) < size)[0])
            print('len of dataset: '+str(len(self.ids)))
        else:
            self.ids = IDs
        
        self.target = torch.from_numpy(dt[target][self.ids].values.astype(np.int64)).flatten() #.reshape(-1,1))
        dt = dt.drop(target, axis=1)
        if cats is not None:
            self.cats  = torch.from_numpy(dt[cats].iloc[self.ids].values.astype(np.int64))
            dt = dt.drop(cats, axis=1)
        if dt.shape[0] > 0:
            self.data  = torch.from_numpy(dt.iloc[self.ids].values.astype(np.float32))
        
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.ids)
    
    def __getitem__(self, index):
        'Generates one sample of data'
#        print(index, self.data[index])

        return self.data[index], self.cats[index], self.target[index]

In [150]:
class TabularModel(nn.Module):

    def __init__(self, emb_szs, out_sz, layers, p=0.5):
        super().__init__()
        
        layerlist = []
        n_emb = sum((ni for ni,nf in emb_szs))
        n_in = n_emb
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat):
        
        x = torch.nn.functional.one_hot(x_cat[:,0])
        for i in range(1,x_cat.size()[1]):
            x = torch.cat([x, torch.nn.functional.one_hot(x_cat[:,i])], 1)
        x = x.type(torch.FloatTensor)
        
        bt = nn.BatchNorm1d(x.size()[1])
        x = bt(x)
        x = self.layers(x)
        return x

In [151]:
# Parameters
params = {'batch_size': 1000,
          'shuffle': False,
          'num_workers': 6}

np.random.seed(123)
size = 0.5
rand = np.random.rand(df.shape[0])
train_ids = list(np.where(rand < size)[0])
test_ids = list(np.where(rand >= size)[0])

print(len(train_ids), len(test_ids))

training_set = Dataset(df[cats+['target']], 'target', cats=cats, IDs=train_ids)
training_generator = data.DataLoader(training_set, **params)

testing_set = Dataset(df[cats+['target']], 'target', cats=cats, IDs=test_ids)
testing_generator = data.DataLoader(training_set, **params)

149760 150240


In [152]:
torch.manual_seed(33)
model = TabularModel(emb_szs, 2, [200, 100], p=0.2)
model

TabularModel(
  (layers): Sequential(
    (0): Linear(in_features=83, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=100, out_features=2, bias=True)
  )
)

In [153]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [154]:
import time
from sklearn import metrics
start_time = time.time()

epochs = 10
losses1 = []
losses2 = []

for i in range(epochs):
    i+=1
    
    for continuous, categoricals, target in training_generator:
        y_pred = model(categoricals)
        loss = torch.sqrt(criterion(y_pred, target))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    
    with torch.set_grad_enabled(False):
        y_pred = model(training_set.cats)
        loss = torch.sqrt(criterion(y_pred, training_set.target))
        losses1.append(loss)
        
        y_pred = model(testing_set.cats)
        loss = torch.sqrt(criterion(y_pred, testing_set.target))
        losses2.append(loss)
        
        y_pred = pred = torch.nn.functional.softmax(y_pred, dim=1).data.numpy()
        y_true = testing_set.target.data.numpy()
        auc = metrics.roc_auc_score(y_true, y_pred[:,1])
    
    # a neat trick to save screen space:
    if i%2 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}  auc: {auc:10.8f}')

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

epoch:   1  loss: 0.75473785  auc: 0.69374832
epoch:   3  loss: 0.74602073  auc: 0.70898513
epoch:   5  loss: 0.74557894  auc: 0.71006836
epoch:   7  loss: 0.74558628  auc: 0.70974565
epoch:   9  loss: 0.74656123  auc: 0.70786678
epoch:  10  loss: 0.74645495

Duration: 96 seconds


In [155]:
rows = 5
correct = 0
y_val = model(testing_set.cats)
print(f'{"MODEL OUTPUT":26} ARGMAX  Y_TEST')
for i in range(rows):
    print(f'{str(y_val[i]):26} {y_val[i].argmax():^7}{y_true[i]:^7}')
    if y_val[i].argmax().item() == y_true[i]:
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')

MODEL OUTPUT               ARGMAX  Y_TEST
tensor([ 0.1216, -0.5535], grad_fn=<SelectBackward>)    0      0   
tensor([ 0.4670, -0.2198], grad_fn=<SelectBackward>)    0      1   
tensor([ 0.1880, -0.9448], grad_fn=<SelectBackward>)    0      0   
tensor([-0.4727, -0.3521], grad_fn=<SelectBackward>)    1      0   
tensor([ 0.4123, -0.5494], grad_fn=<SelectBackward>)    0      0   

3 out of 5 = 60.00% correct


## one hot all data

In [126]:
dummies = pd.get_dummies(df,
                         columns=df.columns,
                         drop_first=True,
                         sparse=True)

In [128]:
dummies.shape

(300000, 16439)