# Implementing a CNN in PyTorch

In [1]:
import torch

# Dimensions of the toy input batch
batch_size = 3
one_hot_size = 10
sequence_width = 7

# Random stand-in for a batch of sequences laid out as
# (batch_size, one_hot_size, sequence_width)
data = torch.randn(batch_size, one_hot_size, sequence_width)

# First convolution: one_hot_size input channels -> 16 output channels
conv1 = torch.nn.Conv1d(in_channels=one_hot_size, out_channels=16, kernel_size=3)
intermediate1 = conv1(data)

# Show the size and contents of the input, then of the convolution output
print(data.size(), data, sep="\n")
print(intermediate1.size(), intermediate1, sep="\n")

torch.Size([3, 10, 7])
tensor([[[-0.6152,  0.2864, -1.1016, -0.8544,  0.7845, -0.6594,  0.4768],
         [-1.4123, -0.2553, -0.0138, -0.1109,  0.0127,  0.9263,  1.7683],
         [-0.3195,  0.0191,  1.1332,  0.6086, -2.4340,  0.3850,  0.6449],
         [ 0.9858, -1.3809, -0.4223,  0.7069,  0.5861,  0.1291, -0.2257],
         [ 1.4455, -0.6440, -0.2073,  0.1421, -1.0787,  2.3157,  0.8062],
         [ 0.6942,  0.1595,  0.9772, -1.9867,  1.5214,  0.3742, -0.2273],
         [ 1.5826,  0.1628,  1.3762, -1.1712,  0.8213,  0.8788,  1.0851],
         [ 2.0753,  0.0300,  1.1697, -0.4630,  0.3116, -1.6441,  0.2856],
         [ 1.4191, -0.7567,  1.1687,  0.1607, -0.0891,  1.4693,  2.2877],
         [ 0.0958, -0.2486, -0.3046,  0.9475,  0.4833,  0.1425, -2.4979]],

        [[-0.4362,  0.2476,  1.5053, -1.0689,  0.4496,  0.4616, -1.3539],
         [ 0.3971, -0.9727, -1.2485, -1.2355, -0.2908, -1.0833, -0.8490],
         [ 0.7678,  0.0232,  1.5907, -0.7328, -1.7800,  0.9238,  0.7948],
         [ 0.

In [3]:
# Two further convolutions stacked on the output of the first one;
# each layer's in_channels matches the previous layer's out_channels
conv2 = torch.nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3)
conv3 = torch.nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)

# Feed the earlier intermediate result through both layers in turn
intermediate2 = conv2(intermediate1)
intermediate3 = conv3(intermediate2)

# Show the size and contents after each of the two layers
print(intermediate2.size(), intermediate2, sep="\n")

print(intermediate3.size(), intermediate3, sep="\n")

torch.Size([3, 32, 3])
tensor([[[ 3.4884e-01, -1.1175e-02, -1.3128e-01],
         [-1.9049e-01, -6.8913e-03, -1.5062e-02],
         [-2.0335e-02, -1.5935e-01,  4.1573e-01],
         [-4.6385e-01, -3.5354e-02, -9.2693e-02],
         [-9.1045e-01,  7.2159e-01, -4.0178e-01],
         [-1.0951e-01, -3.2347e-01, -3.4278e-01],
         [ 5.4106e-01,  5.1844e-01,  4.1303e-01],
         [ 1.5835e-01, -1.0814e-01, -2.5527e-01],
         [-1.9036e-01,  1.1965e-01, -4.3349e-02],
         [ 6.6291e-02, -3.4554e-01,  1.7448e-02],
         [ 9.3084e-02,  8.6087e-02,  5.1484e-01],
         [ 1.8293e-01, -2.7182e-01,  5.7877e-01],
         [ 5.3491e-01, -4.7961e-01,  6.8677e-01],
         [-1.1597e-01,  1.3256e-01, -2.9553e-01],
         [ 5.7226e-02, -3.2718e-01,  5.3132e-02],
         [ 9.6140e-02, -1.4833e-01,  2.2031e-01],
         [ 9.0165e-02, -2.9594e-01,  5.0259e-01],
         [-5.0057e-01,  4.4018e-01, -2.3170e-01],
         [-2.4062e-01,  1.4620e-01, -4.5579e-01],
         [-4.4965e-01,  1.6

# Surname Classification

In [16]:
import numpy as np
from collections import Counter
import string
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader
from argparse import Namespace
import torch.optim as optim
from tqdm.notebook import tqdm

## Vocabulary

In [5]:
# create vocabulary class
class Vocabulary(object):
    """Two-way mapping between tokens and consecutive integer indices"""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): an existing token -> index mapping to start
                from; a new empty mapping is used when None
            add_unk (bool): whether to register `unk_token` in the vocabulary
            unk_token (str): the token whose index stands in for unseen tokens
        """
        self._token_to_idx = {} if token_to_idx is None else token_to_idx

        # reverse mapping, kept in sync by add_token
        self._idx_to_token = dict(
            (position, token) for token, position in self._token_to_idx.items()
        )

        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 means "no unknown token"; lookup_token checks for this value
        self.unk_index = self.add_token(unk_token) if add_unk else -1

    def to_serializeable(self):
        """Return a dictionary holding the constructor arguments of this vocabulary"""
        return dict(token_to_idx=self._token_to_idx,
                    add_unk=self._add_unk,
                    unk_token=self._unk_token)

    @classmethod
    def from_serializeable(cls, contents):
        """Build a vocabulary from a dictionary produced by to_serializeable"""
        return cls(**contents)

    def add_token(self, token):
        """Register a token (if it is new) and return its index"""
        if token not in self._token_to_idx:
            # new tokens receive the next free index
            position = len(self._token_to_idx)
            self._token_to_idx[token] = position
            self._idx_to_token[position] = token
        return self._token_to_idx[token]

    def lookup_token(self, token):
        """Return the index of a token

        Unseen tokens map to unk_index when an unknown token was registered;
        otherwise looking up an unseen token raises KeyError.
        """
        if self.unk_index < 0:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

    def lookup_index(self, index):
        """Return the token stored at an index; raise KeyError if there is none"""
        if index in self._idx_to_token:
            return self._idx_to_token[index]
        raise KeyError("the index %d is not in the vocabulary" % index)

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

## Vectorizer

In [6]:
class SurnameVectorizer(object):
    """Converts surnames into one-hot matrices using a character vocabulary"""

    def __init__(self, character_vocab, nationality_vocab, max_surname_length):
        """
        Args:
            character_vocab (Vocabulary): maps characters to row indices
            nationality_vocab (Vocabulary): maps nationalities to indices
            max_surname_length (int): number of columns of the one-hot matrix
        """
        self.character_vocab = character_vocab
        self.nationality_vocab = nationality_vocab
        self.max_surname_length = max_surname_length

    def vectorize(self, surname):
        """Create the one-hot matrix for a surname
        Args:
            surname (str): the surname
        Returns:
            one_hot_matrix (ndarray): float32 matrix of shape
                (len(character_vocab), max_surname_length); column i holds the
                one-hot encoding of the i-th character, and columns past the
                end of the surname stay all-zero
        """
        one_hot_matrix_size = (len(self.character_vocab), self.max_surname_length)
        one_hot_matrix = np.zeros(one_hot_matrix_size, dtype=np.float32)

        # The matrix width is fixed, so a surname longer than
        # max_surname_length is truncated instead of raising IndexError.
        clipped_surname = surname[:self.max_surname_length]
        for position_index, character in enumerate(clipped_surname):
            character_index = self.character_vocab.lookup_token(character)
            one_hot_matrix[character_index, position_index] = 1

        return one_hot_matrix

    @classmethod
    def from_dataframe(cls, dataframe, cutoff=25):
        """Instantiate a SurnameVectorizer from a dataframe
        Args:
            dataframe (pandas.DataFrame): must provide `surname` and
                `nationality` columns
            cutoff (int): currently unused; kept so existing callers that
                pass it keep working
        Returns:
            an instance of SurnameVectorizer
        """
        character_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)
        max_surname_length = 0

        for surname, nationality in zip(dataframe.surname, dataframe.nationality):
            # track the longest surname; it sets the one-hot matrix width
            max_surname_length = max(max_surname_length, len(surname))

            for letter in surname:
                character_vocab.add_token(letter)
            nationality_vocab.add_token(nationality)

        return cls(character_vocab, nationality_vocab, max_surname_length)

## Dataset

In [8]:
class SurnameDataset(Dataset):
    """Dataset over a surname dataframe with train/val/test splits"""

    def __init__(self, surname_df, vectorizer):
        """
        Args:
            surname_df (pandas.DataFrame): the dataset; its `split`,
                `surname` and `nationality` columns are read
            vectorizer (SurnameVectorizer): vectorizer used for each row
        """
        self.surname_df = surname_df
        self._vectorizer = vectorizer

        # one sub-frame per split, selected on the `split` column
        split_column = self.surname_df.split
        self.train_df = self.surname_df[split_column == 'train']
        self.val_df = self.surname_df[split_column == 'val']
        self.test_df = self.surname_df[split_column == 'test']

        self.train_size = len(self.train_df)
        self.val_size = len(self.val_df)
        self.test_size = len(self.test_df)

        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.val_size),
            'test': (self.test_df, self.test_size),
        }

        self.set_split('train')

        # Class weights to use with cross entropy: the inverse frequency of
        # each nationality, ordered by its index in the nationality vocabulary
        nationality_vocab = self._vectorizer.nationality_vocab
        count_by_nationality = surname_df.nationality.value_counts().to_dict()
        ordered_nationalities = sorted(count_by_nationality,
                                       key=nationality_vocab.lookup_token)
        frequencies = [count_by_nationality[name] for name in ordered_nationalities]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv, cuda=False):
        """Read the csv and build a dataset plus a freshly made vectorizer

        The vectorizer is built from the rows of the 'train' split only.
        Args:
            surname_csv (str): location of the csv file
            cuda (bool): not used by this method
        Returns:
            an instance of SurnameDataset
        """
        full_df = pd.read_csv(surname_csv)
        train_only_df = full_df[full_df.split == 'train']
        vectorizer = SurnameVectorizer.from_dataframe(train_only_df)
        return cls(full_df, vectorizer)

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset"""
        return self._vectorizer

    def set_split(self, split='train'):
        """Select which split ('train', 'val' or 'test') the dataset serves"""
        self._target_split = split
        target_df, target_size = self._lookup_dict[split]
        self._target_df = target_df
        self._target_size = target_size

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Fetch one data point of the active split
        Args:
            index (int): position of the data point within the active split
        Returns:
            a dict with the vectorized surname (x_data) and the
            nationality index (y_target)
        """
        row = self._target_df.iloc[index]
        vectorizer = self._vectorizer
        return {
            'x_data': vectorizer.vectorize(row.surname),
            'y_target': vectorizer.nationality_vocab.lookup_token(row.nationality),
        }

    def get_num_batches(self, batch_size):
        """Return how many full batches of batch_size fit in the active split"""
        return len(self) // batch_size

## Classifier using CNN

In [10]:
class SurnameClassifier(nn.Module):
    """CNN Classifier: four 1-d convolutions followed by a linear layer"""

    def __init__(self, initial_num_channels, num_classes, num_channels, dropout):
        """
        Args:
            initial_num_channels (int): number of channels of the input
            num_classes (int): size of the output prediction vector
            num_channels (int): number of channels used by every convolution
            dropout (float): dropout probability applied to the convolutional
                features before the linear layer
        """
        super(SurnameClassifier, self).__init__()

        self._dropout = dropout

        # convolution layer
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=initial_num_channels,
                      out_channels=num_channels,
                      kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels,
                      out_channels=num_channels,
                      kernel_size=3,
                      stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels,
                      out_channels=num_channels,
                      kernel_size=3,
                      stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels,
                      out_channels=num_channels,
                      kernel_size=3),
            nn.ELU()
        )

        # Registered as a sub-module so that train()/eval() switch it on and
        # off automatically; it has no parameters, so state dicts are unchanged.
        self.dropout = nn.Dropout(p=dropout)

        self.fc = nn.Linear(num_channels, num_classes)

    def forward(self, x_in, apply_softmax=False):
        """Forward pass the network given the x_in
        Args:
            x_in (torch.Tensor): input data Tensor
                with shape (batch_size, initial_num_channels, max_surname_length).
                The convolutions must reduce the last axis to length 1
                (this holds for a length of 17, for example); otherwise
                the linear layer receives a tensor of the wrong shape.
            apply_softmax (bool): whether to turn the scores into
                probabilities; should be False when the output is fed to a
                cross-entropy loss
        Returns:
            resulting tensor with shape (batch, num_classes)
        """
        # (batch, num_channels, 1) -> (batch, num_channels)
        features = self.convnet(x_in).squeeze(dim=2)

        # dropout is only active in training mode
        prediction_vector = self.fc(self.dropout(features))

        if apply_softmax:
            prediction_vector = torch.softmax(prediction_vector, dim=1)

        return prediction_vector

## Training Routine

In [14]:
# All settings of the experiment, gathered in a single namespace
args = Namespace(
    # --- data and file locations ---
    frequency_cutoff=25,
    model_state_file='/content/drive/My Drive/Colab Notebooks/Data/surname_dataset/model.pth',
    surname_csv='/content/drive/My Drive/Colab Notebooks/Data/surname_dataset/surnames_with_splits.csv',
    save_dir='/content/drive/My Drive/Colab Notebooks/Data/surname_dataset/',
    vectorizer_file='/content/drive/My Drive/Colab Notebooks/Data/surname_dataset/vectorizer.json',
    # --- model hyperparameters ---
    hidden_dim=500,
    num_channels=256,
    # --- training hyperparameters ---
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    cuda=True,
    dropout=0.1,
)

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    Generator that wraps a PyTorch DataLoader and moves every tensor of
    each batch dictionary to the requested device before yielding it.

    Args:
        dataset: the dataset handed to the DataLoader; its items must be dicts
        batch_size (int): number of data points per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device: target passed to `Tensor.to` for every tensor in a batch
    Yields:
        a dict with the same keys as the batch, holding the moved tensors
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

In [15]:
# create variables to record
# the training process
def make_train_state(args):
    """Return a fresh dictionary for recording the training process.

    Per-epoch histories start as empty lists, the epoch counter starts at 0
    and the test metrics start at -1. `args` is accepted but not read.
    """
    return dict(
        epoch_index=0,
        train_loss=[],
        train_acc=[],
        val_loss=[],
        val_acc=[],
        test_loss=-1,
        test_acc=-1,
    )

def compute_accuracy(y_pred, y_target):
    """Return the percentage of rows whose highest-scoring column equals the target.

    Args:
        y_pred (torch.Tensor): scores, one row per data point
        y_target (torch.Tensor): expected column index for each row
    """
    predicted_classes = y_pred.max(dim=1)[1]
    matches = torch.eq(predicted_classes, y_target)
    correct_count = matches.sum().item()
    return correct_count / len(predicted_classes) * 100

train_state = make_train_state(args)

# Use CUDA only when it was requested and is actually available
args.cuda = bool(args.cuda and torch.cuda.is_available())
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Device available ", args.device)

# dataset object; the class weights must live on the same device as the model
dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
dataset.class_weights = dataset.class_weights.to(args.device)

# vectorizer
vectorizer = dataset.get_vectorizer()

# classifier: one input channel per character of the vocabulary.
# The vectorizer stores that vocabulary as `character_vocab`.
classifier = SurnameClassifier(initial_num_channels=len(vectorizer.character_vocab),
                               num_classes=len(vectorizer.nationality_vocab),
                               num_channels=args.num_channels,
                               dropout=args.dropout)
classifier = classifier.to(args.device)

# loss function and optimizer
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

print("Input dim ", len(vectorizer.character_vocab))
print("Output dim ", len(vectorizer.nationality_vocab))

Device available  cpu


FileNotFoundError: [Errno 2] File /content/drive/My Drive/Colab Notebooks/Data/surname_dataset/surnames_with_splits.csv does not exist: '/content/drive/My Drive/Colab Notebooks/Data/surname_dataset/surnames_with_splits.csv'

In [None]:
# Create training loop
# One progress bar for the epochs and one per split for the batches.
epoch_bar = tqdm(desc='training routine',
                 total=args.num_epochs,
                 position=0)

# generate_batches drops the last partial batch by default, so the number of
# batches per epoch is exactly get_num_batches(batch_size).
dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                 total=dataset.get_num_batches(args.batch_size),
                 position=1,
                 leave=True)

dataset.set_split('val')
val_bar = tqdm(desc='split=val',
               total=dataset.get_num_batches(args.batch_size),
               position=1,
               leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over the training split
        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset=dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)

        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1 zero the gradients
            optimizer.zero_grad()

            # step 2 compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])

            # step 3 compute the loss and fold it into the running mean
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)

            # step 4 use loss to produce gradients
            loss.backward()

            # step 5 use optimizer to take the gradient step
            optimizer.step()

            # step 6 compute the accuracy and fold it into the running mean
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        # one entry per epoch
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        # setup: batch generator, set loss and acc to 0, set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)

        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        # No gradients are needed for validation
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):
                # step 1. compute the output
                y_pred = classifier(x_in=batch_dict['x_data'])

                # step 2. compute the loss
                loss = loss_func(y_pred, batch_dict['y_target'])
                loss_batch = loss.item()
                running_loss += (loss_batch - running_loss) / (batch_index + 1)

                # step 3. compute the accuracy
                acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
                running_acc += (acc_batch - running_acc) / (batch_index + 1)

                val_bar.set_postfix(loss=running_loss, acc=running_acc,
                                    epoch=epoch_index)
                val_bar.update()

        # Record the validation metrics once per epoch, after the whole
        # split has been seen, so the history lines up with the train history.
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        # rewind the per-split bars for the next epoch
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")