### Mike Ogrysko
### CS 766 Information Retrieval and Natural Language Processing

Build classifier to distinguish between English and Scottish surnames
- Use surnames dataset
- PyTorch RNN with 10-fold cross validation performance
- Tune by setting WEIGHTS to None
- Set performance metric to F1-score and compare results


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from os import listdir, path
import itertools
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [1]:
PATH_DATA = 'surnames/'

# Size of the longest surname, T
SEQ_SIZE = 20

LANGS = ('English', 'Scottish')

LANGS_CAT = dict(zip(LANGS, range(len(LANGS))))

**PyTorch RNN with 10-fold cross validation performance**

In [4]:
# Letter index 0 is the padding value, i.e. padding to fill up the vector to SEQ_SIZE, necessary for batched
# Note that eventually we will use torch Tensor to represent these fixed length sequences
LetterVocabulary, LetterVocabularyIndex, Index2Voc, Sequences = {' ':0}, 1, {0:' '}, {}
for fn in sorted([_ for _ in listdir(PATH_DATA) if _.endswith('.txt')]):
    lang, seqs = path.splitext(path.basename(fn))[0], []

    if lang not in LANGS:  # test case
        continue

    with open(path.join(PATH_DATA, fn), 'r', encoding="utf8") as fin:
        for row in fin.read().splitlines():
            seq = np.zeros(SEQ_SIZE, dtype=np.int32)
            for i, letter in enumerate(row.lower()):  # Convert the surname to lower case
#            for i, letter in enumerate(row):
                if i < SEQ_SIZE:
                    if letter not in LetterVocabulary:
                        LetterVocabulary[letter] = LetterVocabularyIndex
                        Index2Voc[LetterVocabularyIndex] = letter
                        LetterVocabularyIndex += 1
                    seq[i] = LetterVocabulary[letter]
            seqs += [seq]
    Sequences[lang] = seqs


In [5]:
# Sanity
N = sum([len(Sequences[_]) for _ in Sequences])

T = Sequences['English'][0].shape[0]

C = len(np.unique(Sequences.keys())[0])

print(N, T, C)

3768 20 2


In [6]:
# Pool all sequences and all languages
Seqs = [Sequences[LANGS[_]] for _ in range(C)]
Seqs = list(itertools.chain(*Seqs))

# Number of features is the number of unique characters
M = np.max(Seqs)
print(f'M= {M}')

M= 26


In [7]:
# Apriori class balance, i.e. inverse probability of the class
nk = np.array([len(Sequences[LANGS[_]]) for _ in range(C)], dtype=np.float32)
nk = (N/nk)
nk = nk/nk.sum()

# Class weights, inverse apriori probability
WEIGHTS = torch.tensor(nk, dtype=torch.float32)

print(WEIGHTS)

tensor([0.0265, 0.9735])


In [8]:
# Ground truth
y = [[_]*len(Sequences[LANGS[_]]) for _ in range(C)]
y = np.array(list(itertools.chain(*y)))

In [9]:
# One-hot encode every position of the sequence
X = np.empty((N,M))
n = 0
for lang in Sequences.keys():
    for seq in Sequences[lang]:
        sxx = np.zeros((M,), dtype=np.float32)
        for i in range(SEQ_SIZE):  # for the duration of the signal
            if seq[i] > 0:
                sxx[seq[i]-1] = 1
        X[n] = sxx
        n += 1

In [10]:
# One-hot encode every position of the sequence
# List of sequence, language tuples for easy shuffling
def get_Xy():
    Xy = []
    for lang in Sequences.keys():
        for seq in Sequences[lang]:
            T = SEQ_SIZE  # necessary for batched
            sxx = np.zeros((T, M))
            for i in range(T):  # for the duration of the signal
                if seq[i] > 0:
                    sxx[i, seq[i]-1] = 1
            Xy += [(torch.tensor(sxx, dtype=torch.float32),
                    torch.tensor([LANGS_CAT[lang]], dtype=torch.int64))]
    return Xy

# Helper functions
def get_X(_Xy):
    return [_[0] for _ in _Xy]
    
def get_y(_Xy):
    return [int(_[1].data[0]) for _ in _Xy]

# Sanity
Xy = get_Xy()
print(len(Xy))

# printing the confusion matrix below
def get_cm(_y, _p):
    from sklearn.metrics import confusion_matrix
    import pandas as pd

    cm = confusion_matrix(_y, _p, labels=list(range(len(LANGS))))
    display(pd.DataFrame(cm, index=[_[:5] for _ in LANGS], columns=[_[:5] for _ in LANGS]))

3768


In [11]:
# Set the GPU to device 0
# gpu = torch.device('cuda:0')
#gpu = torch.device('mps') for mac
gpu = torch.device('cpu')

class My_RNN(nn.Module):
    
    def __init__(self, n_hidden, n_hid_layers=1, epochs=10, eta=0.0005, batch_size=100, weight=None, info=True):
        """ A PyTorch neural network model based on RNN cell, batched """
        super(My_RNN, self).__init__()

        self.n_hidden= n_hidden  # hidden layer size
        self.n_hid_layers= n_hid_layers  # number of hidden layers
        self.epochs= epochs  # number of learning iterations
        self.eta= eta  # learning rate
        self.B= batch_size  # size of training batch - 1 would not work
        self.info= info  # debug info
        
        self.rnn, self.outlayer = None, None

        self.softmax = nn.LogSoftmax(dim=1)
        # loss function, since the last layer is nn.LogSoftmax
        self.criterion = nn.NLLLoss(weight=weight)

    def forward(self, _X, _h0):
        output, hn = self.rnn(_X, _h0)
        output = self.outlayer(output[:, -1, :])  # output is batched
        output = self.softmax(output)
        return output, hn
    
    def init_cell(self, _M):  # Create variations of our RNN by overriding init_cell
        dropout = 0.2 if self.n_hid_layers > 1 else 0
        return nn.RNN(_M, self.n_hidden, self.n_hid_layers,
                      nonlinearity='relu',
                      bias=False, batch_first=True, dropout=dropout)

    def init_hidden(self, _B):  # batch_first = True
        return torch.zeros(self.n_hid_layers, _B, self.n_hidden).to(gpu)  # Extra dimension - batch

    def fit(self, _Xy):
        from random import shuffle
        import sys
        import torch.optim as optim

        M= _Xy[0][0].shape[1]  # number of features, based on batch input
        C= np.unique([int(_[1].data[0]) for _ in _Xy]).shape[0]  # number of class labels

        self.rnn = self.init_cell(M).to(gpu)
        self.outlayer = nn.Linear(self.n_hidden, C).to(gpu)
        
        self.optimizer = optim.Adam(self.parameters(), lr=self.eta)
        
        for i in range(self.epochs):
            # Shuffle the input to randomly interleave classes, note that they are tuples, i.e. (x, y)
            shuffle(_Xy)

            N = len(_Xy)
            L, totloss = 0, 0

            while L < N-self.B:
                sxx = torch.stack([_[0] for _ in _Xy[L:L+self.B]]).to(gpu)
                y = torch.tensor([_[1] for _ in _Xy[L:L+self.B]], dtype=torch.int64).to(gpu)
                output, loss = self.train_signal(sxx, y, self.B)
                
                totloss += loss
                L += self.B
                
                if self.info:
                    sys.stderr.write(f"\r{i+1:03d}/{self.epochs:4d} | Loss: {loss:6.2f} | "
                                     f"Avg loss: {totloss/(i+1):6.2f} | {y.data.tolist()[0]}")
                    sys.stderr.flush()
    
    def train_signal(self, _sxx, _y, _B):
        h0 = self.init_hidden(_B)
        self.optimizer.zero_grad()

        output, hn = self.forward(_sxx, h0)

        loss = self.criterion(output, _y)
        loss.backward()
        self.optimizer.step()
        return output, loss.item()

    def predict(self, _sxx):  # Tensor dimensions: B x T x M
        _sxx = torch.stack(_sxx)
        with torch.no_grad():
            h0 = self.init_hidden(_sxx.shape[0])  # reset the hidden layer
            output, hn = self.forward(_sxx.to(gpu), h0)

        p_values, indices = output.max(dim=1)
        return indices.to('cpu')


# Info about the RNN
print(My_RNN(10, n_hid_layers=1, eta=0.001))

My_RNN(
  (softmax): LogSoftmax(dim=1)
  (criterion): NLLLoss()
)


In [15]:
class My_LSTM(My_RNN):

    def __init__(self, n_hidden, n_hid_layers=1, epochs=10, eta=0.0005, batch_size=100, weight=None, info=True):
        """ A PyTorch neural network model based on LSTM RNN cell, batched """
        super(My_LSTM, self).__init__(n_hidden, n_hid_layers=n_hid_layers,
                                      epochs=epochs, eta=eta, batch_size=batch_size, weight=weight, info=info)

    def init_hidden(self, _B):  # batch_first = True
        return (torch.zeros(self.n_hid_layers, _B, self.n_hidden).to(gpu),
                torch.zeros(self.n_hid_layers, _B, self.n_hidden).to(gpu))

    def init_cell(self, _M):  # override
        dropout = 0.2 if self.n_hid_layers > 1 else 0
        return nn.LSTM(_M, self.n_hidden, self.n_hid_layers,
                       #nonlinearity='relu',
                       bias=False, batch_first=True, dropout=dropout)

In [16]:
Xy = get_Xy()

In [17]:
def kfoldRnnLSTM(_Xy, _weights):
    cm_y, cm_p = [], []
    Acc = []
    kf = StratifiedKFold(n_splits=10)
    for tr_ix, ts_ix in kf.split(np.arange(len(_Xy)), get_y(_Xy)):
        rnn = My_LSTM(128, n_hid_layers=1, epochs=1000, eta=0.005, batch_size=2000, weight=_weights, info=True).to(gpu)

        X_tr = [_Xy[_] for _ in tr_ix]  # predict uses X and y as a tuple
        X_ts = get_X([_Xy[_] for _ in ts_ix])
        y_ts = get_y([_Xy[_] for _ in ts_ix])

        rnn.fit(X_tr)
        y_pred = rnn.predict(X_ts)

        Acc += [np.sum(np.array(y_pred) == np.array(y_ts))/len(y_pred)]

        cm_y += y_ts
        cm_p += y_pred.tolist()

    return Acc, cm_y, cm_p


In [18]:
%%time
WEIGHTS = torch.tensor(nk, dtype=torch.float32)
Acc_we = kfoldRnnLSTM(Xy, WEIGHTS)
print(f'RNN 10-fold CV Acc= {np.mean(Acc_we[0]):.2f} {chr(177)}{np.std(Acc_we[0]):.3f}')

1000/1000 | Loss:   0.20 | Avg loss:   0.00 | 0

RNN 10-fold CV Acc= 0.59 ±0.273
CPU times: user 4h 26min 36s, sys: 3h 17min 32s, total: 7h 44min 8s
Wall time: 1h 14min 42s


In [19]:
get_cm(Acc_we[1], Acc_we[2])

Unnamed: 0,Engli,Scott
Engli,2213,1455
Scott,73,27


**Tune by setting WEIGHTS to None**

In [20]:
%%time
WEIGHTS = None
Acc_nwe = kfoldRnnLSTM(Xy, WEIGHTS)
print(f'RNN 10-fold CV Acc (weights none)= {np.mean(Acc_nwe[0]):.2f} {chr(177)}{np.std(Acc_nwe[0]):.3f}')

1000/1000 | Loss:   0.12 | Avg loss:   0.00 | 0

RNN 10-fold CV Acc (weights none)= 0.95 ±0.073
CPU times: user 4h 35min 46s, sys: 3h 59min 46s, total: 8h 35min 33s
Wall time: 2h 3min 13s


In [21]:
get_cm(Acc_nwe[1], Acc_nwe[2])

Unnamed: 0,Engli,Scott
Engli,3576,92
Scott,100,0


**Set performance metric to F1-score and compare results**

In [22]:
def kfoldRnnLSTMF1(_Xy, _weights):
    cm_y, cm_p = [], []
    Acc = []
    kf = StratifiedKFold(n_splits=10)
    for tr_ix, ts_ix in kf.split(np.arange(len(_Xy)), get_y(_Xy)):
        rnn = My_LSTM(128, n_hid_layers=1, epochs=1000, eta=0.005, batch_size=2000, weight=_weights, info=True).to(gpu)

        X_tr = [_Xy[_] for _ in tr_ix]  # predict uses X and y as a tuple
        X_ts = get_X([_Xy[_] for _ in ts_ix])
        y_ts = get_y([_Xy[_] for _ in ts_ix])

        rnn.fit(X_tr)
        y_pred = rnn.predict(X_ts)

        #Acc += [np.sum(np.array(y_pred) == np.array(y_ts))/len(y_pred)]
        Acc += [f1_score(np.array(y_ts),y_pred)]

        cm_y += y_ts
        cm_p += y_pred.tolist()

    return Acc, cm_y, cm_p


In [23]:
%%time
WEIGHTS = torch.tensor(nk, dtype=torch.float32)
f1s_we = kfoldRnnLSTMF1(Xy, WEIGHTS)
print(f'RNN F1 Score (Weights) = {np.mean(f1s_we[0]):.3f} {chr(177)}{np.std(f1s_we[0]):.3f}')

1000/1000 | Loss:   0.50 | Avg loss:   0.00 | 0

RNN F1 Score (Weights) = 0.111 ±0.149
CPU times: user 4h 24min 13s, sys: 3h 32min 14s, total: 7h 56min 28s
Wall time: 1h 40min 15s


In [24]:
%%time
WEIGHTS = None
f1s_nwe = kfoldRnnLSTMF1(Xy, WEIGHTS)
print(f'RNN F1 Score (No Weights) = {np.mean(f1s_nwe[0]):.3f} {chr(177)}{np.std(f1s_nwe[0]):.3f}')

1000/1000 | Loss:   0.10 | Avg loss:   0.00 | 0

RNN F1 Score (No Weights) = 0.000 ±0.000
CPU times: user 4h 32min 13s, sys: 3h 56min 2s, total: 8h 28min 15s
Wall time: 1h 21min 12s
