In [1]:
from types import SimpleNamespace
from collections import Counter
import os
import re
import pathlib
import array
import pickle
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

In [2]:
DATASET_VERSION = 'ca-100'
COMPETITION_ROOT = '../input/vectors'
DATASET_ROOT = f'../input/cbow-preprocessing/data/{DATASET_VERSION}'
WORKING_ROOT = f'data/{DATASET_VERSION}'
DATASET_PREFIX = 'ca.wiki'

In [3]:
params = SimpleNamespace(
    embedding_dim = 100,
    window_size = 5,
    batch_size = 1000,
    epochs = 4,
    preprocessed = f'{DATASET_ROOT}/{DATASET_PREFIX}',
    working = f'{WORKING_ROOT}/{DATASET_PREFIX}',
    modelname = f'{WORKING_ROOT}/{DATASET_VERSION}',
    train = True
)

In [4]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    pathlib.Path('/content/drive/My Drive/POE/vectors').mkdir(parents=True, exist_ok=True)
    os.chdir('/content/drive/My Drive/POE/vectors')
except:
    pass

In [5]:
class Vocabulary(object):
    def __init__(self, pad_token='<pad>', unk_token='<unk>', eos_token='<eos>'):
        self.token2idx = {}
        self.idx2token = []
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.eos_token = eos_token
        if pad_token is not None:
            self.pad_index = self.add_token(pad_token)
        if unk_token is not None:
            self.unk_index = self.add_token(unk_token)
        if eos_token is not None:
            self.eos_index = self.add_token(eos_token)

    def add_token(self, token):
        if token not in self.token2idx:
            self.idx2token.append(token)
            self.token2idx[token] = len(self.idx2token) - 1
        return self.token2idx[token]

    def get_index(self, token):
        if isinstance(token, str):
            return self.token2idx.get(token, self.unk_index)
        else:
            return [self.token2idx.get(t, self.unk_index) for t in token]

    def __len__(self):
        return len(self.idx2token)

    def save(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, filename):
        with open(filename, 'rb') as f:
            self.__dict__.update(pickle.load(f))

In [6]:
def batch_generator(idata, target, batch_size, shuffle=True):
    nsamples = len(idata)
    if shuffle:
        perm = np.random.permutation(nsamples)
    else:
        perm = range(nsamples)

    for i in range(0, nsamples, batch_size):
        batch_idx = perm[i:i+batch_size]
        if target is not None:
            yield idata[batch_idx], target[batch_idx]
        else:
            yield idata[batch_idx], None

CBOW model
----------
You can add new parameters to the model in the *\_\_init\_\_()* method with *self.register_buffer()* (for parameters not to be trained):

    self.register_buffer('position_weight', torch.tensor([1,2,2,1], dtype=torch.float32))

or *nn.Parameter()* (for parameters to be trained)

    self.position_weight = nn.Parameter(torch.tensor([1,2,2,1], dtype=torch.float32))
    
In both cases, you can reference and use them in the *forward* method as

    self.position_weight

In [7]:
class CBOW(nn.Module):
    def _init_(self, num_embeddings, embedding_dim):
        super()._init_()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)

    # B = Batch size
    # W = Number of context words (left + right)
    # E = embedding_dim
    # V = num_embeddings (number of words)
    def forward(self, input):
        # input shape is (B, W)
        e = self.emb(input)
        # e shape is (B, W, E)
        u = e.sum(dim=1)
        # u shape is (B, E)
        v = self.lin(u)
        # v shape is (B, V)
        return v

In [8]:
class CBOWa(nn.Module):
    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)
        self.register_buffer('position_weight', torch.tensor([1,2,2,1], dtype=torch.float32))

    def forward(self, inputs): #Inputs és l'índex de la paraula. Amb nn.Embedding la passem a un vector?
        # B * W1
        U = (self.emb(inputs) * self.position_weight.view(1,4,1)).sum(dim=1)
        
        # B * E
        V = self.lin(U)
        # B * V
        return V

In [9]:
class CBOWb(nn.Module):
    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)
        self.position_weight = nn.Parameter(torch.tensor([1,2,2,1], dtype=torch.float32))

    def forward(self, inputs): #Inputs és l'índex de la paraula. Amb nn.Embedding la passem a un vector?
        # B * W1
        U = (self.emb(inputs) * self.position_weight.view(1,4,1)).sum(dim=1)
        
        # B * E
        V = self.lin(U)
        # B * V
        return V

In [10]:
c_params = [[1 for i in range(params.embedding_dim)] for j in range(params.window_size-1)]

In [11]:
class CBOWc(nn.Module):
    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)
        self.position_weight = nn.Parameter(torch.tensor(c_params, dtype=torch.float32))

    def forward(self, inputs): #Inputs és l'índex de la paraula. Amb nn.Embedding la passem a un vector?
        # B * W1
        U = (self.emb(inputs) * self.position_weight.view(1,4,-1)).sum(dim=1)
        
        # B * E
        V = self.lin(U)
        # B * V
        return V

In [12]:
def load_preprocessed_dataset(prefix):
    # Try loading precomputed vocabulary and preprocessed data files
    token_vocab = Vocabulary()
    token_vocab.load(f'{prefix}.vocab')
    data = []
    for part in ['train', 'valid', 'test']:
        with np.load(f'{prefix}.{part}.npz') as set_data:
            idata, target = set_data['idata'], set_data['target']
            data.append((idata, target))
            print(f'Number of samples ({part}): {len(target)}')
    print("Using precomputed vocabulary and data files")
    print(f'Vocabulary size: {len(token_vocab)}')
    return token_vocab, data

In [13]:
def train(model, criterion, optimizer, idata, target, batch_size, device, log=False):
    model.train()
    total_loss = 0
    ncorrect = 0
    ntokens = 0
    niterations = 0
    for X, y in batch_generator(idata, target, batch_size, shuffle=True):
        # Get input and target sequences from batch
        X = torch.tensor(X, dtype=torch.long, device=device)
        y = torch.tensor(y, dtype=torch.long, device=device)

        model.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        # Training statistics
        total_loss += loss.item()
        ncorrect += (torch.max(output, 1)[1] == y).sum().item()
        ntokens += y.numel()
        niterations += 1
        if niterations == 200 or niterations == 500 or niterations % 1000 == 0:
            print(f'Train: wpb={ntokens//niterations}, num_updates={niterations}, accuracy={100*ncorrect/ntokens:.1f}, loss={total_loss/ntokens:.2f}')

    total_loss = total_loss / ntokens
    accuracy = 100 * ncorrect / ntokens
    if log:
        print(f'Train: wpb={ntokens//niterations}, num_updates={niterations}, accuracy={accuracy:.1f}, loss={total_loss:.2f}')
    return accuracy, total_loss

In [14]:
def validate(model, criterion, idata, target, batch_size, device):
    model.eval()
    total_loss = 0
    ncorrect = 0
    ntokens = 0
    niterations = 0
    y_pred = []
    for X, y in batch_generator(idata, target, batch_size, shuffle=False):
        # Get input and target sequences from batch
        X = torch.tensor(X, dtype=torch.long, device=device)
        output = model(X)
        if target is not None:
            y = torch.tensor(y, dtype=torch.long, device=device)
            loss = criterion(output, y)
            total_loss += loss.item()
            ncorrect += (torch.max(output, 1)[1] == y).sum().item()
            ntokens += y.numel()
            niterations += 1
        else:
            pred = torch.max(output, 1)[1].detach().to('cpu').numpy()
            y_pred.append(pred)

    if target is not None:
        total_loss = total_loss / ntokens
        accuracy = 100 * ncorrect / ntokens
        return accuracy, total_loss
    else:
        return np.concatenate(y_pred)

In [15]:
# Create working dir
pathlib.Path(WORKING_ROOT).mkdir(parents=True, exist_ok=True)

In [16]:
# Select device
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    print("WARNING: Training without GPU can be very slow!")

In [17]:
vocab, data = load_preprocessed_dataset(params.preprocessed)

Number of samples (train): 80234403
Number of samples (valid): 163012
Number of samples (test): 164055
Using precomputed vocabulary and data files
Vocabulary size: 100002


In [18]:
# 'El Periodico' validation dataset
valid_x_df = pd.read_csv(f'{COMPETITION_ROOT}/x_valid.csv')
tokens = valid_x_df.columns[1:]
valid_x = valid_x_df[tokens].apply(vocab.get_index).to_numpy(dtype='int32')
valid_y_df = pd.read_csv(f'{COMPETITION_ROOT}/y_valid.csv')
valid_y = valid_y_df['token'].apply(vocab.get_index).to_numpy(dtype='int32')

In [19]:
mm = CBOW()
dir(mm)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_name',
 '_init_',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_modules',
 '_named_members',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_tracing_name',
 '_version',
 'add_module',
 'apply',
 'buffers',
 'children',
 'cpu',
 'cuda',
 'double',
 'dump_patches',
 'eval',
 'extra_repr',
 'float',
 'forward',
 'half',
 'load_state_dict',
 'modules',
 'named_buffers',
 'na

In [20]:
models = {
    'a' : CBOWa(num_embeddings=len(vocab), embedding_dim=params.embedding_dim),
    'b' : CBOWb(num_embeddings=len(vocab), embedding_dim=params.embedding_dim),
    'c' : CBOWc(num_embeddings=len(vocab), embedding_dim=params.embedding_dim)
}

In [21]:

for model_type, model in models.items():
    model = model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(reduction='sum')

    train_accuracy = []
    wiki_accuracy = []
    valid_accuracy = []
    for epoch in range(params.epochs):
        acc, loss = train(model, criterion, optimizer, data[0][0], data[0][1], params.batch_size, device, log=True)
        train_accuracy.append(acc)
        print(f'| epoch {epoch:03d} | train accuracy={acc:.1f}%, train loss={loss:.2f}')
        acc, loss = validate(model, criterion, data[1][0], data[1][1], params.batch_size, device)
        wiki_accuracy.append(acc)
        print(f'| epoch {epoch:03d} | valid accuracy={acc:.1f}%, valid loss={loss:.2f} (wikipedia)')
        acc, loss = validate(model, criterion, valid_x, valid_y, params.batch_size, device)
        valid_accuracy.append(acc)
        print(f'| epoch {epoch:03d} | valid accuracy={acc:.1f}%, valid loss={loss:.2f} (El Periódico)')

    # Save model
    torch.save(model.state_dict(), f'{params.modelname}_{model_type}.pt')

Train: wpb=1000, num_updates=200, accuracy=3.6, loss=9.93
Train: wpb=1000, num_updates=500, accuracy=6.1, loss=8.84
Train: wpb=1000, num_updates=1000, accuracy=8.5, loss=8.06
Train: wpb=1000, num_updates=2000, accuracy=11.0, loss=7.36
Train: wpb=1000, num_updates=3000, accuracy=12.7, loss=7.01
Train: wpb=1000, num_updates=4000, accuracy=13.9, loss=6.78
Train: wpb=1000, num_updates=5000, accuracy=14.9, loss=6.61
Train: wpb=1000, num_updates=6000, accuracy=15.7, loss=6.49
Train: wpb=1000, num_updates=7000, accuracy=16.4, loss=6.38
Train: wpb=1000, num_updates=8000, accuracy=17.0, loss=6.29
Train: wpb=1000, num_updates=9000, accuracy=17.6, loss=6.22
Train: wpb=1000, num_updates=10000, accuracy=18.0, loss=6.15
Train: wpb=1000, num_updates=11000, accuracy=18.5, loss=6.09
Train: wpb=1000, num_updates=12000, accuracy=18.9, loss=6.04
Train: wpb=1000, num_updates=13000, accuracy=19.2, loss=6.00
Train: wpb=1000, num_updates=14000, accuracy=19.5, loss=5.96
Train: wpb=1000, num_updates=15000, accu

In [22]:
# 'El Periodico' test dataset
valid_x_df = pd.read_csv(f'{COMPETITION_ROOT}/x_test.csv')
test_x = valid_x_df[tokens].apply(vocab.get_index).to_numpy(dtype='int32')
y_pred = validate(model, None, test_x, None, params.batch_size, device)
y_token = [vocab.idx2token[index] for index in y_pred]

In [23]:
submission = pd.DataFrame({'id':valid_x_df['id'], 'token': y_token}, columns=['id', 'token'])
print(submission.head())
submission.to_csv('submission.csv', index=False)

   id      token
0   0      <unk>
1   1      <unk>
2   2         la
3   3      <unk>
4   4  Assemblea
