In [1]:
import argparse
import gzip
import csv
import json
import logging
import mlflow
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

from sklearn.metrics import balanced_accuracy_score
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm, trange
from torch.utils.data import IterableDataset

from gensim import corpora
from gensim.parsing import preprocessing

logging.basicConfig(
    format="%(asctime)s: %(levelname)s - %(message)s",
    level=logging.INFO
)


In [2]:
class PadSequences:
    def __init__(self, pad_value=0, max_length=None, min_length=1):
        assert max_length is None or min_length <= max_length
        self.pad_value = pad_value
        self.max_length = max_length
        self.min_length = min_length

    def __call__(self, items):
        data = [item["data"] for item in items]
        target = [item["target"] for item in items]
        seq_lengths = [len(d) for d in data]

        if self.max_length:
            max_length = self.max_length
            seq_lengths = [min(self.max_length, l) for l in seq_lengths]
        else:
            max_length = max(self.min_length, max(seq_lengths))

        data = [d[:l] + [self.pad_value] * (max_length - l)
                for d, l in zip(data, seq_lengths)]

        return {
            "data": torch.LongTensor(data),
            "target": torch.LongTensor(target)
        }

In [3]:
class MeliChallengeDataset(IterableDataset):
    def __init__(self,
                 dataset_path,
                 random_buffer_size=2048):
        assert random_buffer_size > 0
        self.dataset_path = dataset_path
        self.random_buffer_size = random_buffer_size

        with gzip.open(self.dataset_path, "rt") as dataset:
            item = json.loads(next(dataset).strip())
            self.n_labels = item["n_labels"]
            self.dataset_size = item["size"]

    def __len__(self):
        return self.dataset_size

    def __iter__(self):
        try:
            with gzip.open(self.dataset_path, "rt") as dataset:
                shuffle_buffer = []

                for line in dataset:
                    item = json.loads(line.strip())
                    item = {
                        "data": item["data"],
                        "target": item["target"]
                    }
                    
                    self.item = item

                    if self.random_buffer_size == 1:
                        yield item
                    else:
                        shuffle_buffer.append(item)

                        if len(shuffle_buffer) == self.random_buffer_size:
                            random.shuffle(shuffle_buffer)
                            for item in shuffle_buffer:
                                yield item
                            shuffle_buffer = []

                if len(shuffle_buffer) > 0:
                    random.shuffle(shuffle_buffer)
                    for item in shuffle_buffer:
                        yield item
        except GeneratorExit:
            return

In [4]:
class MLPClassifier(nn.Module):
    def __init__(self,
                 pretrained_embeddings_path,
                 token_to_index,
                 n_labels,
                 hidden_layers=[256, 128],
                 dropout=0.3,
                 vector_size=300,
                 freeze_embedings=True):
        super().__init__()
        with gzip.open(token_to_index, "rt") as fh:
            token_to_index = json.load(fh)
        embeddings_matrix = torch.randn(len(token_to_index), vector_size)
        embeddings_matrix[0] = torch.zeros(vector_size)
        with gzip.open(pretrained_embeddings_path, "rt") as fh:
            next(fh)
            for line in fh:
                word, vector = line.strip().split(None, 1)
                if word in token_to_index:
                    embeddings_matrix[token_to_index[word]] =\
                        torch.FloatTensor([float(n) for n in vector.split()])
        self.embeddings = nn.Embedding.from_pretrained(embeddings_matrix,
                                                       freeze=freeze_embedings,
                                                       padding_idx=0)
        self.hidden_layers = [
            nn.Linear(vector_size, hidden_layers[0])
        ]
        for input_size, output_size in zip(hidden_layers[:-1], hidden_layers[1:]):
            self.hidden_layers.append(
                nn.Linear(input_size, output_size)
            )
        self.dropout = dropout
        self.hidden_layers = nn.ModuleList(self.hidden_layers)
        self.output = nn.Linear(hidden_layers[-1], n_labels)
        self.vector_size = vector_size

    def forward(self, x):
        x = self.embeddings(x)
        x = torch.mean(x, dim=1)
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            if self.dropout:
                x = F.dropout(x, self.dropout)
        x = self.output(x)
        return x

In [9]:
class RNNClassifier(nn.Module):
    def __init__(self,
                 pretrained_embeddings_path,
                 token_to_index,
                 n_labels,
                 hidden_layer=128,
                 dropout=0.3,
                 vector_size=300,
                 num_layers=1,
                 bias=True,
                 bidirectional=False,
                 freeze_embedings=True):
        super().__init__()

        with gzip.open(token_to_index, "rt") as fh:
            token_to_index = json.load(fh)
        embeddings_matrix = torch.randn(len(token_to_index), vector_size)
        embeddings_matrix[0] = torch.zeros(vector_size)
        with gzip.open(pretrained_embeddings_path, "rt") as fh:
            next(fh)
            for line in fh:
                word, vector = line.strip().split(None, 1)
                if word in token_to_index:
                    embeddings_matrix[token_to_index[word]] =\
                        torch.FloatTensor([float(n) for n in vector.split()])
        self.embeddings = nn.Embedding.from_pretrained(embeddings_matrix,
                                                       freeze=freeze_embedings,
                                                       padding_idx=0)
        # Set our LSTM parameters
        self.lstm_config = {'input_size': vector_size,
                            'hidden_size': hidden_layer,
                            'num_layers': num_layers,
                            'bias': bias,
                            'batch_first': True,
                            'dropout': dropout,
                            'bidirectional': bidirectional}
        if num_layers == 1: self.lstm_config['dropout']=0
        # Set our FC layer parameters
        self.linear_config = {'in_features': hidden_layer,
                              'out_features': n_labels,
                              'bias': bias}
        
        # Instanciate the layers
        self.encoder = nn.LSTM(**self.lstm_config)
        self.output = nn.Linear(hidden_layer, n_labels)
        self.vector_size = vector_size

    def forward(self, x):
        x = self.embeddings(x)
        outputs, (ht, ct) = self.encoder(x)
        predictions = self.output(ht[-1])
        return predictions

In [None]:
class CNNClassifier(nn.Module):
    def __init__(self, 
                 pretrained_embeddings_path, 
                 token_to_index,
                 vector_size,
                 n_labels,
                 filter_count=100,
                 filters_lenght=[2,3,4],
                 freeze_embedings=True):
        super().__init__()
        with gzip.open(token_to_index, "rt") as fh:
            token_to_index = json.load(fh)
        embeddings_matrix = torch.randn(len(token_to_index), vector_size)
        embeddings_matrix[0] = torch.zeros(vector_size)
        with gzip.open(pretrained_embeddings_path, "rt") as fh:
            next(fh)
            for line in fh:
                word, vector = line.strip().split(None, 1)
                if word in token_to_index:
                    embeddings_matrix[token_to_index[word]] =\
                        torch.FloatTensor([float(n) for n in vector.split()])
        self.embeddings = nn.Embedding.from_pretrained(embeddings_matrix,
                                                       freeze=freeze_embedings,
                                                       padding_idx=0)
        self.convs = []
        for filter_lenght in filters_length:
            self.convs.append(
                nn.Conv1d(vector_size, filters_count, filter_lenght)
            )
        self.convs = nn.ModuleList(self.convs)
        self.fc = nn.Linear(filters_count * len(filters_length), 128)
        self.output = nn.Linear(128, n_labels)
        self.vector_size = vector_size
    
    @staticmethod
    def conv_global_max_pool(x, conv):
        return F.relu(conv(x).transpose(1, 2).max(1)[0])
    
    def forward(self, x):
        x = self.embeddings(x).transpose(1, 2)  # Conv1d takes (batch, channel, seq_len)
        x = [self.conv_global_max_pool(x, conv) for conv in self.convs]
        x = torch.cat(x, dim=1)
        x = F.relu(self.fc(x))
        x = torch.sigmoid(self.output(x))
        return x

In [5]:
train_data='./data/meli-challenge-2019/spanish.train.jsonl.gz'
test_data=''
token_to_index='./data/meli-challenge-2019/spanish_token_to_index.json.gz'
pretrained_embeddings='./data/SBW-vectors-300-min5.txt.gz'
language='spanish'
validation_data='./data/meli-challenge-2019/spanish.validation.jsonl.gz'
embeddings_size=300
hidden_layer=64
num_layers=1
dropout=0.3
epochs=1
bidirectional=True
batch_size=128
learning_rate=0.001
weight_decay=0.005

In [6]:
pad_sequences = PadSequences(
    pad_value=0,
    max_length=None,
    min_length=1
)

logging.info("Building training dataset")
train_dataset = MeliChallengeDataset(
    dataset_path=train_data,
    random_buffer_size=2048  # This can be a hypterparameter
    )

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,  # This can be a hyperparameter
    shuffle=False,
    collate_fn=pad_sequences,
    drop_last=False
    )

2021-03-20 20:00:14,316: INFO - Building training dataset


In [7]:
if validation_data:
    logging.info("Building validation dataset")
    validation_dataset = MeliChallengeDataset(
        dataset_path=validation_data,
        random_buffer_size=1
    )
    validation_loader = DataLoader(
        validation_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=pad_sequences,
        drop_last=False
    )
else:
    validation_dataset = None
    validation_loader = None

if test_data:
    logging.info("Building test dataset")
    test_dataset = MeliChallengeDataset(
        dataset_path=test_data,
        random_buffer_size=1
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=pad_sequences,
        drop_last=False
    )
else:
    test_dataset = None
    test_loader = None

2021-03-20 20:00:14,968: INFO - Building validation dataset


In [9]:
logging.info("Starting experiment")
# Log all relevent hyperparameters

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

logging.info("Building classifier")
model = MLPClassifier(
    pretrained_embeddings_path=pretrained_embeddings,
    token_to_index=token_to_index,
    n_labels=train_dataset.n_labels,
    hidden_layers=hidden_layers,
    dropout=dropout,
    vector_size=embeddings_size,
    freeze_embedings=True  # This can be a hyperparameter
)
model = model.to(device)
logging.info(model)
loss = nn.CrossEntropyLoss()
logging.info(loss)
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,  # This can be a hyperparameter
    weight_decay=weight_decay  # This can be a hyperparameter
)

2021-03-19 14:27:08,497: INFO - Starting experiment
2021-03-19 14:27:08,531: INFO - Building classifier


KeyboardInterrupt: 

In [10]:
logging.info("Starting experiment")
# Log all relevent hyperparameters
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

logging.info("Building classifier")
model = RNNClassifier(
    pretrained_embeddings_path=pretrained_embeddings,
    token_to_index=token_to_index,
    n_labels=train_dataset.n_labels,
    hidden_layer=hidden_layer,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=bidirectional,
    vector_size=embeddings_size,
    freeze_embedings=True  # This can be a hyperparameter
)
model = model.to(device)
logging.info(model)
loss = nn.CrossEntropyLoss()
logging.info(loss)
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,  # This can be a hyperparameter
    weight_decay=weight_decay  # This can be a hyperparameter
)

2021-03-20 20:01:47,840: INFO - Starting experiment
2021-03-20 20:01:47,842: INFO - Building classifier
2021-03-20 20:02:17,499: INFO - RNNClassifier(
  (embeddings): Embedding(50002, 300, padding_idx=0)
  (encoder): LSTM(300, 64, batch_first=True, bidirectional=True)
  (output): Linear(in_features=64, out_features=632, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)
2021-03-20 20:02:17,501: INFO - CrossEntropyLoss()


In [11]:
logging.info("Training classifier")
model.train()
for epoch in trange(epochs):
    running_loss = []
    for idx, batch in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()
        data = batch["data"].to(device)
        target = batch["target"].to(device)
        output = model(data)
#        if idx % 1000 == 0 or idx == 1:
#            logging.info(
#                f" data{data.shape}, target: {target.shape}, output: {output.shape}"
#            )
        loss_value = loss(output, target)
        loss_value.backward()
        optimizer.step()
        running_loss.append(loss_value.item())

2021-03-20 20:02:38,883: INFO - Training classifier


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/38245 [00:00<?, ?it/s]

In [12]:
if validation_dataset:
    logging.info("Evaluating model on validation")
    model.eval()
    running_loss = []
    targets = []
    predictions = []
    with torch.no_grad():
        for batch in tqdm(validation_loader):
            data = batch["data"].to(device)
            target = batch["target"].to(device)
            output = model(data)
            running_loss.append(
                loss(output, target).item()
            )
            targets.extend(batch["target"].numpy())
            predictions.extend(output.argmax(axis=1).detach().cpu().numpy())
        validation_loss=sum(running_loss) / len(running_loss)
        validation_bacc=balanced_accuracy_score(targets, predictions)

2021-03-20 20:07:29,336: INFO - Evaluating model on validation


  0%|          | 0/9562 [00:00<?, ?it/s]

In [13]:
validation_bacc

0.0284787952652879

In [20]:
validation_loss

6.31628486427446

In [88]:
running_loss

[190.46923828125,
 155.0888671875,
 134.0845947265625,
 119.7809066772461,
 136.447998046875,
 116.58866882324219,
 152.62648010253906,
 140.319091796875,
 138.42308044433594,
 109.41680145263672,
 170.3216094970703,
 118.48246765136719,
 143.74794006347656,
 157.5228729248047,
 157.9416961669922,
 171.3623809814453,
 149.07916259765625,
 139.8533935546875,
 109.84098052978516,
 119.46031188964844,
 137.6290283203125,
 164.5352783203125,
 135.20913696289062,
 133.97030639648438,
 125.65333557128906,
 113.91716003417969,
 178.70547485351562,
 126.47895812988281,
 124.00418853759766,
 176.83006286621094,
 135.4651641845703,
 134.6033935546875,
 171.90615844726562,
 158.7423095703125,
 198.43368530273438,
 146.71197509765625,
 143.58074951171875,
 165.2445068359375,
 173.45611572265625,
 150.12811279296875,
 177.45472717285156,
 183.646728515625,
 176.5774688720703,
 117.40399169921875,
 153.33241271972656,
 144.76148986816406,
 155.33474731445312,
 151.03268432617188,
 172.53758239746094