In [17]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as fn
import torch.optim as optim

from collections import OrderedDict

import mlflow
import gzip
import tempfile

from tqdm.notebook import tqdm, trange
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import average_precision_score
from torch.nn import DataParallel

In [2]:
# Question-type labels. The position of each label is its index in the
# multi-hot target vector built by the datasets' `vectorize_cat`.
QTYPES = [
    '<color>',
    '<shape>',
    '<spatial>',
    'other',
    '<action>',
    'object',
    '<texture>',
    '<size>',
]

In [3]:
class GWQtypeDataset(Dataset):
    """Questions encoded as the sum of their word embeddings.

    Each item is a dict with:
      * ``'qid'``: the row's ``question_id``;
      * ``'question'``: the sum of the embeddings of the question's
        whitespace-separated, lower-cased tokens;
      * ``'qtype'``: a multi-hot vector aligned with ``QTYPES``.

    Args:
        file_path: CSV file (anything ``pd.read_csv`` accepts) with
            ``question_id``, ``question`` and ``qtype`` columns.
        embs: Mapping from token to embedding vector.
        unk_emb: Embedding used for tokens that are missing from ``embs``.
        successful_only: Kept for interface compatibility; currently unused.
    """

    def __init__(self, file_path, embs, unk_emb, successful_only=False):
        dataset = pd.read_csv(file_path)
        # Assign the filled column back instead of a chained in-place fillna:
        # the chained form operates on a temporary under pandas Copy-on-Write
        # and would leave the NaNs in the frame.
        dataset['qtype'] = dataset['qtype'].fillna('<other>')
        dataset['question'] = dataset['question'].apply(lambda text: text.strip().lower())
        self.dataset = dataset
        self.embs = embs
        self.unk_emb = unk_emb

    def vectorize_cat(self, x):
        """Turn a stringified label list into a multi-hot vector over ``QTYPES``.

        The first and last characters of ``x`` are dropped (the brackets of
        ``"['a', 'b']"``, or the angle brackets of the ``'<other>'`` fill
        value), quotes are removed and the rest is split on ``', '``.
        """
        labels = x[1:-1].replace('\'', '').split(', ')
        # 'super-category' is an alias of 'object'. Remap it per label: the
        # parsed value is a list, so comparing the whole list to a string
        # never matches.
        labels = ['object' if label == 'super-category' else label for label in labels]
        # Mark with 1 the question types that are present.
        return [int(q in labels) for q in QTYPES]

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the ``qid`` / ``question`` / ``qtype`` dict for row ``idx``."""
        datapoint = self.dataset.iloc[idx]
        words = datapoint.question.split()
        if words:
            sent_embs = [self.embs.get(w, self.unk_emb) for w in words]
            question = np.array(sent_embs).sum(axis=0)
        else:
            # An empty question would otherwise sum to a scalar 0.0, which
            # cannot be batched with the other (vector-shaped) items.
            question = np.zeros_like(self.unk_emb)

        return {'qid': datapoint.question_id,
                'question': question,
                'qtype': np.asarray(self.vectorize_cat(datapoint.qtype))}

In [4]:
def preprocess_data(corpus, embeddings_dict, unk_emb):
    """Encode each question of ``corpus`` as the sum of its word embeddings.

    Questions are split on whitespace. A trailing ``'?'`` glued to the last
    word is split off into its own token. Tokens missing from
    ``embeddings_dict`` contribute ``unk_emb``.

    Args:
        corpus: Iterable of question strings.
        embeddings_dict: Mapping from token to embedding vector.
        unk_emb: Embedding used for out-of-vocabulary tokens.

    Returns:
        A tuple ``(data, skipped)``: ``data`` is an array with one summed
        embedding per non-empty question, and ``skipped`` lists the indices
        of the empty questions that were left out.
    """
    new_data = []
    skipped = []
    for i, entry in enumerate(corpus):
        words = entry.strip().split()
        if not words:
            print('WARNING: skipped empty question')
            skipped.append(i)
            continue
        # Separate the trailing '?' from the last word. Only do so when it is
        # actually there: truncating unconditionally would cut a real
        # character from questions without '?', and would inject an empty
        # token (counted as unknown) when '?' is already its own token.
        last = words[-1]
        if last.endswith('?'):
            stem = last[:-1]
            words = words[:-1] + ([stem] if stem else []) + ['?']
        sent_embs = [embeddings_dict.get(w, unk_emb) for w in words]
        new_data.append(np.array(sent_embs).sum(axis=0))
    return np.array(new_data), skipped

In [5]:
# Load the gzipped GloVe text file into a token -> float32 vector mapping.
PRETRAINED_EMBS = "glove.6B.50d.txt.gz"
embeddings_dict_50d = {}
# Explicit encoding so decoding does not depend on the platform's default
# locale (assumes the file is UTF-8, as distributed GloVe files are; confirm).
with gzip.open(PRETRAINED_EMBS, "rt", encoding="utf-8") as fh:
    for line in fh:
        # Each line is "<token> <v1> <v2> ...".
        word, *coefs = line.split()
        embeddings_dict_50d[word] = np.asarray(coefs, "float32")
# Random vector for out-of-vocabulary tokens, cast to float32 so it has the
# same dtype as the loaded vectors it gets summed with.
UNK_EMB_50d = np.random.rand(50).astype("float32")

In [9]:
# Also expose the unknown-word vector under an explicit '<UNK>' key.
embeddings_dict_50d.update({'<UNK>': UNK_EMB_50d})

In [6]:
class QtypeClassifierA(nn.Module):
    """
    MLP question-type classifier over a fixed-size question embedding.

    The default sizes reproduce the best config from the Clustering
    experiments (50 -> 50 -> 8); they can be overridden to reuse the model
    with other embedding sizes or label sets.

    Args:
        input_dim: Size of the input question embedding.
        hidden_dim: Width of the hidden layer.
        n_classes: Number of output labels (one per question type).
    """
    def __init__(self, input_dim=50, hidden_dim=50, n_classes=8):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_classes),
        )

    def forward(self, data):
        """Return raw logits in training mode and sigmoid probabilities in eval mode."""
        output = self.mlp(data)
        if not self.training:
            # Eval-mode outputs are already probabilities; do not feed them to
            # a loss that expects logits.
            output = torch.sigmoid(output)
        return output

In [7]:
# Train / validation datasets backed by the summed 50-d GloVe embeddings.
train_data = GWQtypeDataset(
    'guesswhat.train.csv.gz',
    embs=embeddings_dict_50d,
    unk_emb=UNK_EMB_50d,
    successful_only=False,
)
valid_data = GWQtypeDataset(
    'guesswhat.valid.csv.gz',
    embs=embeddings_dict_50d,
    unk_emb=UNK_EMB_50d,
    successful_only=False,
)

In [8]:
# Batched loaders. Validation is not shuffled: the metrics do not depend on
# the order, and a fixed order keeps evaluation passes reproducible.
dataloader = DataLoader(dataset=train_data,
                        batch_size=256,
                        shuffle=True)
dataloader_valid = DataLoader(dataset=valid_data,
                              batch_size=256,
                              shuffle=False)

mlflow.set_experiment("ClassifierA")

with mlflow.start_run():
    mlflow.log_param("model_name", "mlp_best_feature")
    mlflow.log_params({
        "embedding": 'aggregated_glove50d',
    })
    model = QtypeClassifierA()
    # Training loss: the model returns raw logits in train mode.
    lossfn = nn.BCEWithLogitsLoss()
    # Validation loss: in eval mode the model already applies a sigmoid, so
    # its outputs are probabilities and must be scored with plain BCE.
    # Feeding them to BCEWithLogitsLoss would apply the sigmoid twice.
    eval_lossfn = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    for epoch in trange(5):
        # ---- training pass ----
        model.train()
        running_loss = []
        for batch in tqdm(dataloader):
            optimizer.zero_grad()
            output = model(batch["question"].float())
            loss_value = lossfn(output, batch["qtype"].float())
            loss_value.backward()
            optimizer.step()
            running_loss.append(loss_value.item())
        mlflow.log_metric("train_loss", sum(running_loss) / len(running_loss), epoch)

        # ---- validation pass ----
        model.eval()
        running_loss = []
        targets = []
        predictions = []
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for batch in tqdm(dataloader_valid):
                output = model(batch["question"].float())
                running_loss.append(
                    eval_lossfn(output, batch["qtype"].float()).item()
                )
                targets.extend(batch["qtype"].numpy())
                # No squeeze(): a final batch of size 1 would collapse to a
                # 1-D vector and be added as separate scalars, not one row.
                predictions.extend(output.numpy())
        mlflow.log_metric("validation_loss", sum(running_loss) / len(running_loss), epoch)
        mlflow.log_metric("validation_avp", average_precision_score(targets, predictions), epoch)

2021/12/05 20:48:32 INFO mlflow.tracking.fluent: Experiment with name 'ClassifierA' does not exist. Creating a new experiment.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

In [9]:
# Dump validation predictions to a gzipped CSV and log it as an mlflow artifact.
# NOTE(review): this runs outside the `with mlflow.start_run()` block that
# trained the model, so the artifact is presumably attached to a different
# run. Confirm that this is intended.
with tempfile.TemporaryDirectory() as tmpdirname:
    targets = []
    predictions = []
    question_ids = []
    # Eval mode makes the model output sigmoid probabilities.
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader_valid):
            output = model(batch["question"].float())
            targets.extend(batch["qtype"].float().numpy())
            # No squeeze(): keep one row per question even for a batch of size 1.
            predictions.extend(output.numpy())
            question_ids.extend(batch["qid"].numpy())
    # Store the question id next to each row so that predictions can be
    # mapped back to their questions.
    pd.DataFrame(
        {"question_id": question_ids, "prediction": predictions, "target": targets}
    ).to_csv(
        f"{tmpdirname}/predictions.csv.gz", index=False
    )
    mlflow.log_artifact(f"{tmpdirname}/predictions.csv.gz")

  0%|          | 0/470 [00:00<?, ?it/s]

## Second approach

In [140]:
class GWQtypeDatasetRNN(Dataset):
    """Questions encoded as fixed-length sequences of token ids.

    Each item is a dict with:
      * ``'qid'``: the row's ``question_id``;
      * ``'question'``: an array of ``max_len`` token ids, truncated or
        right-padded with the ``'PADDING'`` id;
      * ``'qtype'``: a multi-hot vector aligned with ``QTYPES``.

    Args:
        file_path: CSV file (anything ``pd.read_csv`` accepts) with
            ``question_id``, ``question`` and ``qtype`` columns.
        vocab: Dict with a ``'tok2id'`` mapping from token to integer id; it
            must provide ``'UNK'`` and ``'PADDING'`` entries when needed.
        max_len: Length every question is truncated / padded to.
        successful_only: Kept for interface compatibility; currently unused.
    """

    def __init__(self, file_path, vocab, max_len=20, successful_only=False):
        dataset = pd.read_csv(file_path)
        # Assign the filled column back instead of a chained in-place fillna:
        # the chained form operates on a temporary under pandas Copy-on-Write
        # and would leave the NaNs in the frame.
        dataset['qtype'] = dataset['qtype'].fillna('<other>')
        dataset['question'] = dataset['question'].apply(lambda text: text.strip().lower())
        self.dataset = dataset
        self.vocab = vocab
        self.max_len = max_len

    def vectorize_cat(self, x):
        """Turn a stringified label list into a multi-hot vector over ``QTYPES``.

        The first and last characters of ``x`` are dropped (the brackets of
        ``"['a', 'b']"``, or the angle brackets of the ``'<other>'`` fill
        value), quotes are removed and the rest is split on ``', '``.
        """
        labels = x[1:-1].replace('\'', '').split(', ')
        # 'super-category' is an alias of 'object'. Remap it per label: the
        # parsed value is a list, so comparing the whole list to a string
        # never matches.
        labels = ['object' if label == 'super-category' else label for label in labels]
        # Mark with 1 the question types that are present.
        return [int(q in labels) for q in QTYPES]

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the ``qid`` / ``question`` / ``qtype`` dict for row ``idx``."""
        tok2id = self.vocab['tok2id']
        datapoint = self.dataset.iloc[idx]
        words = datapoint.question.split()
        # Unknown tokens map to the 'UNK' id; keep at most max_len tokens.
        token_ids = [tok2id[w] if w in tok2id else tok2id['UNK'] for w in words]
        token_ids = token_ids[:self.max_len]
        # Right-pad short questions up to max_len.
        missing = self.max_len - len(token_ids)
        if missing > 0:
            token_ids.extend([tok2id['PADDING']] * missing)

        return {'qid': datapoint.question_id,
                'question': np.array(token_ids),
                'qtype': np.array(self.vectorize_cat(datapoint.qtype))}

In [20]:
# Ordered token -> float32 vector mapping. The special tokens are inserted
# first so that enumerating the dict gives 'PADDING' id 0 and 'UNK' id 1.
PRETRAINED_EMBS = "glove.6B.50d.txt.gz"
embeddings_dict_50d = OrderedDict()
# Create the special vectors as float32, like the loaded GloVe vectors: a
# single float64 row would promote the whole stacked embedding matrix.
embeddings_dict_50d['PADDING'] = np.zeros(50, dtype="float32")
embeddings_dict_50d['UNK'] = np.random.rand(50).astype("float32")
# Explicit encoding so decoding does not depend on the platform's default
# locale (assumes the file is UTF-8, as distributed GloVe files are; confirm).
with gzip.open(PRETRAINED_EMBS, "rt", encoding="utf-8") as fh:
    for line in fh:
        # Each line is "<token> <v1> <v2> ...".
        word, *coefs = line.split()
        embeddings_dict_50d[word] = np.asarray(coefs, "float32")

In [86]:
# (Re)define the special-token vectors as float32; updating existing keys
# keeps their position in the ordered mapping.
embeddings_dict_50d.update(
    PADDING=np.zeros(50).astype(np.float32),
    UNK=np.random.rand(50).astype(np.float32),
)

In [91]:
# Token <-> id lookup tables; ids follow the insertion order of the
# embeddings mapping.
vocab = {
    'tok2id': dict(zip(embeddings_dict_50d, range(len(embeddings_dict_50d)))),
    'id2tok': dict(enumerate(embeddings_dict_50d)),
}

# Row i of the matrix is the vector of the token with id i.
emb_matrix = np.array([vector for vector in embeddings_dict_50d.values()])

In [141]:
# Train / validation datasets producing padded token-id sequences.
train_data = GWQtypeDatasetRNN(file_path='guesswhat.train.csv.gz', vocab=vocab)
valid_data = GWQtypeDatasetRNN(file_path='guesswhat.valid.csv.gz', vocab=vocab)

In [10]:
class QtypeClassifierB(nn.Module):
    """
    Uses a GRU over frozen pretrained word embeddings for question classification.

    Args:
        emb_matrix: 2-D array-like of pretrained embeddings, one row per
            token id. Row 0 is treated as the padding token.
        hidden_size: Size of the GRU hidden state.
        n_classes: Number of output labels (one per question type).
    """
    def __init__(self, emb_matrix, hidden_size=128, n_classes=8):
        super().__init__()
        # Coerce to float32 so that the embeddings match the dtype of the GRU
        # parameters; a float64 matrix would fail in the forward pass.
        weights = torch.tensor(emb_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(weights,
                                                      freeze=True,
                                                      padding_idx=0)
        # The GRU input size follows the embedding width instead of being
        # hard-coded.
        self.rnn = nn.GRU(weights.shape[1], hidden_size, batch_first=True)
        self.out_layer = nn.Linear(hidden_size, n_classes)

    def forward(self, data):
        """Return raw logits in training mode and sigmoid probabilities in eval mode.

        ``data`` holds integer token ids and is indexed as (batch, time).
        """
        x = self.embedding(data)
        gru_out, _ = self.rnn(x)
        # Keep only the hidden state of the last time step.
        # NOTE(review): for right-padded inputs this is the state after the
        # padding tokens, not after the last real token. Confirm this is
        # intended.
        output = self.out_layer(fn.relu(gru_out[:, -1, :]))
        if not self.training:
            # Eval-mode outputs are already probabilities; do not feed them to
            # a loss that expects logits.
            output = torch.sigmoid(output)
        return output

In [14]:
# Move the current model to the GPU and wrap it for data-parallel execution.
model = DataParallel(model.cuda())

QtypeClassifierB(
  (embedding): Embedding(400002, 50, padding_idx=0)
  (rnn): GRU(50, 128, batch_first=True)
  (out_layer): Linear(in_features=128, out_features=8, bias=True)
)

In [21]:
# Batched loaders with pinned memory for host-to-GPU copies. Validation is
# not shuffled: the metrics do not depend on the order, and a fixed order
# keeps evaluation passes reproducible.
dataloader = DataLoader(dataset=train_data,
                        batch_size=256,
                        shuffle=True,
                        pin_memory=True)
dataloader_valid = DataLoader(dataset=valid_data,
                              batch_size=256,
                              shuffle=False,
                              pin_memory=True)

mlflow.set_experiment("ClassifierB")

with mlflow.start_run():
    mlflow.log_param("model_name", "RNN_based")
    mlflow.log_params({
        "embedding": 'glove50d',
    })
    # Instantiate the model on the GPU, wrapped for data-parallel execution.
    model = QtypeClassifierB(emb_matrix)
    model.cuda()
    model = DataParallel(model)
    # Training loss: the model returns raw logits in train mode.
    lossfn = nn.BCEWithLogitsLoss()
    # Validation loss: in eval mode the model already applies a sigmoid, so
    # its outputs are probabilities and must be scored with plain BCE.
    # Feeding them to BCEWithLogitsLoss would apply the sigmoid twice.
    eval_lossfn = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    for epoch in trange(5):
        # ---- training pass ----
        model.train()
        running_loss = []
        for batch in tqdm(dataloader):
            optimizer.zero_grad()
            output = model(batch["question"])
            loss_value = lossfn(output, batch["qtype"].float().cuda())
            loss_value.backward()
            optimizer.step()
            running_loss.append(loss_value.item())
        mlflow.log_metric("train_loss", sum(running_loss) / len(running_loss), epoch)

        # ---- validation pass ----
        model.eval()
        running_loss = []
        targets = []
        predictions = []
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for batch in tqdm(dataloader_valid):
                output = model(batch["question"])
                running_loss.append(
                    eval_lossfn(output, batch["qtype"].float().cuda()).item()
                )
                targets.extend(batch["qtype"].numpy())
                # No squeeze(): a final batch of size 1 would collapse to a
                # 1-D vector and be added as separate scalars, not one row.
                predictions.extend(output.cpu().numpy())
        mlflow.log_metric("validation_loss", sum(running_loss) / len(running_loss), epoch)
        mlflow.log_metric("validation_avp", average_precision_score(targets, predictions), epoch)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

  0%|          | 0/2265 [00:00<?, ?it/s]

  0%|          | 0/470 [00:00<?, ?it/s]

In [25]:
# Dump validation predictions to a gzipped CSV and log it as an mlflow artifact.
# NOTE(review): this runs outside the `with mlflow.start_run()` block that
# trained the model, so the artifact is presumably attached to a different
# run. Confirm that this is intended.
with tempfile.TemporaryDirectory() as tmpdirname:
    targets = []
    predictions = []
    question_ids = []
    # Eval mode makes the model output sigmoid probabilities.
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader_valid):
            output = model(batch["question"])
            targets.extend(batch["qtype"].float().cpu().numpy())
            # No squeeze(): keep one row per question even for a batch of size 1.
            predictions.extend(output.cpu().numpy())
            question_ids.extend(batch["qid"].cpu().numpy())
    # Store the question id next to each row so that predictions can be
    # mapped back to their questions.
    pd.DataFrame(
        {"question_id": question_ids, "prediction": predictions, "target": targets}
    ).to_csv(
        f"{tmpdirname}/predictions.csv.gz", index=False
    )
    mlflow.log_artifact(f"{tmpdirname}/predictions.csv.gz")

  0%|          | 0/470 [00:00<?, ?it/s]

In [23]:
# Persist the trained weights.
# NOTE(review): `model` is wrapped in DataParallel at this point, so the saved
# keys presumably carry a 'module.' prefix. Confirm that the loading side
# expects that.
torch.save(obj=model.state_dict(), f='bin/gru_model')

In [29]:
# Look at the first (target, prediction) pair of the last evaluation pass.
at, pt = targets[0], predictions[0]
print(at)
print(pt)

[0. 0. 0. 0. 0. 1. 0. 0.]
[1.96749647e-03 1.07740794e-04 7.28981046e-04 6.11157231e-02
 1.74934219e-03 4.88175005e-01 7.56054127e-04 7.72921994e-05]


In [30]:
# Average precision of that single example's scores against its labels.
average_precision_score(y_true=at, y_score=pt)

1.0