# Winograd Schema Challenge for Spanish 

TO DO
- Data prep:
    - create sentences
    - keep correct pronoun
    - separate train/test?
- LM
    - create LSTM or use BERT?
    - calculate prob per sentence

In [13]:
pip install --user torchtext

Collecting torchtext
  Downloading torchtext-0.11.0-cp39-cp39-manylinux1_x86_64.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 4.4 MB/s eta 0:00:01
Collecting torch==1.10.0
  Downloading torch-1.10.0-cp39-cp39-manylinux1_x86_64.whl (881.9 MB)
[K     |████████████████████████████████| 881.9 MB 590 bytes/s a 0:00:01    |█                               | 29.9 MB 14.7 MB/s eta 0:00:58
Installing collected packages: torch, torchtext
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.10.0+cu111 requires torch==1.9.0, but you have torch 1.10.0 which is incompatible.
torchaudio 0.9.0 requires torch==1.9.0, but you have torch 1.10.0 which is incompatible.[0m
Successfully installed torch-1.10.0 torchtext-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from torch.utils.data import DataLoader
from torchtext.legacy.data import Field, BucketIterator, TabularDataset

# torchtext.legacy.data.Field

import os
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
device = torch.device('cpu')

## Data preparation

### Preparing Wikicorpus sentences

In [8]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize
# sent_tokenize(s)

f = open('data/prova.txt', 'r')
l = f.readlines()
f.close()

endtokens = '.?!:'

g = open('sentences.csv', 'w')
s = ''
for line in l:
    if line[-2] in endtokens:
        s += line[:-1] + '\n'
        g.write(s)
        s = ''
    else:
        s+= line[:-1]
g.close()

Run processing_wiki_data.py to get "wiki_sentences.txt". It contains X sentences.

In [3]:
def get_data(wiki_path):
    whitespacer = lambda x: x.split(' ')
    
    WORDS = Field(tokenize    = whitespacer,
                lower       = True,
                batch_first = True,
                init_token='<start>',
                eos_token='<end>') 
    
    # read the csv file
    wikipedia = TabularDataset(path = wiki_path, # wiki_sent.csv
                            format = 'csv',
                            fields = [('sentence', WORDS)],
                            skip_header       = True,
                            csv_reader_params = {'quotechar':'Ö'}) 
    
    # build vocabularies based on what our csv files contained and create word2id mapping
    WORDS.build_vocab(wikipedia)

    # create batches from our data, and shuffle them for each epoch
    wikipedia_iter = BucketIterator(wikipedia,
                                  batch_size        = 8,
                                  sort_within_batch = True,
                                  sort_key          = lambda x: len(x.sentence),
                                  shuffle           = True,
                                  device            = device)

    return wikipedia_iter, WORDS

### Preparing WCS sentences

In [17]:
data_un = '/home/gusgraupa@GU.GU.SE/MLNLP2/LT2326-Project/data/Dataset_with_pn_undescores_det.txt'

In [18]:
def sentence_creation(filename):
    f = open(filename, 'r')
    l = f.readlines()
    
    grouped = []
    j = []
    for e in l:
        if e == '\n':
            grouped.append(j)
            j = []
        else:
            j.append(e[:-1])
            
    sentences = []
    
    for group in grouped:
        
        if group[1] == 'su' or group[1] == 'sus':
            sentence = group[0][:-1]
            words = sentence.split()
            for i, w in enumerate(words):
                if w == '_' + group[1] + '_': 
                    words.remove(w)
                    words_copy = words.copy()
                    words.insert(i+1, 'de ' + group[2])
                    words_copy.insert(i+1, 'de ' + group[3])
                    
                    s1 = ' '.join(words) + '.'
                    s2 = ' '.join(words_copy) + '.'
                    
                    g1 = (s1.lower(), group[2] == group[4])
                    g2 = (s2.lower(), group[3] == group[4])
        
        else:
            check = '_' + group[1] + '_'
            
            s1 = group[0].replace(check, group[2])
            s2 = group[0].replace(check, group[3])
            
            g1 = (s1.lower(), group[2] == group[4])
            g2 = (s2.lower(), group[3] == group[4])
        sentences.append((g1, g2))
        
    f.close()
    return sentences

In [19]:
sentences = sentence_creation(data_un)
sentences[0]

(('los concejales de la ciudad denegaron el permiso a los manifestantes porque los concejales de la ciudad temían la violencia.',
  True),
 ('los concejales de la ciudad denegaron el permiso a los manifestantes porque los manifestantes temían la violencia.',
  False))

## Language Model

In [5]:
epochs = 5
batch_size = 2
learning_rate = 0.001
embedding_size = 256
hidden_size = 128

In [6]:
class LSTM(nn.Module):
    def __init__(self, vocab_len, embedding_size, hidden_size):

        super(LSTM, self).__init__()
        
        self.vocab_len = vocab_len
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_len, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, hidden_size//2)
        self.fc2 = nn.Linear(hidden_size//2, vocab_len)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):

        embeds = self.embedding(x)
        out, hidden = self.lstm(embeds)
        fc1_out = self.fc1(out)
        fc2_out = self.fc2(fc1_out)
        output = self.softmax(fc2_out)

        return output

In [6]:
# training function

def train(path, epochs, batch_size, learning_rate, embedding_size, hidden_size, device):
    
    # loading the data
    dataset, vocab = get_data(path)
    
    model = LSTM(len(vocab.vocab), embedding_size, hidden_size)
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # start training loop
    model.train()
    total_loss = 0
    
    for epoch in range(epochs):
        
        for i, batch in enumerate(dataset):

            # the strucure for each BATCH is:
            # <start>, w0, ..., wn, <end>
            sentence = batch.sentence

            # we do not want to give <end> as input to the model
            input_sentence = sentence[:, :-1]

            # send to model
            output = model(input_sentence)

            # we select all but the first token from sentences
            target = sentence[:, 1:]
            
#             print(output.size()) 8, 11, 360000
#             print(target.size()) 8, 11
            
            loss = loss_fn(output.permute(0, 2, 1), target)
            total_loss += loss.item()

            # print average loss for the epoch
            print(total_loss/(i+1), end='\r') 

            # compute gradients
            loss.backward()

            # update parameters
            optimizer.step()

            # reset gradients
            optimizer.zero_grad()

        print()
    
    return model

In [11]:
# with mini_wiki_sent.csv

model_mini = train('mini_wiki_sent.csv', epochs, batch_size, learning_rate, embedding_size, hidden_size, device)

# saving the model
torch.save(model, 'mini_lstm_model.pt')

7.2722037694549565
13.598793445682526
19.459904922142038
24.917827505397796
30.003621889743805


In [10]:
# with medium_wiki_sent.csv
model_medium = train('medium_wiki_sent.csv', epochs, batch_size, learning_rate, embedding_size, hidden_size, device)

7.4633059020423895
14.158298510017396
20.546251615085602
26.662729534435277
32.530420682411295


In [11]:
# saving the model
torch.save(model_medium, 'lstm_model_medium.pt')

In [7]:
# with new_dataset
model_book = train('sentences.csv', epochs, batch_size, learning_rate, embedding_size, hidden_size, device)

6.6702164727208035

RuntimeError: CUDA out of memory. Tried to allocate 2.82 GiB (GPU 3; 10.92 GiB total capacity; 8.82 GiB already allocated; 458.38 MiB free; 8.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# saving the model
torch.save(model_book, 'lstm_model_book.pt')

In [7]:
model_medium = torch.load('lstm_model_medium.pt')

In [8]:
dataset, vocab = get_data('medium_wiki_sent.csv')

In [11]:
model_medium.to(device)

LSTM(
  (embedding): Embedding(122593, 256)
  (lstm): LSTM(256, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=122593, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [26]:
dataset, vocab = get_data('usable_data.csv')

In [16]:
dataset, vocab = get_data('sentences.csv')

In [17]:
len(vocab.vocab)

61781

In [25]:
len(vocab.vocab)

191273

In [23]:
len(vocab.vocab)

109829

In [34]:
t = torch.rand(10)
ts = F.softmax(t, dim=0)
ts

tensor([0.1136, 0.0879, 0.1016, 0.0928, 0.0709, 0.0689, 0.0643, 0.1664, 0.1408,
        0.0930])

In [35]:
max(ts)

tensor(0.1664)

In [37]:
t2 = torch.rand(1, 2, 10)
t2

tensor([[[0.8401, 0.4189, 0.7460, 0.2897, 0.0427, 0.0213, 0.3599, 0.6085,
          0.1460, 0.2513],
         [0.4996, 0.6992, 0.2462, 0.3649, 0.7476, 0.2809, 0.4466, 0.8648,
          0.9327, 0.2741]]])

In [38]:
ts2 = F.softmax(t2, dim=2)
ts2

tensor([[[0.1538, 0.1009, 0.1400, 0.0887, 0.0693, 0.0678, 0.0952, 0.1220,
          0.0768, 0.0854],
         [0.0936, 0.1143, 0.0726, 0.0818, 0.1199, 0.0752, 0.0888, 0.1348,
          0.1443, 0.0747]]])

In [43]:
a = torch.max(ts2, dim=2)
a.values

tensor([[0.1538, 0.1443]])

In [44]:
torch.sum(a.values)

tensor(0.2981)

In [45]:
def getting_prob(sentences, vocab, model):
    probs = []
    n = 0
    tot = 0
    
    for sent in sentences: # ((Sent1, True), (Sent2, False))
        tot += 1
        for pair in sent:
            if pair[1]:
                true_sent = pair[0]
            else:
                false_sent = pair[0]

        # tokenizing sent
        tok_true_sent = true_sent.split()
        tok_false_sent = false_sent.split()

        # encoding
        enc_true_sent = torch.tensor([vocab.vocab.stoi[x] for x in tok_true_sent], device=device)
        enc_false_sent = torch.tensor([vocab.vocab.stoi[x] for x in tok_false_sent], device=device)

        # model
#         out_true_sent = model(enc_true_sent[:len(enc_true_sent)-1].unsqueeze(0))
#         out_false_sent = model(enc_false_sent[:len(enc_false_sent)-1].unsqueeze(0)
        out_true_sent = model(enc_true_sent.unsqueeze(0))
        out_false_sent = model(enc_false_sent.unsqueeze(0))
#         print(out_true_sent.size())

        # get probabilities 
        true_prob = F.softmax(out_true_sent, dim=2)
        false_prob = F.softmax(out_false_sent, dim=2)

        # append
        true_max = torch.max(true_prob, dim=2)
        false_max = torch.max(false_prob, dim=2)
        
        true_sent_prob = torch.sum(true_max.values)
        false_sent_prob = torch.sum(false_max.values)
        
        probs.append((true_sent_prob, false_sent_prob))
        n += int(true_sent_prob > false_sent_prob)
        
#         probs.append((true_prob[0,0,:].sum(), false_prob[0,0,:].sum()))
#         n += int(true_prob[0,0,:].sum()>false_prob[0,0,:].sum())
    
    print('Higher prob of correct cases in:', n/tot)
    return probs

In [46]:
probs = getting_prob(sentences, vocab, model_medium)

Higher prob of correct cases in: 0.4111111111111111


In [47]:
probs

[(tensor(6.8502, grad_fn=<SumBackward0>),
  tensor(5.6651, grad_fn=<SumBackward0>)),
 (tensor(6.3412, grad_fn=<SumBackward0>),
  tensor(7.4655, grad_fn=<SumBackward0>)),
 (tensor(4.0114, grad_fn=<SumBackward0>),
  tensor(4.4789, grad_fn=<SumBackward0>)),
 (tensor(4.4789, grad_fn=<SumBackward0>),
  tensor(4.0114, grad_fn=<SumBackward0>)),
 (tensor(2.8121, grad_fn=<SumBackward0>),
  tensor(2.9396, grad_fn=<SumBackward0>)),
 (tensor(3.1061, grad_fn=<SumBackward0>),
  tensor(3.0802, grad_fn=<SumBackward0>)),
 (tensor(3.7711, grad_fn=<SumBackward0>),
  tensor(3.4839, grad_fn=<SumBackward0>)),
 (tensor(3.4839, grad_fn=<SumBackward0>),
  tensor(3.7711, grad_fn=<SumBackward0>)),
 (tensor(4.0724, grad_fn=<SumBackward0>),
  tensor(3.9941, grad_fn=<SumBackward0>)),
 (tensor(3.9941, grad_fn=<SumBackward0>),
  tensor(4.0724, grad_fn=<SumBackward0>)),
 (tensor(6.2339, grad_fn=<SumBackward0>),
  tensor(4.9421, grad_fn=<SumBackward0>)),
 (tensor(4.9421, grad_fn=<SumBackward0>),
  tensor(6.2339, grad_f