                                                                                            gusgraupa

# LT2326 - Project

## Winograd Schema Challenge for Spanish: sentence probability

In [13]:
pip install --user torchtext

Collecting torchtext
  Downloading torchtext-0.11.0-cp39-cp39-manylinux1_x86_64.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 4.4 MB/s eta 0:00:01
Collecting torch==1.10.0
  Downloading torch-1.10.0-cp39-cp39-manylinux1_x86_64.whl (881.9 MB)
[K     |████████████████████████████████| 881.9 MB 590 bytes/s a 0:00:01    |█                               | 29.9 MB 14.7 MB/s eta 0:00:58
Installing collected packages: torch, torchtext
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.10.0+cu111 requires torch==1.9.0, but you have torch 1.10.0 which is incompatible.
torchaudio 0.9.0 requires torch==1.9.0, but you have torch 1.10.0 which is incompatible.[0m
Successfully installed torch-1.10.0 torchtext-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from torch.utils.data import DataLoader
from torchtext.legacy.data import Field, BucketIterator, TabularDataset

import os
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
device = torch.device('cpu')

## Data preparation

### Preparing Corpus sentences

The models have been trained with two different datasets: one that comes from the Wikipedia and one that comes from Spanish books in Project Guthenberg.
- For the first dataset, run processing_wiki_data.py to get "wiki_sent.csv".
    - Then, in the console, I did tail -100000 wiki_sent.csv > medium_wiki_sent.csv
- For the second dataset, run the following cell

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize
# sent_tokenize(s)

f = open('data/prova.txt', 'r')
l = f.readlines()
f.close()

endtokens = '.?!:'

g = open('sentences.csv', 'w')
s = ''
for line in l:
    if line[-2] in endtokens:
        s += line[:-1] + '\n'
        g.write(s)
        s = ''
    else:
        s+= line[:-1]
g.close()

In [3]:
def get_data(wiki_path):
    whitespacer = lambda x: x.split(' ')
    
    WORDS = Field(tokenize    = whitespacer,
                lower       = True,
                batch_first = True,
                init_token='<start>',
                eos_token='<end>') 
    
    # read the csv file
    wikipedia = TabularDataset(path = wiki_path, # wiki_sent.csv
                            format = 'csv',
                            fields = [('sentence', WORDS)],
                            skip_header       = True,
                            csv_reader_params = {'quotechar':'Ö'}) 
    
    # build vocabularies based on what our csv files contained and create word2id mapping
    WORDS.build_vocab(wikipedia)

    # create batches from our data, and shuffle them for each epoch
    wikipedia_iter = BucketIterator(wikipedia,
                                  batch_size        = 8,
                                  sort_within_batch = True,
                                  sort_key          = lambda x: len(x.sentence),
                                  shuffle           = True,
                                  device            = device)

    return wikipedia_iter, WORDS

### Preparing WCS sentences

In [4]:
data_un = '/home/gusgraupa@GU.GU.SE/MLNLP2/LT2326-Project/data/Dataset_with_pn_undescores_det.txt'

In [5]:
def sentence_creation(filename):
    f = open(filename, 'r')
    l = f.readlines()
    
    grouped = []
    j = []
    for e in l:
        if e == '\n':
            grouped.append(j)
            j = []
        else:
            j.append(e[:-1])
            
    sentences = []
    
    for group in grouped:
        
        if group[1] == 'su' or group[1] == 'sus':
            sentence = group[0][:-1]
            words = sentence.split()
            for i, w in enumerate(words):
                if w == '_' + group[1] + '_': 
                    words.remove(w)
                    words_copy = words.copy()
                    words.insert(i+1, 'de ' + group[2])
                    words_copy.insert(i+1, 'de ' + group[3])
                    
                    s1 = ' '.join(words) + '.'
                    s2 = ' '.join(words_copy) + '.'
                    
                    g1 = (s1.lower(), group[2] == group[4])
                    g2 = (s2.lower(), group[3] == group[4])
        
        else:
            check = '_' + group[1] + '_'
            
            s1 = group[0].replace(check, group[2])
            s2 = group[0].replace(check, group[3])
            
            g1 = (s1.lower(), group[2] == group[4])
            g2 = (s2.lower(), group[3] == group[4])
        sentences.append((g1, g2))
        
    f.close()
    return sentences

In [6]:
sentences = sentence_creation(data_un)
# Example of the pairs of sentences
sentences[0]

(('los concejales de la ciudad denegaron el permiso a los manifestantes porque los concejales de la ciudad temían la violencia.',
  True),
 ('los concejales de la ciudad denegaron el permiso a los manifestantes porque los manifestantes temían la violencia.',
  False))

## Language Model

In [7]:
epochs = 5
batch_size = 8
learning_rate = 0.001
embedding_size = 256
hidden_size = 128

In [8]:
class LSTM(nn.Module):
    def __init__(self, vocab_len, embedding_size, hidden_size):

        super(LSTM, self).__init__()
        
        self.vocab_len = vocab_len
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_len, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, hidden_size//2)
        self.fc2 = nn.Linear(hidden_size//2, vocab_len)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):

        embeds = self.embedding(x)
        out, hidden = self.lstm(embeds)
        fc1_out = self.fc1(out)
        fc2_out = self.fc2(fc1_out)
        output = self.softmax(fc2_out)

        return output

In [9]:
# training function

def train(path, epochs, batch_size, learning_rate, embedding_size, hidden_size, device):
    
    # loading the data
    dataset, vocab = get_data(path)
    
    model = LSTM(len(vocab.vocab), embedding_size, hidden_size)
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # start training loop
    model.train()
    total_loss = 0
    
    for epoch in range(epochs):
        
        for i, batch in enumerate(dataset):

            # the strucure for each BATCH is:
            # <start>, w0, ..., wn, <end>
            sentence = batch.sentence

            # we do not want to give <end> as input to the model
            input_sentence = sentence[:, :-1]

            # send to model
            output = model(input_sentence)

            # we select all but the first token from sentences
            target = sentence[:, 1:]
            
            # loss
            loss = loss_fn(output.permute(0, 2, 1), target)
            total_loss += loss.item()

            # print average loss for the epoch
            print(total_loss/(i+1), end='\r') 

            # compute gradients
            loss.backward()

            # update parameters
            optimizer.step()

            # reset gradients
            optimizer.zero_grad()

        print()
    
    return model

#### Model created with wikipedia dataset

In [10]:
# with medium_wiki_sent.csv
model_medium = train('medium_wiki_sent.csv', epochs, batch_size, learning_rate, embedding_size, hidden_size, device)

7.4633059020423895
14.158298510017396
20.546251615085602
26.662729534435277
32.530420682411295


In [11]:
# saving the model
torch.save(model_medium, 'lstm_model_medium.pt')

In [20]:
model_medium = torch.load('lstm_model_medium.pt').to(device)

#### Model created with book dataset

In [10]:
# with new_dataset
model_book = train('sentences.csv', epochs, batch_size, learning_rate, embedding_size, hidden_size, device)

5.4445716465479365
10.270369262541992
14.855445499952924
19.255806524329685
23.495890311415664


In [11]:
# saving the model
torch.save(model_book, 'lstm_model_book.pt')

## Calculating sentence probabilities

In [13]:
def getting_prob(sentences, vocab, model):
    probs = []
    n = 0
    tot = 0
    
    for sent in sentences: # ((Sent1, True), (Sent2, False))
        tot += 1
        for pair in sent:
            if pair[1]:
                true_sent = pair[0]
            else:
                false_sent = pair[0]

        # tokenizing sent
        tok_true_sent = true_sent.split()
        tok_false_sent = false_sent.split()

        # encoding
        enc_true_sent = torch.tensor([vocab.vocab.stoi[x] for x in tok_true_sent], device=device)
        enc_false_sent = torch.tensor([vocab.vocab.stoi[x] for x in tok_false_sent], device=device)

        # model
        out_true_sent = model(enc_true_sent.unsqueeze(0))
        out_false_sent = model(enc_false_sent.unsqueeze(0))

        # get probabilities 
        true_prob = F.softmax(out_true_sent, dim=2)
        false_prob = F.softmax(out_false_sent, dim=2)

        # append
        true_max = torch.max(true_prob, dim=2)
        false_max = torch.max(false_prob, dim=2)
        
        true_sent_prob = torch.prod(true_max.values) # sum?
        false_sent_prob = torch.prod(false_max.values)
        
        probs.append((true_sent_prob, false_sent_prob))
        n += int(true_sent_prob > false_sent_prob)
    
    print('Accuracy:', n/tot)
    return probs

#### Model created with wikipedia dataset

In [14]:
dataset_wiki, vocab_wiki = get_data('medium_wiki_sent.csv')

In [21]:
probs_wiki = getting_prob(sentences, vocab_wiki, model_medium)

Accuracy: 0.4074074074074074


#### Model created with book dataset

In [18]:
dataset_book, vocab_book = get_data('sentences.csv')

In [19]:
probs_book = getting_prob(sentences, vocab_book, model_book)

Accuracy: 0.44074074074074077
