# Model 1: Attention

In [91]:
from importlib import reload  

import pandas as pd
import matplotlib.pyplot as plt
import pickle

import help_functions
import data_processor

## STRUCTURE
1. Add the different word senses to the texts.
2. Build a vocabulary.
3. Build the model with embedding and Attention.
5. Train on the data.

## 0. BASIC HELP FUNCTIONS

In [2]:
def save_data_to_file(data):
    print("Given data with head:")
    print(data.head())
    should_save = input("Do you wish to save it? (y/n): ")
    if should_save == "y":
        filename = input("Specify the filename to save to: ")
        data.to_csv(filename, index=False)
        print("Saved data!")

In [3]:
def load_data_from_file():
    should_load = input("Do you wish to load data from a file? (y/n): ")
    if should_load == "y":
        filename = input("Specify the filename to load from: ")
        data = pd.read_csv(filename)
        return data

## 1. Add the different word senses to the texts.

In [4]:
data = load_data_from_file()

In [5]:
filename = "/Users/lovhag/Projects/dl4nlp_assignment_1/a1_data/wsd_train.txt"
data = pd.read_table(filename,header=None,names=['sense_key', 'lemma', 'word_position', 'text'])
#data = data.iloc[0:10]
data.head()

Unnamed: 0,sense_key,lemma,word_position,text
0,keep%2:42:07::,keep.v,15,Action by the Committee In pursuance of its ma...
1,national%3:01:00::,national.a,25,A guard of honour stood in formation in honour...
2,build%2:31:03::,build.v,38,The principle that statistics should be timely...
3,place%1:04:00::,place.n,36,"Again , he appealed for additional support for..."
4,position%1:04:01::,position.n,76,"Also , the IAEA has the lowest number of women..."


In [6]:
processor = data_processor.DataProcessor(data.text.to_list(), data.lemma.to_list(), data.word_position.to_list(), data.sense_key.to_list())
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::
3,"Again , he appealed for additional support for...",place.n,36,place%1:04:00::
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::


In [7]:
processor.fix_period_spaces_and_word_index_in_data()
processor.fix_quotations_and_word_index_in_data()
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::


In [8]:
processor.lemmatize_text_in_data()
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key,lemmatized_text
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::,action by the Committee in pursuance of -PRON-...
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::,a guard of honour stand in formation in honour...
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::,the principle that statistic should be timely ...
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::,"again , -PRON- appeal for additional support f..."
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::,"also , the IAEA have the low number of woman p..."


In [9]:
def check_lemmas_matching(row, text_col, word_pos_col):
    text_splitted = row[text_col].split(' ')
    lemma_in_text = text_splitted[row[word_pos_col]]
    return lemma_in_text == row.lemma[:-2]

In [10]:
lemmas_matching_col = processor.get_data().apply(lambda x: check_lemmas_matching(x, 'lemmatized_text', 'word_pos'), axis=1)

In [11]:
lemmas_matching_col[lemmas_matching_col == False]

45       False
61       False
93       False
172      False
215      False
         ...  
75902    False
75903    False
75958    False
75967    False
76048    False
Length: 1979, dtype: bool

In [12]:
def look_at_data_example(data, show_index):
    row = data.iloc[show_index]
    print(f"lemma: {row.lemma}")
    print("")
    print(f"original: {row.text}")
    print("")
    print(f"lemmatized: {row.lemmatized_text}")
    print("")
    text_splitted = row.text.split(' ')
    lemmatized_text_splitted = row.lemmatized_text.split(' ')
    print(f"lemma in original: {text_splitted[row.word_pos]}")
    print(f"lemma in lemmatized: {lemmatized_text_splitted[row.word_pos]}")

In [13]:
look_at_data_example(processor.get_data(), 45)

lemma: time.n

original: To begin with , South-North dialogue encompasses a broad field , from political and diplomatic exchanges through trade , poverty eradication , investment , technology , industrialization , capacity-building and financing for development to the empowering of people . A recent cover story in Time magazine featured our esteemed Secretary-General . It called him a dreamer .

lemmatized: to begin with , south-north dialogue encompass a broad field , from political and diplomatic exchange through trade , poverty eradication , investment , technology , industrialization , capacity-building and financing for development to the empowering of people . a recent cover story in Time magazine feature -PRON- esteemed secretary-general . -PRON- call -PRON- a dreamer .

lemma in original: Time
lemma in lemmatized: Time


In [14]:
processor.sense_encode_text_in_data('lemmatized_text')
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key,lemmatized_text,sensed_lemma,sense_encoded_text
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::,action by the Committee in pursuance of -PRON-...,keep_1,"[action, by, the, Committee, in, pursuance, of..."
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::,a guard of honour stand in formation in honour...,national_1,"[a, guard, of, honour, stand, in, formation, i..."
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::,the principle that statistic should be timely ...,build_1,"[the, principle, that, statistic, should, be, ..."
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::,"again , -PRON- appeal for additional support f...",place_1,"[again, ,, -PRON-, appeal, for, additional, su..."
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::,"also , the IAEA have the low number of woman p...",position_1,"[also, ,, the, IAEA, have, the, low, number, o..."


In [15]:
data = processor.get_data()

Potentially check some of the less frequent lemmas.

In [18]:
data.head()

Unnamed: 0,text,lemma,word_pos,sense_key,lemmatized_text,sensed_lemma,sense_encoded_text
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::,action by the Committee in pursuance of -PRON-...,keep_1,"[action, by, the, Committee, in, pursuance, of..."
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::,a guard of honour stand in formation in honour...,national_1,"[a, guard, of, honour, stand, in, formation, i..."
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::,the principle that statistic should be timely ...,build_1,"[the, principle, that, statistic, should, be, ..."
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::,"again , -PRON- appeal for additional support f...",place_1,"[again, ,, -PRON-, appeal, for, additional, su..."
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::,"also , the IAEA have the low number of woman p...",position_1,"[also, ,, the, IAEA, have, the, low, number, o..."


In [19]:
data.iloc[0].sense_encoded_text

['action',
 'by',
 'the',
 'Committee',
 'in',
 'pursuance',
 'of',
 '-PRON-',
 'mandate',
 ',',
 'the',
 'Committee',
 'will',
 'continue',
 'to',
 'keep_1',
 'under',
 'review',
 'the',
 'situation',
 'relate',
 'to',
 'the',
 'question',
 'of',
 'Palestine',
 'and',
 'participate',
 'in',
 'relevant',
 'meeting',
 'of',
 'the',
 'General',
 'Assembly',
 'and',
 'the',
 'Security',
 'Council',
 '.',
 'the',
 'Committee',
 'will',
 'also',
 'continue',
 'to',
 'monitor',
 'the',
 'situation',
 'on',
 'the',
 'ground',
 'and',
 'draw',
 'the',
 'attention',
 'of',
 'the',
 'international',
 'community',
 'to',
 'urgent',
 'development',
 'in',
 'the',
 'occupied',
 'Palestinian',
 'Territory',
 ',',
 'include',
 'East',
 'Jerusalem',
 ',',
 'require',
 'international',
 'action',
 '.']

In [276]:
data = processor.get_data()
data[data.sensed_lemma=="force_4"].iloc[2].text

'Reiterating its full support for the efforts of the Secretary-General , the African Union and regional actors to find solutions to armed conflicts in the region , Reaffirming that any attempt at destabilization through violent means or seizing power by force is unacceptable , Reaffirming its resolutions 1325 ( 2000 ) and 1820 ( 2008 ) on women , peace and security , 1502 ( 2003 ) on the protection of humanitarian and United Nations personnel , and 1674 ( 2006 ) on the protection of civilians in armed conflict ,'

In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76049 entries, 0 to 76048
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   text                76049 non-null  object
 1   lemma               76049 non-null  object
 2   word_pos            76049 non-null  int64 
 3   sense_key           76049 non-null  object
 4   lemmatized_text     76049 non-null  object
 5   sensed_lemma        76049 non-null  object
 6   sense_encoded_text  76049 non-null  object
dtypes: int64(1), object(6)
memory usage: 4.1+ MB


In [21]:
save_data_to_file(processor.get_data())

Given data with head:
                                                text       lemma  word_pos  \
0  Action by the Committee In pursuance of its ma...      keep.v        15   
1  A guard of honour stood in formation in honour...  national.a        25   
2  The principle that statistics should be timely...     build.v        38   
3  Again , he appealed for additional support for...     place.n        37   
4  Also , the IAEA has the lowest number of women...  position.n        76   

            sense_key                                    lemmatized_text  \
0      keep%2:42:07::  action by the Committee in pursuance of -PRON-...   
1  national%3:01:00::  a guard of honour stand in formation in honour...   
2     build%2:31:03::  the principle that statistic should be timely ...   
3     place%1:04:00::  again , -PRON- appeal for additional support f...   
4  position%1:04:01::  also , the IAEA have the low number of woman p...   

  sensed_lemma                                 sense

## 2. Build the vocabulary

In [22]:
voc = help_functions.Vocabulary(min_word_freq=22, include_unknown=True, lower=True, character=False)
voc.build(processor.get_data().sense_encoded_text)
len(voc)

8420

In [23]:
voc.stoi['line_9']

8312

## 2.* Create data fitting to the model

In [76]:
def pad_sequence(sequence, max_sequence_len):
    return sequence+[voc.get_pad_idx()]*(max_sequence_len-len(sequence))

In [77]:
def create_training_data(data, max_sequence_len=None):
    def build_X_elem(X):
        X_elem = voc.encode([X])[0]
        if max_sequence_len:
            return pad_sequence(X_elem, max_sequence_len)
        return X_elem

    sense_dict = help_functions.build_sense_dict(data.lemma.to_list(), data.sense_key.to_list())
    X_data = []
    y_data = []
    for index, row in data.iterrows():
        X_data.append(build_X_elem(row.sense_encoded_text))
        y_data.append([1])

        # append faulty sense examples
        available_senses = list(sense_dict[row.lemma].keys())
        available_senses.remove(row.sense_key)
        for sense in available_senses:
            faulty_text = row.sense_encoded_text.copy()
            #print(faulty_text)
            faulty_text[row.word_pos] = row.lemma[:-2]+"_"+str(sense_dict[row.lemma][sense])
            X_data.append(build_X_elem(faulty_text.copy()))
            y_data.append([0])
    return X_data, y_data

In [189]:
import numpy as np
v = [0,1,2]
np.random.choice(len(v),1)[0]

1

In [190]:
def create_equal_training_data(data, max_sequence_len=None):
    def build_X_elem(X):
        X_elem = voc.encode([X])[0]
        if max_sequence_len:
            return pad_sequence(X_elem, max_sequence_len)
        return X_elem

    sense_dict = help_functions.build_sense_dict(data.lemma.to_list(), data.sense_key.to_list())
    X_data = []
    y_data = []
    for index, row in data.iterrows():
        X_data.append(build_X_elem(row.sense_encoded_text))
        y_data.append([1])

        # append one faulty sense example
        available_senses = list(sense_dict[row.lemma].keys())
        available_senses.remove(row.sense_key)
        faulty_sense = available_senses[np.random.choice(len(v), 1)[0]]
        faulty_text = row.sense_encoded_text.copy()
        faulty_text[row.word_pos] = row.lemma[:-2]+"_"+str(sense_dict[row.lemma][faulty_sense])
        X_data.append(build_X_elem(faulty_text.copy()))
        y_data.append([0])
    return X_data, y_data

In [73]:
def find_max_sequence_length(sequence_list):
    max_sequence_len = 0
    for sequence in sequence_list:
        if len(sequence) > max_sequence_len:
            max_sequence_len = len(sequence)
    return max_sequence_len

Max sequence length of data seems to be 283.

In [110]:
max_sequence_length = 283

In [74]:
find_max_sequence_length(X_data)

283

In [87]:
X_data, y_data = create_training_data(data, 283)

In [191]:
X_data_eq, y_data_eq = create_equal_training_data(data, 283)
print(f'Number of samples: {len(y_data_eq)}')
print(f'Sequence length per sample: {len(X_data_eq[0])}')

Number of samples: 152098
Sequence length per sample: 283


In [99]:
def save_data_with_pickle(data_dict):
    pre_filename = input("Specify which prefix filename you wish to save X_data and y_data to: ")
    if pre_filename:
        for key, value in data_dict.items():
            filename = pre_filename+"_"+key+".pickle"
            with open(filename, "wb") as fp:   #Pickling
                pickle.dump(value, fp)

In [100]:
save_data_with_pickle({"X_data": X_data, "y_data": y_data})

In [192]:
save_data_with_pickle({"X_data_eq": X_data_eq, "y_data_eq": y_data_eq})

Split into train and validation set.

In [97]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.33, random_state=42)

In [101]:
save_data_with_pickle({"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test})

In [193]:
X_train_eq, X_test_eq, y_train_eq, y_test_eq = train_test_split(X_data_eq, y_data_eq, test_size=0.33, random_state=42)
save_data_with_pickle({"X_train_eq": X_train_eq, "X_test_eq": X_test_eq, "y_train_eq": y_train_eq, "y_test_eq": y_test_eq})

In [109]:
save_data_with_pickle({"voc": voc})

## 3. Build the model

### Get the data

In [196]:
def load_pickle_data(filename):
    with open(filename, "rb") as load_file:
        return pickle.load(load_file)

In [201]:
X_train = load_pickle_data("saved_data/splitted_X_train_eq.pickle")
X_test = load_pickle_data("saved_data/splitted_X_test_eq.pickle")
y_train = load_pickle_data("saved_data/splitted_y_train_eq.pickle")
y_test = load_pickle_data("saved_data/splitted_y_test_eq.pickle")

In [231]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [130]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using', device)

Using cpu


In [202]:
print(f'Number of training samples: {len(y_train)}')
print(f'Number of test samples: {len(y_test)}')
print("")
print(f'Sequence length per sample: {len(X_train[0])}')

Number of training samples: 101905
Number of test samples: 50193

Sequence length per sample: 283


In [131]:
def batchify(x, y, batch_size):
    random_indices = torch.randperm(len(x))
    for i in range(0, len(x) - batch_size + 1, batch_size):
        indices = random_indices[i:i+batch_size]
        yield x[indices].to(device), y[indices].to(device)

In [132]:
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [229]:
class MyAttentionModel(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.decoder = nn.Linear(embedding_dim*max_seq_len, 1)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        #print(f"Input shape: {src.shape}")
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded shape: {src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        output = output.view(output.shape[0], -1)
        #print(f"Reshaped output shape: {output.shape}")
        output = self.decoder(output)
        #print(f"Decoder output shape: {output.shape}")
        #print(h)
        return F.log_softmax(output, dim=-1)

## Train the transformer!

In [233]:
def train(model, X_train, X_test, y_train, y_test, n_epochs=1, batch_size=100, lr=0.001, max_samples=None, weight_true=0.5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    min_ppl = float('inf')
    for t in range(n_epochs):
        model.train()
        loss_fun = F.binary_cross_entropy

        loss_sum = 0
        accuracy_sum = 0
        nbr_train_batches = 0
        for bx, by in batchify(X_train, y_train, batch_size):
            nbr_train_batches += 1
            optimizer.zero_grad()
            output = model.forward(bx)
            sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
            #print(sample_weight)
            loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
            loss_sum += loss.item()
            accuracy = (output.eq(by)).sum()
            accuracy_sum += accuracy

            if max_samples and updater.n >= max_samples:
                break
            loss.backward()
            optimizer.step()

        train_loss = loss_sum/(nbr_train_batches*batch_size)
        train_acc = torch.true_divide(accuracy_sum,(nbr_train_batches*batch_size))
        model.eval()
        with torch.no_grad():
            loss_sum = 0
            accuracy_sum = 0
            nbr_test_batches = 0
            for bx, by in batchify(X_test, y_test, batch_size):
                nbr_test_batches += 1
                output = model.forward(bx)
                sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
                loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
                loss_sum += loss.item()
                accuracy = (output.eq(by)).sum()
                accuracy_sum += accuracy
        test_loss = loss_sum/(nbr_test_batches*batch_size)
        test_acc = torch.true_divide(accuracy_sum,(nbr_test_batches*batch_size))

        print(f'epoch {t} | train loss {train_loss} | train acc {train_acc} | validation loss {test_loss} | validation acc {test_acc}')

    return model

In [208]:
def get_data_subset(sub_percentage, X_train, X_test, y_train, y_test):
    train_sub_size = int(sub_percentage*len(y_train))
    test_sub_size = int(sub_percentage*len(y_test))

    X_train_sub = X_train[:train_sub_size]
    X_test_sub = X_test[:test_sub_size]
    y_train_sub = y_train[:train_sub_size]
    y_test_sub = y_test[:test_sub_size]

    return X_train_sub, X_test_sub, y_train_sub, y_test_sub

In [272]:
sample_index = 2005
sample = [voc.itos[i] for i in data_subset[0][sample_index]]
sample_label = data_subset[2][sample_index]
print(f"label: {sample_label}")
print(sample)

label: [0]
['___BOS___', 'the', 'foreign', 'ministers', 'of', 'egypt', ',', 'ireland', ',', 'mexico', ',', 'new', 'zealand', ',', 'south', 'africa', ',', 'sweden', 'and', 'brazil', 'meet', 'at', 'the', 'fifty-eighth', 'session', 'of', 'the', 'united', 'nations', 'general', 'assembly', 'to', 'review', 'development', 'on', 'nuclear', 'disarmament', 'and', 'to', 'renew', '-pron-', 'commitment', 'to', 'achieve', 'a', 'world', 'free', 'from', 'nuclear', 'weapon', '.', 'the', 'ministers', 'pay', 'tribute', 'to', 'the', 'memory', 'of', '___UNKNOWN___', '___UNKNOWN___', ',', 'foreign', 'minister', 'of', 'sweden', ',', 'on', 'the', 'occasion', 'of', '-pron-', 'sad', 'pass', 'away', ',', 'and', 'deplore', 'the', 'loss', 'of', 'a', 'devoted', 'colleague', 'who', 'have', 'be', 'a', 'drive', 'force_4', 'in', 'the', 'common', 'cause', '.', 'the', 'ministers', 'express', '-pron-', 'deep', 'concern', 'at', 'the', 'lack', 'of', 'progress', 'to', 'date', 'in', 'the', 'implementation', 'of', 'the', '___U

In [209]:
data_subset = get_data_subset(0.01, X_train, X_test, y_train, y_test)

model = MyAttentionModel(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=2, dim_feedforward=32, num_layers=1, dropout=0.1).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=5, batch_size=32, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.7985761088709677 | train acc 0.48891130089759827 | validation loss 0.771484375 | validation acc 0.5062500238418579
epoch 1 | train loss 0.8001512096774194 | train acc 0.4879032373428345 | validation loss 0.7584635416666666 | validation acc 0.5145833492279053
epoch 2 | train loss 0.8033014112903226 | train acc 0.4858871102333069 | validation loss 0.751953125 | validation acc 0.518750011920929
epoch 3 | train loss 0.8048765120967742 | train acc 0.4848790466785431 | validation loss 0.7649739583333334 | validation acc 0.5104166865348816
epoch 4 | train loss 0.8001512096774194 | train acc 0.4879032373428345 | validation loss 0.771484375 | validation acc 0.5062500238418579


In [210]:
data_subset = get_data_subset(0.1, X_train, X_test, y_train, y_test)

model = MyAttentionModel(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=2, dim_feedforward=32, num_layers=1, dropout=0.1).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=5, batch_size=32, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.7820177378144654 | train acc 0.49950864911079407 | validation loss 0.7900140224358975 | validation acc 0.4943910241127014
epoch 1 | train loss 0.7827854756289309 | train acc 0.49901729822158813 | validation loss 0.7903270232371795 | validation acc 0.49419069290161133
epoch 2 | train loss 0.7821712853773585 | train acc 0.49941039085388184 | validation loss 0.7893880208333334 | validation acc 0.4947916567325592
epoch 3 | train loss 0.7827854756289309 | train acc 0.49901729822158813 | validation loss 0.7903270232371795 | validation acc 0.49419069290161133
epoch 4 | train loss 0.7826319280660378 | train acc 0.49911555647850037 | validation loss 0.7893880208333334 | validation acc 0.4947916567325592


In [211]:
data_subset = get_data_subset(0.01, X_train, X_test, y_train, y_test)

model = MyAttentionModel(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=2, dim_feedforward=32, num_layers=1, dropout=0.1).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=10, batch_size=32, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.809601814516129 | train acc 0.4818548262119293 | validation loss 0.76171875 | validation acc 0.512499988079071
epoch 1 | train loss 0.7922757056451613 | train acc 0.49294355511665344 | validation loss 0.76171875 | validation acc 0.512499988079071
epoch 2 | train loss 0.8033014112903226 | train acc 0.4858871102333069 | validation loss 0.7454427083333334 | validation acc 0.5229166746139526
epoch 3 | train loss 0.8033014112903226 | train acc 0.4858871102333069 | validation loss 0.7649739583333334 | validation acc 0.5104166865348816
epoch 4 | train loss 0.7970010080645161 | train acc 0.48991936445236206 | validation loss 0.7584635416666666 | validation acc 0.5145833492279053
epoch 5 | train loss 0.8001512096774194 | train acc 0.4879032373428345 | validation loss 0.76171875 | validation acc 0.512499988079071
epoch 6 | train loss 0.7985761088709677 | train acc 0.48891130089759827 | validation loss 0.7584635416666666 | validation acc 0.5145833492279053
epoch 7 | train l

In [278]:
data_subset = get_data_subset(1, X_train, X_test, y_train, y_test)

model = MyAttentionModel(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=4, dim_feedforward=32, num_layers=1, dropout=0).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=20, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.3897662138819096 | train acc 0.5010992288589478 | validation loss 0.39243114237882654 | validation acc 0.4976881444454193
epoch 1 | train loss 0.3897585461487123 | train acc 0.5011090636253357 | validation loss 0.392337721221301 | validation acc 0.49780771136283875
epoch 2 | train loss 0.389781549348304 | train acc 0.5010796189308167 | validation loss 0.39238443180006377 | validation acc 0.49774792790412903
epoch 3 | train loss 0.3897508784155151 | train acc 0.5011188983917236 | validation loss 0.3924155721859056 | validation acc 0.4977080821990967
epoch 4 | train loss 0.389781549348304 | train acc 0.5010796189308167 | validation loss 0.39238443180006377 | validation acc 0.49774792790412903
epoch 5 | train loss 0.3897585461487123 | train acc 0.5011090636253357 | validation loss 0.39236886160714285 | validation acc 0.4977678656578064
epoch 6 | train loss 0.3897738816151068 | train acc 0.5010894536972046 | validation loss 0.392337721221301 | validation acc 0.497807

In [1]:
data_subset = get_data_subset(1, X_train, X_test, y_train, y_test)

model = MyAttentionModel(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=4, dim_feedforward=32, num_layers=1, dropout=0).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=20, batch_size=64, lr=0.0001, max_samples=None, weight_true=0.5)

NameError: name 'get_data_subset' is not defined

In [238]:
model

MyAttentionModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=2, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (encoder): Embedding(8420, 16)
  (decoder): Linear(in_features=4528, out_features=1, bias=True)
)

In [239]:
nbr_params = 0
for parameter in model.parameters():
    nbr_params = nbr_params + len(parameter)
nbr_params

8650