<!-- # Final Project -->

In [None]:
import numpy as np
import torch
from encoder_decoder import Seq2Seq
from train import run_trial, evaluate
from helper import csv_to_datasets
from data_loader import Dataset, Vocabulary
from analysis import plot_boxplots, plot_filtered_validation_and_test_accuracy
import torch.nn as nn

In [16]:
%load_ext autoreload
%autoreload 2


## DATA LOADING

In [None]:
csv_to_datasets('data/aave.csv')
csv_to_datasets('data/standard.csv')
csv_to_datasets('data/mix.csv')

# LEAVE COMMENTED OUT
# csv_to_datasets('data/general.csv')

Train, dev, and test files created in: 'data'


In [43]:
from data_loader import Vocabulary, Dataset

standard_vocab = Vocabulary.from_paired_file('data/standard.train')
standard_train = Dataset.from_paired_file('data/standard.train', standard_vocab, sort_by_length=False)
standard_dev = Dataset.from_paired_file('data/standard.dev', standard_vocab)
standard_test = Dataset.from_paired_file('data/standard.test', standard_vocab)

In [44]:
aave_vocab = Vocabulary.from_paired_file('data/aave.train')
aave_train = Dataset.from_paired_file('data/aave.train', aave_vocab, sort_by_length=False)
aave_dev = Dataset.from_paired_file('data/aave.dev', aave_vocab)
aave_test = Dataset.from_paired_file('data/aave.test', aave_vocab)

In [45]:
mix_vocab = Vocabulary.from_paired_file('data/mix.train')
mix_train = Dataset.from_paired_file('data/mix.train', mix_vocab, sort_by_length=False)
mix_dev = Dataset.from_paired_file('data/mix.dev', mix_vocab)
mix_test = Dataset.from_paired_file('data/mix.test', mix_vocab)

## TRAINING

In [60]:
repeats = 1

In [61]:
# STANDARD A
embedding_size = 50
hidden_size = 50
attention_size = 50
encoder_output_size = 50
recurrent_type = 'gru'
dropout_prob = .25
bidirectional_encoder = True
use_attention = False
batch_size = 10
lr = .001
num_epochs = 4

standard_model_A = Seq2Seq(len(standard_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)

aave_model_A = Seq2Seq(len(aave_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


mix_model_A = Seq2Seq(len(mix_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


for i in range(repeats):
    run_trial(standard_model_A, standard_train, standard_dev, standard_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='standard_A.pt', dataname = 'standard', config = 'A')

for i in range(repeats):
    run_trial(aave_model_A, aave_train, aave_dev, aave_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='aave_A.pt', dataname = 'aave', config = 'A')

for i in range(repeats):
    run_trial(mix_model_A, mix_train, mix_dev, mix_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='mix_A.pt', dataname = 'mix', config = 'A')


Epoch 1
Training Complete. Loss: 5.47, Accuracy: 0.096
Epoch 1 Validation. Loss: 5.06, Accuracy: 0.099
Saving checkpoint...
Done. Time elapsed: 0.014 seconds
Epoch 2
Training Complete. Loss: 4.62, Accuracy: 0.162
Epoch 2 Validation. Loss: 4.64, Accuracy: 0.116
Saving checkpoint...
Done. Time elapsed: 0.006 seconds
Epoch 3
Training Complete. Loss: 4.09, Accuracy: 0.178
Epoch 3 Validation. Loss: 4.44, Accuracy: 0.105
Epoch 4
Training Complete. Loss: 3.69, Accuracy: 0.218
Epoch 4 Validation. Loss: 4.34, Accuracy: 0.105
The best validation accuracy of 0.116 occurred after epoch 2.
Test. Loss: 4.89, Accuracy: 0.118
['standard', 'A', 0.09944751381215469, 0.11602209944751381, 0.10497237569060773, 0.10497237569060773, 0.11602209944751381, 0.11822660098522167]
Epoch 1
Training Complete. Loss: 5.44, Accuracy: 0.099
Epoch 1 Validation. Loss: 4.90, Accuracy: 0.133
Saving checkpoint...
Done. Time elapsed: 0.009 seconds
Epoch 2
Training Complete. Loss: 4.62, Accuracy: 0.129
Epoch 2 Validation. Loss:

In [None]:
# STANDARD B
embedding_size = 100
hidden_size = 100
attention_size = 50
encoder_output_size = 100
recurrent_type = 'gru'
dropout_prob = .25
bidirectional_encoder = True
use_attention = True
batch_size = 5
lr = .01
num_epochs = 5

standard_model_B = Seq2Seq(len(standard_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)

aave_model_B = Seq2Seq(len(aave_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


mix_model_B = Seq2Seq(len(mix_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


for i in range(repeats):
    run_trial(standard_model_B, standard_train, standard_dev, standard_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='standard_B.pt', dataname = 'standard', config = 'B')

for i in range(repeats):
    run_trial(aave_model_B, aave_train, aave_dev, aave_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='aave_B.pt', dataname = 'aave', config = 'B')

for i in range(repeats):
    run_trial(mix_model_B, mix_train, mix_dev, mix_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='mix_B.pt', dataname = 'mix', config = 'B')


Epoch 1
Training Complete. Loss: 4.54, Accuracy: 0.230
Epoch 1 Validation. Loss: 3.15, Accuracy: 0.376
Saving checkpoint...
Done. Time elapsed: 0.011 seconds
Epoch 2
Training Complete. Loss: 1.51, Accuracy: 0.627
Epoch 2 Validation. Loss: 0.94, Accuracy: 0.762
Saving checkpoint...
Done. Time elapsed: 0.008 seconds
Epoch 3
Training Complete. Loss: 0.64, Accuracy: 0.820
Epoch 3 Validation. Loss: 0.88, Accuracy: 0.834
Saving checkpoint...
Done. Time elapsed: 0.008 seconds
Epoch 4
Training Complete. Loss: 0.32, Accuracy: 0.906
Epoch 4 Validation. Loss: 0.65, Accuracy: 0.862
Saving checkpoint...
Done. Time elapsed: 0.007 seconds
The best validation accuracy of 0.862 occurred after epoch 4.
Test. Loss: 0.38, Accuracy: 0.906
['standard', 'B', 0.3756906077348066, 0.7624309392265194, 0.8342541436464088, 0.861878453038674, 0.861878453038674, 0.9064039408866995]
Epoch 1
Training Complete. Loss: 0.27, Accuracy: 0.920
Epoch 1 Validation. Loss: 0.85, Accuracy: 0.867
Saving checkpoint...
Done. Time e

In [None]:
# STANDARD C
embedding_size = 20
hidden_size = 20
attention_size = 20
encoder_output_size = 20
recurrent_type = 'rnn'
dropout_prob = .25
bidirectional_encoder = False
use_attention = True
batch_size = 5
lr = .01
num_epochs = 5
standard_model_C = Seq2Seq(len(standard_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)

aave_model_C = Seq2Seq(len(aave_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


mix_model_C = Seq2Seq(len(mix_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


for i in range(repeats):
    run_trial(standard_model_C, standard_train, standard_dev, standard_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='standard_C.pt', dataname = 'standard', config = 'C')

for i in range(repeats):
    run_trial(aave_model_C, aave_train, aave_dev, aave_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='aave_C.pt', dataname = 'aave', config = 'C')

for i in range(repeats):
    run_trial(mix_model_C, mix_train, mix_dev, mix_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='mix_C.pt', dataname = 'mix', config = 'C')


Epoch 1
Training Complete. Loss: 5.73, Accuracy: 0.067
Epoch 1 Validation. Loss: 5.13, Accuracy: 0.094
Saving checkpoint...
Done. Time elapsed: 0.008 seconds
Epoch 2
Training Complete. Loss: 4.76, Accuracy: 0.172
Epoch 2 Validation. Loss: 4.35, Accuracy: 0.210
Saving checkpoint...
Done. Time elapsed: 0.003 seconds
Epoch 3
Training Complete. Loss: 4.10, Accuracy: 0.237
Epoch 3 Validation. Loss: 3.84, Accuracy: 0.243
Saving checkpoint...
Done. Time elapsed: 0.004 seconds
Epoch 4
Training Complete. Loss: 3.61, Accuracy: 0.275
Epoch 4 Validation. Loss: 3.46, Accuracy: 0.282
Saving checkpoint...
Done. Time elapsed: 0.003 seconds
The best validation accuracy of 0.282 occurred after epoch 4.
Test. Loss: 3.34, Accuracy: 0.281
['standard', 'C', 0.09392265193370165, 0.20994475138121546, 0.2430939226519337, 0.281767955801105, 0.281767955801105, 0.28078817733990147]
Epoch 1
Training Complete. Loss: 3.21, Accuracy: 0.313
Epoch 1 Validation. Loss: 3.17, Accuracy: 0.304
Saving checkpoint...
Done. Tim

In [None]:
# STANDARD D
embedding_size = 75
hidden_size = 75
attention_size = 70
encoder_output_size = 75
recurrent_type = 'rnn'
dropout_prob = .25
bidirectional_encoder = False
use_attention = True
batch_size = 5
lr = .01
num_epochs = 5

standard_model_D = Seq2Seq(len(standard_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)

aave_model_D = Seq2Seq(len(aave_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


mix_model_D = Seq2Seq(len(mix_vocab), embedding_size, hidden_size, encoder_output_size,
                      attention_size, recurrent_type=recurrent_type, 
                      dropout_prob=dropout_prob, bidirectional=bidirectional_encoder, use_attention=use_attention)


for i in range(repeats):
    run_trial(standard_model_D, standard_train, standard_dev, standard_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='standard_D.pt', dataname = 'standard', config = 'D')

for i in range(repeats):
    run_trial(aave_model_D, aave_train, aave_dev, aave_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='aave_D.pt', dataname = 'aave', config = 'D')

for i in range(repeats):
    run_trial(mix_model_D, mix_train, mix_dev, mix_test, vocab=standard_vocab, num_epochs=4, 
            batch_size = batch_size, report_frequency=1000, filename='mix_D.pt', dataname = 'mix', config = 'D')


Epoch 1
Training Complete. Loss: 5.11, Accuracy: 0.192
Epoch 1 Validation. Loss: 3.60, Accuracy: 0.309
Saving checkpoint...
Done. Time elapsed: 0.007 seconds
Epoch 2
Training Complete. Loss: 2.25, Accuracy: 0.512
Epoch 2 Validation. Loss: 1.29, Accuracy: 0.630
Saving checkpoint...
Done. Time elapsed: 0.006 seconds
Epoch 3
Training Complete. Loss: 1.11, Accuracy: 0.718
Epoch 3 Validation. Loss: 0.42, Accuracy: 0.867
Saving checkpoint...
Done. Time elapsed: 0.006 seconds
Epoch 4
Training Complete. Loss: 0.62, Accuracy: 0.824
Epoch 4 Validation. Loss: 0.09, Accuracy: 0.983
Saving checkpoint...
Done. Time elapsed: 0.006 seconds
The best validation accuracy of 0.983 occurred after epoch 4.
Test. Loss: 0.12, Accuracy: 0.970
['standard', 'D', 0.30939226519337015, 0.6298342541436464, 0.8674033149171271, 0.9834254143646409, 0.9834254143646409, 0.9704433497536946]
Epoch 1
Training Complete. Loss: 0.44, Accuracy: 0.879
Epoch 1 Validation. Loss: 0.08, Accuracy: 0.967
Saving checkpoint...
Done. Tim

<!-- #### Problem 21. 
 
How many epochs of training were needed to achieve the best validation accuracy? How well does your model perform on the test set?  In order to explore how the model is accomplishing the task of verb reinflection, you can try to use the heat maps of the attention mechanism.  A convenient way to do this is with the `test_tense_example` function in `analysis.py`. Given an input sentence and its target output, this function will print out the model's predicted output, along with a heat map of the attention weights.   -->

<!-- What patterns do you see in the attention weights? What can you tell from the attention weights about how the model is accomplishing the task of tense reinflection?  -->

<!-- As impressive as this performance is, the data that the model has been trained and evaluated on is limited in a significant respect. In sentences involving subjects modified by a prepositional phrase or relative clause, all of the nouns between the verb and the subject noun are identical in number. This means that while the model could succeed, as we would, by identifying the grammatical subject and using its grammatical number, it can also succeed simply by attending to the number of the closest noun. We will call the first of these strategies a *hierarchical* rule for agreement, since it depends on understanding something about the syntactic structure of the sentence. The second we will call a *linear* rule, since it depends on the linearly closest noun.  Our goal is to find out which of these two rules the seq2seq model is using.

To proceed, you should download a generalization data set from the following link:

* [generalization set](https://github.com/tommccoy1/rnn-hierarchical-biases/blob/master/data/tense.gen)

This dataset is similar to the test set on which the model was already evaluated at the conclusion of training, but includes sentences in which the verb is separated from the subject noun by a noun that differs in number.  You can load this dataset using the `from_paired_file` method in the `Dataset` class, as before. -->

In [50]:
dialect_vocab = Vocabulary.from_paired_file('data/general.gen')
dialect_gen = Dataset.from_paired_file('data/general.gen', dialect_vocab)

## Evaluating Performance of Standard, AAVE, Mix Model on General Dataset

<!-- #### Problem 22.

Evaluate your model on the generalization set using the `evaluate` function: -->

In [58]:
from train import evaluate

# STANDARD MODEL B EVALUATION
_ = evaluate(standard_model_B, dialect_gen, 
             nn.CrossEntropyLoss(ignore_index=dialect_vocab.get_index('[PAD]')),
             message = "Generalization set (standard model B)")

# AAVE MODEL B EVALUATION
_ = evaluate(aave_model_B, dialect_gen, 
             nn.CrossEntropyLoss(ignore_index=dialect_vocab.get_index('[PAD]')),
             message = "Generalization set (aave model B)")

# MIX MODEL EVALUATION
_ = evaluate(mix_model_B, dialect_gen, 
             nn.CrossEntropyLoss(ignore_index=dialect_vocab.get_index('[PAD]')),
             message = "Generalization set (mix model B)")

NameError: name 'standard_model_B' is not defined

<!-- The accuracy number given here represents the proportion of tokens in the output that are correct. This model does not however tell us whether the model is using a hierarchical or linear rule, since it does not focus on the accuracy of subject-verb agreement in particular. To determine this, we will need to look at accuracy of the prediction of verbs specifically. I have provided a function `test_verbs` in `analysis.py` that will compute the number of verbs that are correctly predicted and the number of sentences that are predicted entirely correctly. -->

In [57]:
import itertools

# Define the arrays
datasets = ['standard', 'aave', 'mix']
config = ['A', 'B', 'C', 'D']

# Loop through the Cartesian product
for dataset, cfg in itertools.product(datasets, config): 
    try:       
        plot_filtered_validation_and_test_accuracy(dataset=dataset, config=config)
    except:
        print("Dataset and Config combination does not exist.")

Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.
Dataset and Config combination does not exist.


<!-- This function also returns lists of (input, target) pairs for which the model got at least one verb correct (`correct` is the set of sentences that are perfect, and `incorrect` is the sentences in which there is at least one mistake). You can explore these examples using the `test_tense_example` function. -->


<!-- incorrect -->

<!-- correct -->