# Notebook for replicating our results *in Google Colab*

## Install packages and data

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# transformers, torch and torchvision are pinned to exact versions;
# sentencepiece is installed without a version pin.
%pip install sentencepiece
%pip install transformers==4.4.2
%pip install torch==1.7.1
%pip install torchvision==0.8.2

In [None]:
!cd ~
!mkdir ./data
!mkdir ./data/cryptonite-official-split
!ls

In [None]:
import os
import sentencepiece
from transformers import AlbertTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration

if (not os.path.exists("./data/cryptonite-official-split/cryptonite-train.jsonl")):
    !wget https://github.com/aviaefrat/cryptonite/raw/main/data/cryptonite-official-split.zip
    !unzip -d ./data/cryptonite-official-split/ cryptonite-official-split.zip
if (not os.path.exists("revClues_aggregated")):
    !cd ~/data
    !wget https://raw.githubusercontent.com/oconnnors/cs224u_crossword/main/reversalClues/revClues_aggregated
    !cd ~
if (not os.path.exists("guardian_train.json")):
    !wget https://github.com/oconnnors/cs224u_crossword/raw/main/decrypt-main/data/clue_json/guardian/naive_random/test.json
    !wget https://github.com/oconnnors/cs224u_crossword/raw/main/decrypt-main/data/clue_json/guardian/naive_random/val.json
    !wget https://github.com/oconnnors/cs224u_crossword/raw/main/decrypt-main/data/clue_json/guardian/naive_random/train.json
    !mv train.json guardian_train.json
    !mv val.json guardian_val.json
    !mv test.json guardian_test.json

# Main.py

In [None]:
import torch
class crossDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label ids.

    Args:
        encodings: mapping from feature name to an indexable collection with one
            entry per example (lists or tensors).
        labels: indexable collection of label ids, one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of `value`.

        `torch.tensor()` emits a UserWarning when handed an existing tensor, so
        tensors are copied with `clone().detach()` instead; anything else is
        converted with `torch.tensor()` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at `idx` plus a 'labels' entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels collection."""
        return len(self.labels)


In [None]:
def data2seq2seq(filepath, cap=-1):
  """Read clue/answer pairs from a JSON-lines file.

  Every non-blank line is parsed as a JSON object; its 'clue' value becomes an
  input sequence and its 'answer' value the matching output sequence.

  Args:
    filepath: path of the .jsonl file to read.
    cap: -1 (default) reads the whole file. Otherwise reading stops as soon as
      more than `cap` records have been collected, so at most cap + 1 records
      are returned.
      NOTE(review): the extra record looks like an off-by-one; it is kept on
      purpose so the dataset sizes behind the reported results do not change.

  Returns:
    (input_sequences, output_sequences): two parallel lists.
  """
  import json  # local import: the notebook-level `import json` lives in a later cell

  input_sequences = []
  output_sequences = []
  with open(filepath, 'r') as json_file:
    # Stream line by line so a capped read does not load the whole file.
    for json_str in json_file:
      if cap != -1 and len(input_sequences) > cap:
        break
      if not json_str.strip():
        continue  # tolerate blank lines (e.g. a trailing newline)
      result = json.loads(json_str)
      input_sequences.append(result['clue'])
      output_sequences.append(result['answer'])
  return input_sequences, output_sequences

In [None]:
def guardian_data2seq2seq(filepath,cap=-1):
  """Read input/target pairs from a Guardian clue JSON file.

  The file is expected to hold one JSON array of objects that each carry an
  "input" and a "target" value.

  Args:
    filepath: path of the JSON file to read.
    cap: -1 (default) keeps every record. Otherwise iteration stops as soon as
      more than `cap` records have been collected, so at most cap + 1 records
      are returned.
      NOTE(review): the extra record looks like an off-by-one; it is kept on
      purpose so the dataset sizes behind the reported results do not change.

  Returns:
    (input_sequences, output_sequences): two parallel lists.
  """
  import json  # local import: the notebook-level `import json` lives in a later cell

  with open(filepath) as f:
    first_line = f.readline()
    try:
      # Original behaviour: the whole array sits on the first line.
      allData = json.loads(first_line)
    except json.JSONDecodeError:
      # The document spans several lines (e.g. pretty-printed): parse all of it.
      allData = json.loads(first_line + f.read())

  input_sequences = []
  output_sequences = []
  for curDatum in allData:
    if cap != -1 and len(input_sequences) > cap:
      break
    input_sequences.append(curDatum["input"])
    output_sequences.append(curDatum["target"])
  return input_sequences, output_sequences

In [None]:
def makeDataset(tokenizer, input_sequences, output_sequences):
  """Tokenize clue/answer pairs and wrap them in a crossDataset.

  Reads the notebook-level settings `task_prefix`, `max_source_length` and
  `max_target_length`.

  Args:
    tokenizer: tokenizer called on the source and target strings.
    input_sequences: list of source strings; `task_prefix` is prepended to each.
    output_sequences: list of target strings, parallel to `input_sequences`.

  Returns:
    A crossDataset holding the source encoding and the label ids.
  """
  # Sources: prefix every clue, pad to the longest one, return PyTorch tensors.
  prefixed_sources = [task_prefix + sequence for sequence in input_sequences]
  source_encoding = tokenizer(
    prefixed_sources,
    truncation=True,
    max_length=max_source_length,
    padding="longest",
    return_tensors="pt",
  )

  # Targets: tokenized to plain lists, then turned into one label tensor.
  target_ids = tokenizer(
    output_sequences,
    truncation=True,
    max_length=max_target_length,
    padding="longest",
  ).input_ids
  labels = torch.tensor(target_ids)

  # Padding positions become -100 so the loss ignores them.
  is_padding = labels == tokenizer.pad_token_id
  labels[is_padding] = -100

  return crossDataset(source_encoding, labels)

# Choose datasets and train model


Set `usingCryptonite=True` to use the Cryptonite dataset, or `usingBoth=True` to use the combined Cryptonite + Guardian dataset. Leave both `False` to use the Guardian dataset only. The two flags are mutually exclusive.

In [31]:
# Dataset selection switches; with both left False the Guardian data is used.
usingBoth = False
usingCryptonite = False
# At most one of the two switches may be enabled.
assert not (usingCryptonite and usingBoth)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Adafactor, Trainer, TrainingArguments
import torch
import time
import json
import pandas as pds


# Task-specific hyperparameters: token-length limits for sources and targets.
max_source_length = 512
max_target_length = 128

# Prefix that makeDataset prepends to every input sequence.
task_prefix = "solve cryptic crossword:"

# Pretrained t5-small tokenizer and model.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Adafactor instance; the Trainer cell currently does not pass it in.
adafactor_optimizer = Adafactor(model.parameters(), lr=None, relative_step=True, warmup_init=True)
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    output_dir = "./",
)

# Load the training (t_*) and validation (e_*) sequences for the chosen dataset.
if not (usingCryptonite or usingBoth):
  # Guardian only.
  t_input, t_output = guardian_data2seq2seq('guardian_train.json', cap=42000)
  e_input, e_output = guardian_data2seq2seq('guardian_val.json',cap=2800)
elif usingCryptonite:
  # Cryptonite only.
  t_input, t_output = data2seq2seq('data/cryptonite-official-split/cryptonite-train.jsonl', cap=40000)
  e_input, e_output = data2seq2seq('data/cryptonite-official-split/cryptonite-val.jsonl', cap=2600)
else:
  # Both: Cryptonite sequences first, Guardian sequences appended after them.
  t1_input, t1_output = data2seq2seq('data/cryptonite-official-split/cryptonite-train.jsonl', cap=40000)
  e1_input, e1_output = data2seq2seq('data/cryptonite-official-split/cryptonite-val.jsonl', cap=2600)
  t2_input, t2_output = guardian_data2seq2seq('guardian_train.json', cap=42000)
  e2_input, e2_output = guardian_data2seq2seq('guardian_val.json',cap=2800)

  t_input, t_output = t1_input + t2_input, t1_output + t2_output
  e_input, e_output = e1_input + e2_input, e1_output + e2_output

# Tokenize both splits into datasets.
train_dataset = makeDataset(tokenizer, t_input, t_output)
eval_dataset = makeDataset(tokenizer, e_input, e_output)

In [None]:
# Fine-tune the model. No optimizer or metric function is handed to the
# Trainer here, so it runs with its own defaults.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

In [None]:
# Run the trainer's evaluation on the validation dataset; the result is shown as the cell output.
trainer.evaluate(eval_dataset=eval_dataset)

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the remaining cells still run on a runtime without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [32]:
# Load the native test split(s) matching the dataset chosen for training.
if not (usingCryptonite or usingBoth):
  # Guardian only.
  test_inputs, test_outputs = guardian_data2seq2seq("guardian_test.json", cap=2800)
elif usingCryptonite:
  # Cryptonite only.
  test_inputs, test_outputs = data2seq2seq('data/cryptonite-official-split/cryptonite-test.jsonl', cap=2600)
else:
  # Both: Cryptonite examples first, Guardian examples appended after them.
  test_inputs1, test_outputs1 = data2seq2seq('data/cryptonite-official-split/cryptonite-test.jsonl', cap=2600)
  test_inputs2, test_outputs2 = guardian_data2seq2seq("guardian_test.json", cap=2800)
  test_inputs, test_outputs = test_inputs1 + test_inputs2, test_outputs1 + test_outputs2

## Testing
Set `useOurTest=True` to evaluate on the Reversal Clues dataset we aggregated. Otherwise, the native test split of the selected dataset is used.

In [36]:
# True: evaluate on the aggregated reversal clues read from file.
# False: evaluate on test_inputs / test_outputs from the native test split.
useOurTest = True

In [None]:
if useOurTest:
  # The download cell fetches "revClues_aggregated", while this cell used to open
  # "revClues_jankyAgg", which nothing in the notebook creates. Keep using the
  # old name when such a file is present, otherwise read the downloaded file.
  rev_clues_path = "revClues_jankyAgg"
  if not os.path.exists(rev_clues_path):
    rev_clues_path = "revClues_aggregated"

  # The `with` block closes the file; no explicit close() is needed.
  with open(rev_clues_path, 'r') as jankyFin:
    allLines = [curline.strip() for curline in jankyFin]

  # Lines alternate: clue (1st, 3rd, ...) then its answer (2nd, 4th, ...).
  our_test_clues = allLines[0::2]
  our_test_answers = allLines[1::2]
  if len(our_test_clues) != len(our_test_answers):
    # Odd number of lines: the last clue has no answer and would break scoring.
    print("warning: dropping trailing clue without an answer:", our_test_clues[-1])
    our_test_clues = our_test_clues[:len(our_test_answers)]

  # Sanity check against the expected first clue/answer pair.
  if (our_test_clues
      and our_test_clues[0] == "only a short reflection from jeremiah (4)"
      and our_test_answers[0] == "mere"):
    print("success")

In [38]:
# Choose the evaluation set: our reversal clues or the native test split.
if useOurTest:
  cur_test_inputs = our_test_clues
  cur_test_answers = our_test_answers
else:
  cur_test_inputs = test_inputs
  cur_test_answers = test_outputs

# inference
# NOTE(review): makeDataset trains on task_prefix + clue, but the clues here are
# tokenized without the prefix. Left unchanged so the reported numbers stay
# reproducible - confirm whether this is intended.
# The clues are model *inputs*, so they are truncated with max_source_length
# (the original used max_target_length here).
testEncoding = tokenizer(
    cur_test_inputs, padding="longest", max_length=max_source_length, truncation=True, return_tensors="pt"
)
# One single batch holding every test clue.
testInput = testEncoding.input_ids.to(device)
# Pass the attention mask explicitly so padded positions are masked without
# relying on generate() to infer the mask from the pad token id.
test_gen_outputs = model.generate(
    testInput, attention_mask=testEncoding.attention_mask.to(device)
)



In [None]:
# Score the generated answers by exact match after stripping whitespace.
correct_preds = []
incorrect_preds = []
for idx, generated_ids in enumerate(test_gen_outputs):
  prediction = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
  gold = cur_test_answers[idx]
  if prediction == gold.strip():
    correct_preds.append({"clue": cur_test_inputs[idx], "answer": gold})
    continue
  # Misses are only recorded for our own test set; "answer" holds the model's prediction.
  if useOurTest:
    incorrect_preds.append({"clue": cur_test_inputs[idx], "answer": prediction})

correct = len(correct_preds)
print("Correct predictions: ", correct)
print(correct/len(test_gen_outputs))
print(correct_preds)

## *Optional*
We can export the predictions the model got wrong for further analysis. Note that incorrect predictions are only collected when `useOurTest=True`.

In [None]:
# Write the collected wrong predictions to a JSON file and download it from Colab.
fout_name = "our-results-cng.json"
with open(fout_name, 'w') as fout:
  json.dump(incorrect_preds, fout)

from google.colab import files
files.download(fout_name)


Export the model to a pickle file. **Set the filename!**

In [None]:
import pickle
from google.colab import files

# Serialize the model object with pickle and download it from Colab.
# The file is written inside a `with` block so it is flushed and closed
# before the download starts (the original never closed the handle).
pickle_filename = 't5small-cryptonite-and-guardian.sav'
with open(pickle_filename, 'wb') as pickle_file:
  pickle.dump(model, pickle_file)

files.download(pickle_filename)