In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os, os.path

from collections import defaultdict

import torch
import torch.nn as nn
import transformers
import re

from transformers import (AutoTokenizer,
                          AutoModelForCausalLM)
from peft import (PeftModel,
                  PeftConfig)

import spacy
import random
import copy
import nltk
from nltk.tokenize import sent_tokenize
from shutil import copyfile

import nltk
# Fetch the 'punkt_tab' tokenizer data at notebook start-up.
# Presumably required by the sent_tokenize / nltk.word_tokenize calls made later in this notebook — TODO confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Mount Google Drive under /content/drive; the project paths configured in this notebook live below that mount point.
# The google.colab import means this cell only runs inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# put your file path here
# Project root on the mounted Drive; data_dir and save_dir below are derived from it by string concatenation,
# so it must end with a trailing slash.
path_to_nlu_dir = "/content/drive/MyDrive/NLU/Final Project/NLU_FinalProject/"

# Directory containing the JSONL-formatted datasets; dataset paths used in this notebook are relative to it.
data_dir = path_to_nlu_dir+"Data/JSONL_Formatted/"
#data_path = "RACE-H/RACE-H_v1_tst.jsonl"
#data_name = 'RACE-H_FineTuneV1_InfoAlterationSent_test' #InfoAlterationSyn_test'
#data_path = "SAT_ACT/SATACT_v3_tst.jsonl"
#data_name = 'SATACT_FineTuneV1_InfoAlterationSent_test'

# Output directory for result files (not referenced by the cells visible in this notebook section).
save_dir = path_to_nlu_dir+"Results/v2_Results/"

# Model selection: keep exactly one model_name assignment active.
# NOTE(review): model_name and BATCH_SIZE are not used by the visible cells — presumably leftovers from an evaluation notebook; confirm before removing.
model_name = "gpt2-xl" #baseline
#model_name = "Salm00n/gpt2-xl_RACE-H_v1" #v1 race-h fine-tuned
#model_name = "Salm00n/gpt2-xl_SATACT_v1" #v1 sat/act fine-tuned
BATCH_SIZE = 1

In [None]:
# Dataset files (relative to data_dir), listed in train / dev / test order.
# The order matters: preprocess() assigns the split tag by list position.
data_list1 = ["RACE-H/RACE-H_v1_trn.jsonl", "RACE-H/RACE-H_v1_dev.jsonl", "RACE-H/RACE-H_v1_tst.jsonl"]
data_list2 = ["SAT_ACT/SATACT_v3_trn.jsonl", "SAT_ACT/SATACT_v3_dev.jsonl", "SAT_ACT/SATACT_v3_tst.jsonl"]

In [None]:
def hltag(data, nlp=None):
    """Wrap article tokens that echo the question/choice in [[HL]] ... [[/HL]] markers.

    One output record is produced per question. For each question the key set
    is the lower-cased salient tokens of the question (minus the stop list U)
    plus the lower-cased salient tokens of the LAST usable choice (no stop-list
    filtering for choice tokens). Every article token whose POS tag is salient
    and whose lower-cased text is in the key set is wrapped in the markers.

    NOTE(review): only the last usable choice contributes to the highlight;
    earlier choices are ignored. Confirm this is intended.

    Args:
        data: list of records shaped ``[contexts, questions, ...]`` where
            ``contexts`` is a non-empty list of strings (joined with spaces)
            and ``questions`` is a list of dicts with a "question" string and
            a "choice" list.
        nlp: optional callable mapping a string to an iterable of tokens that
            expose ``.text`` and ``.tag_``. When omitted, spaCy's
            'en_core_web_sm' pipeline is loaded. Passing a pipeline avoids
            reloading the model on every call.

    Returns:
        A list of deep-copied records in which element 0 is a one-element list
        holding the highlighted, space-joined article and element 1 is a
        one-element list holding the question dict (the same dict object as in
        the input). Records with a missing/blank article and questions with a
        missing/blank question text are skipped. A question without any usable
        choice is highlighted from the question tokens alone.
    """
    # NOTE(review): 'sometimesd' looks like a typo for 'sometimes'; left untouched so existing outputs stay reproducible.
    U = set(['just', 'being', 'able', 'over', 'mainly', 'still', 'yet', 'seemed', 'whose', 'based', 'also', 'writer', 'had', 'should', 'to', 'sometimesd', 'has', 'might', 'then', 'very', 'ones', 'whether', 'not', 'during', 'now', 'realize', 'did', 'this', 't', 'each', 'where', 'because', 'doing', 'some', 'likely', 'are', 'further', 'really', 'even', 'what', 'said', 'for', 'lots', 'since', 'please', 'does', 'between', 'probably', 'ever', 'either', 'available', 'be', 'recently', 'however', 'here', 'although', 'by', 'both', 'about', 'anything', 'of', 'could', 'title', 'according', 's', 'or', 'among', 'already', 'suddenly', 'seems', 'simply', 'passage', 'from', 'would', 'whom', 'there', 'been', 'few', 'too', 'was', 'until', 'that', 'but', 'else', 'with', 'than', 'those', 'must', 'showed', 'these', 'will', 'while', 'can', 'were', 'following', 'and', 'do', 'almost', 'is', 'it', 'an', 'as', 'at', 'have', 'seem', 'if', 'again', 'author', 'rather', 'when', 'how', 'other', 'which', 'instead', 'several', 'though', 'may', 'who', 'most', 'such', 'why', 'recent', 'a', 'don', 'especially', 'maybe', 'perhaps', 'so', 'the', 'having', 'nearly'])
    # 18 fine-grained POS tags treated as content-bearing.
    salientPosList = {'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB',
                      'RBR', 'RBS', 'CD', 'FW'}

    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    output = []
    for item in data:
        if not item[0] or not isinstance(item[0], list):
            print(item)
            continue
        article_text = ' '.join(item[0])
        if not article_text.strip():
            continue
        article = nlp(article_text)

        for q in item[1]:
            question_text = q["question"]
            if not isinstance(question_text, str) or not question_text.strip():
                print(item)
                continue

            # Question keys do not depend on the choice, so compute them once.
            question_keys = set()
            for token in nlp(question_text):
                lowered = token.text.lower()
                if token.tag_ in salientPosList and lowered not in U:
                    question_keys.add(lowered)

            # Start from the question alone so a question whose choices are all
            # unusable never inherits the highlight of a previous question.
            key = question_keys
            for choice_text in q["choice"]:
                if not isinstance(choice_text, str) or not choice_text.strip():
                    print(item)
                    continue
                # Each usable choice replaces the previous key set, so the last one wins.
                key = set(question_keys)
                for token in nlp(choice_text):
                    if token.tag_ in salientPosList:
                        key.add(token.text.lower())

            # Highlight once, with the final key set.
            articleatt = []
            for token in article:
                if token.tag_ in salientPosList and token.text.lower() in key:
                    articleatt += ['[[HL]]', token.text, '[[/HL]]']
                else:
                    articleatt.append(token.text)

            d = copy.deepcopy(item)
            d[0] = [' '.join(articleatt)]
            d[1] = [q]
            output.append(d)
    return output

In [None]:
def preprocess(data_list):
    """Convert JSONL multiple-choice files into highlighted JSON files.

    Each entry of ``data_list`` is a path relative to ``data_dir``. The entry's
    position selects its split tag ("sft_trn", "sft_dev", "sft_tst"), so the
    list may hold at most three entries (a fourth raises IndexError).

    Every non-blank input line must be a JSON object with the keys "context",
    "question", "answerA".."answerD" and "correct"; "correct" is the letter
    suffix of the answer key holding the right answer. The rows are turned into
    ``[[context], [question_dict], split_tag]`` records, passed through
    ``hltag`` and written as indented JSON to ``<input path>_<split_tag>``.

    Args:
        data_list: sequence of dataset paths relative to ``data_dir``.

    Returns:
        None. Progress counts are printed; results are written to disk.
    """
    splits = ["sft_trn", "sft_dev", "sft_tst"]
    for n, rel_path in enumerate(data_list):
        print("preprocessing:", rel_path)
        file_path = data_dir + rel_path
        split_tag = splits[n]
        records = []

        # Explicit UTF-8 so non-ASCII text decodes the same on every platform.
        with open(file_path, "r", encoding="utf-8") as src:
            for line in src:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                row = json.loads(line)
                question = {
                    "question": row["question"],
                    "choice": [row["answerA"], row["answerB"], row["answerC"], row["answerD"]],
                    "answer": row[f"answer{row['correct']}"],
                }
                records.append([[row["context"]], [question], split_tag])

        # The input file is closed before the slow highlighting step starts.
        print(split_tag, "before highlighting:", len(records))
        highlighted = hltag(records)
        print(split_tag, "after highlighting:", len(highlighted))

        with open(file_path + '_' + split_tag, "w", encoding="utf-8") as dst:
            json.dump(highlighted, dst, indent=2)

In [None]:
# Build the highlighted RACE-H train/dev/test files; hltag prints every record it rejects, so this cell's output can be long.
preprocess(data_list1)

preprocessing: RACE-H/RACE-H_v1_trn.jsonl
sft_trn before highlighting: 62445
[['One hundred and thirteen million Americans have at least one bank-issued credit card. They give their owners automatic credit in stores, restaurants, and hotels, at home, across the country, and even abroad, and they make many banking services available as well. More and more of these credit cards can be read automatically, making it possible to withdraw or deposit money in scattered locations, whether or not the local branch bank is open. For many of us the "cashless society" is not on the horizon----it\'s already here.\nWhile computers offer these conveniences to consumers, they have many advantages for sellers too. Electronic cash registers can do much more than simply _ . They can keep a wide range of records, including who sold what, when, and to whom. This information allows businessmen to keep track of their list of goods by showing which items are being sold and how fast they are moving. Decisions t

In [None]:
# Build the highlighted SAT/ACT train/dev/test files; records with a missing (None) choice are printed by hltag.
preprocess(data_list2)

preprocessing: SAT_ACT/SATACT_v3_trn.jsonl
sft_trn before highlighting: 919
[['Many of William Shakespeare’s tragedies address broad themes that still appeal to today’s audiences. For instance, Romeo and Juliet, which is set in the Italy of Shakespeare’s time, tackles the themes of parents versus children and love versus hate, and the play continues to be read and produced widely around the world. But understanding Shakespeare’s so-called history plays can require a knowledge of several centuries of English history. Consequently,   _  '], [{'question': 'many theatergoers and readers today are likely to find Shakespeare’s history plays less engaging than the tragedies.', 'choice': ['some of Shakespeare’s tragedies are more relevant to today’s audiences than twentieth-century plays.', 'Romeo and Juliet is the most thematically accessible of all Shakespeare’s tragedies.', 'experts in English history tend to prefer Shakespeare’s history plays to his other works.', None], 'answer': 'some of

In [None]:
def read_contexts(split):
  """Return the "context" field of every record in a JSONL file.

  Args:
    split: path of the JSONL file, relative to ``data_dir``.

  Returns:
    A list with one context per non-blank line, in file order.

  Raises:
    KeyError: if a record has no "context" key.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  contexts = []
  # Explicit UTF-8 so non-ASCII passages decode the same on every platform.
  with open(data_dir + split, "r", encoding="utf-8") as f:
    for line in f:
      if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      contexts.append(json.loads(line)["context"])
  return contexts

In [None]:
def problem_gen(article, id):
    """Generate random cloze (fill-in-the-blank) questions from one article.

    Up to ``min(10, n_alpha_words // 30)`` attempts are made. Each attempt picks
    one to three random sentences, blanks out one or more random spans of
    alphabetic tokens and builds three distractor options from random word
    n-grams of the article. Attempts that cannot produce a valid question, that
    duplicate an earlier question text, or whose four options are not all
    distinct are dropped.

    Args:
        article: the passage text.
        id: identifier stored unchanged as the third element of the result.

    Returns:
        ``[[article_text], questions, id]`` where ``article_text`` is the
        sentence-tokenized article re-joined with single spaces and each
        question is a dict with "question" (text with ``_`` blanks), "choice"
        (four shuffled options) and "answer" (the correct option). With several
        blanks, an option lists its fillers separated by ", " in left-to-right
        blank order.
    """
    delimiter = '_[[#@]]_'

    def get_cloze(sentence, words):
        """Blank random spans in ``sentence``.

        Returns ``[text, answer, distractor1, distractor2, distractor3]`` or
        None when no question can be built.
        """
        parts = sentence.split(delimiter)
        tokens = []
        for i, sent in enumerate(parts):
            tokens += nltk.word_tokenize(sent)
            if i != len(parts) - 1:
                tokens.append(delimiter)

        if len(tokens) > 50:
            return None

        # One blank per ~6 tokens, at most 4; inputs of 6-7 tokens still get one.
        max_cloze = min((len(tokens) - 2) // 6, 4)
        if max_cloze <= 0 and len(tokens) >= 6:
            max_cloze = 1
        if max_cloze <= 0:
            return None
        n_cloze = random.randint(1, max_cloze)

        used = set()
        cloze = []
        for _ in range(n_cloze):
            # A single unused alphabetic token is always a valid length-1 span,
            # so the rejection loop below can only succeed if one exists.
            # Without this guard the loop never terminates for token lists
            # that have run out of alphabetic tokens.
            if not any(tok.isalpha() and j not in used for j, tok in enumerate(tokens)):
                break
            while True:
                cloze_len = random.randint(1, 4)
                left = random.randint(0, len(tokens) - cloze_len)
                span = range(left, left + cloze_len)
                if any(j in used for j in span):
                    continue
                if not all(tokens[j].isalpha() for j in span):
                    continue
                used.update(span)
                cloze.append([left, left + cloze_len])
                break

        if not cloze:
            return None

        # List the answers in the same left-to-right order as the blanks appear in the text.
        cloze.sort()
        ans = [' '.join(tokens[left:right]) for left, right in cloze]

        dis = []
        for a in ans:
            a_len = len(a.split())
            dislen = max(1, random.randint(a_len - 1, a_len + 1))

            # Three distinct n-grams other than the answer must exist,
            # otherwise the sampling loop below could never finish.
            pool = {' '.join(words[s:s + dislen]) for s in range(len(words) - dislen + 1)}
            pool.discard(a)
            if len(pool) < 3:
                return None

            dislis = []
            while len(dislis) < 3:
                start = random.randint(0, len(words) - dislen)
                candidate = ' '.join(words[start:start + dislen])
                if candidate != a and candidate not in dislis:
                    dislis.append(candidate)
            dis.append(dislis)

        # Replace each selected span by a single '_' placeholder.
        for left, right in cloze:
            for j in range(left, right):
                tokens[j] = ''
            tokens[left] = '_'

        ret = [' '.join(' '.join(tokens).split()), ', '.join(ans)]
        for i in range(3):
            # i-th distractor option: the i-th distractor of every blank, in blank order.
            ret.append(', '.join(dislis[i] for dislis in dis))

        return ret

    d = [[], [], id]
    article = article.replace(delimiter, '')
    sentences_raw = sent_tokenize(article)
    # Keep each sentence's original index so document order can be restored after shuffling.
    sentences = [[s, idx] for idx, s in enumerate(sentences_raw)]
    words = [x for x in nltk.word_tokenize(article) if x.isalpha()]

    n_problem = min(10, len(words) // 30)
    for _ in tqdm(range(n_problem), desc=f"Generating problems for {id}"):
        random.shuffle(sentences)
        selected = [s[0] for s in sentences[:random.randint(1, 3)]]
        question = get_cloze(delimiter.join(selected), words)
        if question is None:
            continue

        q = {"question": ' '.join(question[0].replace(delimiter, '').split()), "choice": question[1:]}
        if any(existing["question"] == q["question"] for existing in d[1]):
            continue

        # The answer and its three distractors must be four different strings.
        if len(set(q["choice"])) != 4:
            continue

        random.shuffle(q["choice"])
        q["answer"] = question[1]
        d[1].append(q)

    sentences.sort(key=lambda x: x[1])
    d[0].append(' '.join(s[0] for s in sentences))
    return d

In [None]:
cloze_data1 = ["RACE-H/RACE-H_v1_trn.jsonl"]
cloze_split = ["sftc_trn"]
#cloze_data1 = ["RACE-H/RACE-H_v1_dev.jsonl"]
#cloze_split = ["sftc_dev"]

# Half-open range [CHUNK_START, CHUNK_END) of contexts handled by this run.
# The same two values name the output file, so the slice and the filename cannot drift apart.
CHUNK_START, CHUNK_END = 53151, 55000
# Slices recorded from earlier runs, with the indices that were skipped:
#data = data[:357] #skip idx 357
#data = data[358:2284] #skip idx 2284
#data = data[2285:5000]
#data = data[20000:25000]
#data = data[30000:34217] #skip idx 34217
#data = data[34218:35000]
#data = data[50000:51215] #skip idx 51215
#data = data[51216:53148] #skip idx 53148-53150

for n in range(len(cloze_data1)):
  fn = cloze_split[n]
  dn = cloze_data1[n]
  output = []
  data = read_contexts(dn)
  data = data[CHUNK_START:CHUNK_END]

  # NOTE(review): ids are positions inside the slice, so they restart at "0" for every chunk — confirm nothing downstream needs ids to be unique across chunk files.
  for i, context in enumerate(data):
    output.append(problem_gen(context, str(i)))

  print(fn, "generated:", len(output), "total questions:", sum(len(item[1]) for item in output))
  output = hltag(output)

  print(fn, "after highlighting:", len(output))

  with open(data_dir + dn + '_' + fn + f'_{CHUNK_START}-{CHUNK_END}', "w") as f:
    json.dump(output, f, indent=2)

Generating problems for 0: 100%|██████████| 10/10 [00:00<00:00, 1812.42it/s]
Generating problems for 1: 100%|██████████| 10/10 [00:00<00:00, 2074.74it/s]
Generating problems for 2: 100%|██████████| 10/10 [00:00<00:00, 1195.23it/s]
Generating problems for 3: 100%|██████████| 10/10 [00:00<00:00, 1627.02it/s]
Generating problems for 4: 100%|██████████| 10/10 [00:00<00:00, 1129.35it/s]
Generating problems for 5: 100%|██████████| 10/10 [00:00<00:00, 1251.28it/s]
Generating problems for 6: 100%|██████████| 10/10 [00:00<00:00, 1767.44it/s]
Generating problems for 7: 100%|██████████| 10/10 [00:00<00:00, 1839.61it/s]
Generating problems for 8: 100%|██████████| 10/10 [00:00<00:00, 1342.69it/s]
Generating problems for 9: 100%|██████████| 10/10 [00:00<00:00, 1649.35it/s]
Generating problems for 10: 100%|██████████| 10/10 [00:00<00:00, 1330.81it/s]
Generating problems for 11: 100%|██████████| 9/9 [00:00<00:00, 1809.97it/s]
Generating problems for 12: 100%|██████████| 9/9 [00:00<00:00, 1581.03it/s]


sftc_trn generated: 1849 total questions: 10430
sftc_trn after highlighting: 10430


In [None]:
# Reuse the RACE-H cloze dev file as the cloze test file (the test split is a byte-for-byte copy of dev).
# NOTE(review): no visible cell writes a file named "sftc_RACE-H_v1_dev.jsonl" — presumably it was assembled or renamed outside this notebook; confirm it exists before running.
copyfile(data_dir + "RACE-H/sftc_RACE-H_v1_dev.jsonl", data_dir + "RACE-H/sftc_RACE-H_v1_tst.jsonl")

'/content/drive/MyDrive/NLU/Final Project/NLU_FinalProject/Data/JSONL_Formatted/RACE-H/sftc_RACE-H_v1_tst.jsonl'

In [None]:
# SAT/ACT cloze generation: for each split, read the passages, generate cloze questions,
# highlight them and write the result next to the source file as "<source path>_<split tag>".
# The two lists are paired by position.
cloze_data2 = ["SAT_ACT/SATACT_v3_trn.jsonl", "SAT_ACT/SATACT_v3_dev.jsonl"]
cloze_split = ["sftc_trn", "sftc_dev"]

for n in range(len(cloze_data2)):
  fn = cloze_split[n]
  dn = cloze_data2[n]
  output = []
  data = read_contexts(dn)

  # One record per passage; the id is the passage's position in the file.
  for i, context in enumerate(data):
    output.append(problem_gen(context, str(i)))

  print(fn, "generated:", len(output), "total questions:", sum(len(item[1]) for item in output))

  # hltag emits one record per question, so the length changes from passages to questions here.
  output = hltag(output)

  print(fn, "after highlighting:", len(output))

  with open(data_dir + dn + '_' + fn, "w") as f:
    json.dump(output, f, indent=2)

sftc_trn generated: 919 total questions: 1075
sftc_trn after highlighting: 1075
sftc_dev generated: 131 total questions: 161
sftc_dev after highlighting: 161


In [None]:
# Reuse the SAT/ACT cloze dev file as the cloze test file (the test split is a byte-for-byte copy of dev).
# NOTE(review): no visible cell writes a file named "sftc2_SATACT_v3_dev.jsonl" — presumably it was renamed outside this notebook; confirm it exists before running.
copyfile(data_dir + "SAT_ACT/sftc2_SATACT_v3_dev.jsonl", data_dir + "SAT_ACT/sftc2_SATACT_v3_tst.jsonl")

'/content/drive/MyDrive/NLU/Final Project/NLU_FinalProject/Data/JSONL_Formatted/SAT_ACT/sftc2_SATACT_v3_tst.jsonl'