# Import Libraries

In [None]:
import os
import random
import re
from string import punctuation
import sys
import math
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from matplotlib import collections as mc
import logging
import argparse
from tqdm import tqdm
import io
from math import log,exp,sqrt

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s')
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

import numpy as np
from numpy import ndarray
import pandas as pd
import torch as T
from typing import List, Dict, Tuple, Type, Union
from torch import Tensor, device
from torch.autograd import Variable
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoConfig, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from statistics import mean

from nltk.corpus import stopwords
from nltk.stem.porter import *
import json
import nltk
nltk.download('stopwords')

from nltk import FreqDist
nltk.download('gutenberg')
from nltk.corpus import gutenberg

cos = T.nn.CosineSimilarity(dim=0)

# Set Environmental Variables

In [None]:
PROPS_OF_TEXT_TO_READ_FROM = [0.5] # proportion of the text passage to not blank out
NUM_OF_CHOICES = 4 # number of answer choices per fill-in
DISTRACTOR_POOL = 32 # number of candidate distractors collected/scored per blank

CONTEXTUAL_EMBEDDING_LAYERS = [4] # layers to sum contextual embeddings over
SIM_ANNEAL_EMB_WEIGHT = 8#4 # how much to weigh embedding distance vs. word likelihoods (increasing this value decreases plausibility while increasing diversity)
CORRECTNESS_SENT = 1 # how much to weight probability of tokens in sentence-context model
PLAUSIBILITY_DISTR_TO_ANS = 8#.25

MAX_QUESTIONS_BY_N = 16
INTERVAL_BETWEEN_WORDS = 7
EXTEND_SUBWORDS = True # attempt to grow short, infrequent (or unrecognized) distractor tokens by adding suffixes until they become words or until a limit is reached.
MAX_SUBWORDS = 3 # max number of suffixes to try adding before creating a valid word
DISTRACTORS_FROM_TEXT = False # when True, only accept distractors that already occur in the passage
EXTEND_BEAM_WIDTH = 5 # number of top tokens examined at each extension step
MIN_SENT_WORDS = 7 # sentence context is merged with neighbouring sentences until it has this many words

DEBUG_OUTPUT = True

# The original assignment had no right-hand side (a syntax error). None makes
# transformers fall back to its default cache location; set a directory path
# here to override it.
CACHE_DIR = None
MODEL_TYPE = "roberta-large" # Masked LM huggingface model to use

In [None]:
# Maintain reproducibility
# The stdlib RNG drives the annealing search and the answer position, so it
# has to be seeded as well as torch and numpy.
random.seed(0)
T.manual_seed(0)
np.random.seed(0)

# Set Up Quality-Check Tools for Distractors

In [None]:
# Frequencies are used to decide if a distractor candidate might be a subword
# The stemmer is used by candidates() to reject distractors that share a stem
# with the answer or with an already accepted distractor.
stemmer = PorterStemmer()
# Lower-cased word counts over the NLTK Gutenberg corpus.
# NOTE(review): `freq` is not referenced anywhere else in the visible code.
freq = FreqDist(i.lower() for i in gutenberg.words())
print(freq.most_common()[:5])

In [None]:
# Dictionaries used by is_word(); one entry per line, surrounding whitespace
# stripped. The files are opened with context managers so the handles are
# closed as soon as the data has been read.
with open('dict-unix.txt') as f:
    words_unix = set(line.strip() for line in f)
with open('dict-info.txt') as f:
    words_info = set(line.strip() for line in f)
words_small = words_unix.intersection(words_info)
words_large = words_unix.union(words_info)
# Words that must never be offered as distractors.
with open('profanity.json') as f:
    profanity = json.load(f)


# Load Model and Tokenizer

In [None]:
# Masked language model used for every probability and embedding computed
# below; it is moved to the GPU right after loading.
model = AutoModelForMaskedLM.from_pretrained(MODEL_TYPE, cache_dir=CACHE_DIR)
model.cuda()
# Tokenizer matching the model (loaded with add_prefix_space=True).
toker = AutoTokenizer.from_pretrained(MODEL_TYPE, add_prefix_space=True)
import stanza

# Stanza tokenization pipeline. candidates() runs it on the passage, but the
# sentence lists actually used are produced by the NLTK tokenizer below.
nlp = stanza.Pipeline(lang='en', processors='tokenize', model_dir='/data/ondovbd/stanza_resources')

# NLTK Punkt sentence splitter used by candidates() and score_positions().
nltk_sent_toker = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# NOTE(review): the model was already moved to the GPU right after loading;
# this repeated call looks redundant.
model.cuda()

In [None]:
def is_word(str):
    '''Return True when the string is accepted as a dictionary word.

    A single apostrophe is allowed when the part before it is itself a word and
    the part after it is one of t / nt / s / ll. Hyphenated strings are
    accepted when every hyphen-separated part is a word. Anything else is
    looked up (lower-cased) in the two word lists.'''
    lowered = str.lower()
    pieces = lowered.split("'")
    if len(pieces) > 2:
        # more than one apostrophe is never a word
        return False
    if len(pieces) == 2:
        head, clitic = pieces
        return is_word(head) and (clitic in ['t','nt','s','ll'])
    if '-' in str:
        return all(is_word(part) for part in str.split('-'))
    return lowered in words_unix or lowered in words_info

In [None]:
# (token, id) pairs ordered by id, so that position i in the masks below is
# meant to line up with vocabulary id i (assumes ids are contiguous from 0 --
# TODO confirm for the tokenizer in use).
sorted_toker_vocab_dict = sorted(toker.vocab.items(), key=lambda x:x[1])
# A token counts as a suffix when it does not start with the 'Ġ' marker and
# consists only of letters, digits and apostrophes.
suffix_mask = T.FloatTensor([1 if (('Ġ' != x[0][0]) and (re.match("^[A-Za-z0-9']*$", x[0]) is not None)) else 0 for x in sorted_toker_vocab_dict]) # 1 means is-suffix and 0 mean not-suffix
# Complement of suffix_mask: 1 for every token that is not a suffix.
suffix_mask_inv = suffix_mask * -1 + 1
# 1 only for non-suffix tokens whose text after the first character is a
# dictionary word (per is_word) and is not in the profanity list.
word_mask = suffix_mask_inv*T.FloatTensor([1 if is_word(x[0][1:]) and x[0][1:].lower() not in profanity else 0 for x in sorted_toker_vocab_dict])
# All three masks are multiplied with model outputs, so they live on the GPU.
suffix_mask=suffix_mask.cuda()
suffix_mask_inv=suffix_mask_inv.cuda()
word_mask = word_mask.cuda()


# Helper Functions

In [None]:
def get_emb(snt_toks, tgt_toks, layers=None):
    '''Contextual embedding of a (possibly multi-token) word placed at the mask.

    The mask token in the sentence is replaced by the target tokens, the model
    is run on the result, the requested hidden layers are summed, and the
    vectors at the target positions are averaged.

    snt_toks: the tokenized sentence, including the mask token
    tgt_toks: the tokens (subwords) to replace the mask token
    layers (optional): which hidden layers to sum (list of indices); defaults
        to CONTEXTUAL_EMBEDDING_LAYERS'''
    n_tgt = len(tgt_toks)
    start = snt_toks.index(toker.mask_token_id)
    ids = snt_toks.copy()

    # Slide the window forward 100 tokens at a time until the whole target
    # span fits inside the first 512 positions.
    while start + n_tgt - 1 >= 512:
        ids = ids[100:]
        start -= 100

    ids[start:start + 1] = tgt_toks
    ids = ids[:512]

    input_ids = T.tensor([ids]).cuda()
    attention = T.tensor([[1] * len(ids)]).cuda()
    with T.no_grad():
        result = model(input_ids, attention, output_hidden_states=True)

    if layers is None:
        layers = CONTEXTUAL_EMBEDDING_LAYERS
    summed = T.stack([result.hidden_states[layer] for layer in layers]).sum(0).squeeze()
    # Keep only the positions occupied by the target tokens
    return summed[start:start + n_tgt].mean(dim=0)

In [None]:
def extend(toks_sent, toks_para, suff_ids, n_masks, ctx_words):
    '''Find tokens to put in front of ``suff_ids`` so the result decodes to a word.

    toks_sent: token ids of the sentence-level context, containing the mask token
    toks_para: token ids of the full passage, containing the mask token
    suff_ids: token ids already chosen; they are placed right after the masks
    n_masks: number of mask slots available in front of ``suff_ids``
    ctx_words: strings accepted as valid even when is_word() rejects them

    Returns (ids, prob). ``ids`` is the list of token ids (plain ints) to
    prepend to ``suff_ids``, or [] when no valid word was found. ``prob`` is
    the score of the word-initial token that completed the word (0 when
    nothing was found).'''
    sm_sent = get_softmax_logits(toks_sent, n_masks, suff_ids)
    sm_para = get_softmax_logits(toks_para, n_masks, suff_ids)

    # Geometric mean of the two distributions at the last mask position,
    # i.e. the slot directly in front of suff_ids.
    sm = T.exp((sm_sent[-1].log()+sm_para[-1].log())/2)
    best_ids = []
    best_prob = 0

    # First try to finish the word right here with a word-initial token.
    topk_pfx = T.topk(sm*suffix_mask_inv, EXTEND_BEAM_WIDTH)
    for prob, tok in zip(topk_pfx.values, topk_pfx.indices):
        # plain int, matching what the suffix branch below returns
        tok = int(tok)
        dec = toker.decode([tok]+suff_ids).strip()
        if is_word(dec) or dec in ctx_words:
            best_ids = [tok]
            best_prob = prob
            break

    # With masks to spare, also try inserting another suffix token and
    # recursing; keep whichever completion scores highest.
    if n_masks > 1:
        topk_sfx = T.topk(sm*suffix_mask, EXTEND_BEAM_WIDTH)

        for tok in topk_sfx.indices:
            tok = int(tok)
            rec_suff_ids = [tok] + suff_ids
            ids_sfx, prob_sfx = extend(toks_sent, toks_para, rec_suff_ids, n_masks-1, ctx_words)

            if prob_sfx > best_prob:
                best_ids = ids_sfx + [tok]
                best_prob = prob_sfx

    return best_ids, best_prob

In [None]:
def energy(ctx, scaled_dists, scaled_sims, choices, words, ans):
    '''Score a set of chosen distractor indices.

    ctx: per-candidate context score, indexable by candidate index
    scaled_dists: pairwise distances keyed by 'larger-smaller' index strings;
        index len(ctx) is used for the answer
    scaled_sims: per-candidate similarity to the answer
    choices: list of chosen candidate indices
    words, ans: not used in the computation

    Returns (e_emb, e_ctx, e_sim): the harmonic mean of the pairwise distances
    among the choices plus the answer, the sum of the context scores of the
    choices, and the harmonic mean of their similarities to the answer.'''
    inv_sim_total = 0
    ctx_total = 0
    for pick in choices:
        inv_sim_total += 1./scaled_sims[pick]
        ctx_total += ctx[pick]
    sim_score = float(len(choices))/inv_sim_total

    # The answer takes part in the pairwise distances under index len(ctx)
    members = choices + [len(ctx)]
    inv_dist_total = 0
    n_pairs = 0
    for pos, first in enumerate(members):
        for second in members[:pos]:
            key = '%s-%s'%(max(first, second), min(first, second))
            inv_dist_total += 1./scaled_dists[key]
            n_pairs += 1
    emb_score = float(n_pairs)/inv_dist_total

    return float(emb_score), ctx_total, float(sim_score)

In [None]:
def anneal(probs_sent_context, probs_para_context, embs, emb_ans, words, k, ans):
    '''Find k distractor indices that are optimally high probability and distant
    in embedding space, using simulated annealing.

    probs_sent_context: per-candidate probability under the sentence context
    probs_para_context: per-candidate probability under the passage context
    embs: per-candidate embedding tensors
    emb_ans: embedding tensor of the correct answer
    words: candidate strings (only used for debug output)
    k: number of indices to select
    ans: the correct answer (only used for debug output)

    Returns a list of indices into the candidate lists. When there are k or
    fewer candidates there is nothing to search, so every available index is
    returned (possibly fewer than k).

    The search maximises e = e_ctx + BETA * e_emb as computed by energy().
    ALPHA and BETA are module-level weights that are not defined in this part
    of the file.'''
    m = len(probs_sent_context)
    if m <= k:
        # The mutation step below needs a candidate outside the current
        # selection: with m == k it would spin forever, and with m < k the
        # initial selection would index past the pool.
        return list(range(m))

    its = 1000
    choices = list(range(k))

    # Pairwise cosine "distances" between all candidates and the answer; the
    # answer is stored under index len(embs).
    dists = {}
    embsa = embs + [emb_ans]
    for i in range(len(embsa)):
        for j in range(i):
            dists['%s-%s'%(i,j)] = 1-cos(embsa[i], embsa[j]) # cosine "distance"

    # Min-max scale the distances to [0, 1]
    dist_min = T.min(T.tensor(list(dists.values())))
    dist_max = T.max(T.tensor(list(dists.values())))
    for key, dist in dists.items():
        dists[key] = (dist - dist_min)/(dist_max-dist_min)

    # Similarity of every candidate to the answer, min-max scaled. energy()
    # returns a score based on it, but that score is not part of e below.
    sims = T.tensor([cos(emb_ans, emb) for emb in embs])
    scaled_sims = (sims - T.min(sims))/(T.max(sims)-T.min(sims))

    # Context score: sentence log-probability minus ALPHA times the passage
    # log-probability, min-max scaled.
    ctx = T.tensor(probs_sent_context).log()-ALPHA*T.tensor(probs_para_context).log()
    ctx = (ctx-T.min(ctx))/(T.max(ctx)-T.min(ctx))

    e_emb, e_ctx, e_sim = energy(ctx, dists, scaled_sims, choices, words, ans)
    e = e_ctx + BETA * e_emb
    for i in range(its):
        t = 1.-(i)/its # temperature falls linearly from 1 to 1/its
        mut_idx = random.randrange(k) # which choice to mutate
        orig = choices[mut_idx]
        new = orig
        while (new in choices): # mutate choice until not in current list
            new = random.randrange(m)
        choices[mut_idx] = new
        e_emb, e_ctx, e_sim = energy(ctx, dists, scaled_sims, choices, words, ans)
        e_new = e_ctx + BETA * e_emb
        delta = e_new - e
        exponent = delta/t
        if exponent < -50:
            exponent = -50 # avoid underflow
        # Improvements are always accepted; a worse state is accepted with
        # probability exp(delta/t).
        if delta > 0 or math.exp(exponent) > random.random():
            e = e_new # accept new state
        else:
            choices[mut_idx] = orig
    if DEBUG_OUTPUT:
        print([words[j] for j in choices] + [ans], "e: %f"%(e))
    return choices

# Distractor Generator Method

In [None]:
def get_softmax_logits(toks, n_masks = 1, sub_ids = None):
    '''Vocabulary distributions predicted for the masked position(s).

    toks: token ids containing the mask token; the first mask token is used
    n_masks: number of mask tokens to put in place of the original mask
    sub_ids (optional): token ids placed immediately after the masks

    Returns the softmax over the vocabulary for each of the ``n_masks`` mask
    positions (one row per mask).'''
    sub_ids = [] if sub_ids is None else sub_ids
    msk_idx = toks.index(toker.mask_token_id)
    toks = toks.copy()
    toks[msk_idx:msk_idx+1] = [toker.mask_token_id] * n_masks + sub_ids

    # Only the first 512 tokens are fed to the model. Slide the window forward
    # 100 tokens at a time until every mask and every following sub_id fits
    # inside it (the same rule get_emb applies to its target span); checking
    # msk_idx alone would let the truncation cut off later masks.
    span = n_masks + len(sub_ids)
    while msk_idx + span - 1 >= 512:
        # Shift text by 100 tokens
        toks = toks[100:]
        msk_idx -= 100
    toks = toks[:512]
    # Predict the tokens at the mask position(s)
    with T.no_grad():
        output = model(T.tensor([toks]).cuda(), T.tensor([[1]*len(toks)]).cuda())
    sm = T.softmax(output.logits[0, msk_idx:msk_idx+n_masks, :], dim=1)
    return sm

In [None]:
# Small constant added to masked probabilities before taking logs, so that
# entries zeroed by word_mask do not turn into log(0).
e=1e-10

def candidates(text, answer):
    '''Create list of unique distractors that does not include the actual answer.

    text: the passage, containing exactly one '<mask>' placeholder (the first
        sentence containing it is used)
    answer: the word that was blanked out

    Vocabulary tokens are ranked by
    log(p_sentence) - ALPHA * log(p_passage) (restricted to word_mask) and
    visited in that order. Each token is optionally extended to a longer word,
    then run through a series of filters; survivors are collected until
    DISTRACTOR_POOL candidates have been found. ALPHA is a module-level weight
    that is not defined in this part of the file.

    Returns (sent_probs, para_probs, embs, emb_ans, distractors): for every
    accepted candidate its probability under the sentence context and under
    the passage context and its contextual embedding, then the embedding of
    the answer, then the candidate strings. The lists are index-aligned.'''
    if DEBUG_OUTPUT:
        print(text)
    
    # Get only sentence with blanked text to tokenize
    # NOTE(review): the result of nlp(text) is not used below; sentences come
    # from the NLTK tokenizer on the next line.
    doc = nlp(text)
    #sents = [sentence.text for sentence in doc.sentences]
    sents = nltk_sent_toker.tokenize(text)
    msk_snt_idx = [i for i in range(len(sents)) if '<mask>' in sents[i]][0]
    just_masked_sentence = sents[msk_snt_idx]
    
    prv_snts = sents[:msk_snt_idx]
    nxt_snts = sents[msk_snt_idx+1:]
    
    # A short masked sentence first absorbs the preceding sentence ...
    if len(just_masked_sentence.split(' ')) < MIN_SENT_WORDS and len(prv_snts):
        just_masked_sentence = ' '.join([prv_snts.pop(), just_masked_sentence])
    
    # ... and then keeps absorbing a randomly chosen neighbour (previous or
    # next) until it has at least MIN_SENT_WORDS space-separated words.
    while len(just_masked_sentence.split(' ')) < MIN_SENT_WORDS and (len(prv_snts) or len(nxt_snts)):
        if T.rand(1) < 0.5 and len(prv_snts):
            just_masked_sentence = ' '.join([prv_snts.pop(), just_masked_sentence])
        elif len(nxt_snts):
            just_masked_sentence = ' '.join([just_masked_sentence, nxt_snts.pop(0)])
    
    # Wider context (about three times the sentence context, in words). It is
    # only printed in debug mode and not used in any computation below.
    ctx = just_masked_sentence
    while len(ctx.split(' ')) < 3 * len(just_masked_sentence.split(' ')) and (len(prv_snts) or len(nxt_snts)):
        if len(prv_snts):
            ctx = ' '.join([prv_snts.pop(), ctx])
        if len(nxt_snts):
            ctx = ' '.join([ctx, nxt_snts.pop(0)])
    
#    just_masked_sentence = ' '.join([just_masked_sentence.replace('<mask>', 'banana'),
#                                     just_masked_sentence.replace('<mask>', 'banana'),
##                                     just_masked_sentence,
  #                                   just_masked_sentence.replace('<mask>', 'banana'),
   #                                  just_masked_sentence.replace('<mask>', 'banana')])
    #just_masked_sentence = ' '.join([just_masked_sentence, just_masked_sentence, just_masked_sentence, just_masked_sentence, just_masked_sentence])
    
    # Repeat the sentence context until it is at least as long (in characters)
    # as the whole passage.
    tiled = just_masked_sentence
    while len(tiled) < len(text):
        tiled += ' ' + just_masked_sentence
    just_masked_sentence = tiled
    
    if DEBUG_OUTPUT:
        print(ctx)
        print(just_masked_sentence)
    toks_para = toker.encode(text)
    toks_sent = toker.encode(just_masked_sentence)
    # Get softmaxed logits from sentence alone and full-text
#    sent_sm, sent_pos, sent_ids = get_span_logits(just_masked_sentence, answer)
#    para_sm, para_pos, para_ids = get_span_logits(text, answer)
    
    sent_sms_all = []
    para_sms_all = []
    para_sms_right = []
    
    # For 1..MAX_SUBWORDS masks, store the distributions of both contexts.
    # para_sms_right[i] is the geometric mean of the two contexts at the last
    # of the i+1 masks: restricted to non-suffix tokens when there is a single
    # mask and to suffix tokens otherwise.
    for i in range(MAX_SUBWORDS):
        para_sms = get_softmax_logits(toks_para, i + 1)
        para_sms_all.append(para_sms)
        sent_sms = get_softmax_logits(toks_sent, i + 1)
        sent_sms_all.append(sent_sms)
        para_sms_right.append(T.exp((sent_sms[i].log()+para_sms[i].log())/2) * (suffix_mask_inv if i == 0 else suffix_mask))
    
    # Create 2 lists: (1) notes highest probability for each token across n-mask lists if token is suffix and (2) notes number of mask terms to add
    para_sm_best, para_pos_best = T.max(T.vstack(para_sms_right), 0)
    
    distractors = []
    stems = []
    embs = []
    sent_probs = []
    para_probs = []
    
    ans_stem = stemmer.stem(answer.lower())
    
    # Embed the answer in the passage; [1:-1] drops the first and last ids
    # returned by the tokenizer (presumably the special tokens -- TODO confirm).
    emb_ans = get_emb(toks_para, toker(answer)['input_ids'][1:-1])
    para_words = text.lower().split(' ')
    blank_word_idx = [idx for idx, word in enumerate(para_words) if '<mask>' in word][0] # Need to remove punctuation
    # Words next to the blank; sentinel strings are used at the passage edges
    if (blank_word_idx - 1) < 0:
        prev_word = 'beforeanytext'
    else:
        prev_word = para_words[blank_word_idx-1]
    if (blank_word_idx + 1) >= len(para_words):
        next_word = 'afteralltext'
    else:
        next_word = para_words[blank_word_idx+1]
    
    # Need to check if the token is outside of the tokenizer based on predictions being made at all
    if len(para_sms_all[0]) > 0:
        # Rank the whole vocabulary by the sentence-vs-passage contrast score
        top_ctx = T.topk((sent_sms_all[0][0]*word_mask+e).log() - ALPHA * (para_sms_all[0][0]*word_mask+e).log(), len(para_sms_all[0][0]), dim=0)
        para_top_ids = top_ctx.indices.tolist()
        para_top_probs = top_ctx.values.tolist()
        
        for i, id in enumerate(para_top_ids):
            
            sub_ids = [int(id)] # cumulative list of subword token ids
            dec = toker.decode(sub_ids).strip()
            if DEBUG_OUTPUT:
                print('Trying:', dec)
            #print(para_pos[id])
            #if para_pos_best[id] > 0:
            #    continue
            
            # All-caps status must match the answer
            if dec.isupper() != answer.isupper():
                continue
            
            # The token scored best as a suffix behind one or more masks:
            # try to grow it into a full word by filling those masks.
            if EXTEND_SUBWORDS and para_pos_best[id] > 0:
                if DEBUG_OUTPUT:
                    print("Extending %s with %d masks..."%(dec, para_pos_best[id]))
                ext_ids, _ = extend(toks_sent, toks_para, [id], para_pos_best[id], para_words)
                sub_ids = ext_ids + sub_ids
                dec_ext = toker.decode(sub_ids).strip()
                if DEBUG_OUTPUT:
                    print("Extended %s to %s"%(dec, dec_ext))
                if is_word(dec_ext) or (dec_ext != '' and dec_ext in para_words):
                    dec = dec_ext # choose new word
                else:
                    sub_ids = [int(id)] # reset
            
            # Skip candidates shorter than two characters
            if len(toker.decode(sub_ids).lower().strip()) < 2:
                continue
                
            # Capitalisation of the first letter must match the answer
            if dec[0].isupper() != answer[0].isupper():
                continue
            
            # Only add distractor if it does not contain punctuation
            #if any(p in dec for p in punctuation):
            #    pass
                #continue
            
            if dec.lower() in profanity:
                continue
            
            # make sure is a word, either in dict or somewhere else in text
            if not is_word(dec) and dec.lower() not in para_words:
                continue

            # make sure is not the same as an adjacent word
            if dec.lower() == prev_word or dec.lower() == next_word:
                continue
            
            # Don't add the distractor if stem matches another
            stem = stemmer.stem(dec).lower()
            if stem in stems or stem == ans_stem:
                continue

            # Only add distractor if it does not contain a number
            if any(char.isdigit() for char in toker.decode([id])):
                continue

            # Only add distractor if the distractor exists in the text already
            if DISTRACTORS_FROM_TEXT and dec.lower() not in para_words:
                continue
            
            #if answer[0].isupper():
            #    dec = dec.capitalize()
            
            # PASSED ALL TESTS; finally add distractor and computations
            distractors.append(dec)
            stems.append(stem)
            sent_logprob = 0
            para_logprob = 0
            nsubs = len(sub_ids)
            # Average the log-probabilities of the sub-tokens, each read from
            # the distributions computed with nsubs masks.
            for j in range(nsubs):
                sub_id = sub_ids[j]
                sent_logprob_j = log(sent_sms_all[nsubs-1][j][sub_id])
                para_logprob_j = log(para_sms_all[nsubs-1][j][sub_id])
                #if j == 0 or sent_logprob_j > sent_logprob:
                #    sent_logprob = sent_logprob_j
                #if j == 0 or para_logprob_j > para_logprob:
                #    para_logprob = para_logprob_j
                sent_logprob += sent_logprob_j
                para_logprob += para_logprob_j
            sent_logprob /= nsubs
            para_logprob /= nsubs
            if DEBUG_OUTPUT:
                print("%s (p_sent=%f, p_para=%f)"%(dec,sent_logprob,para_logprob))
            # exp of the mean log-probability = geometric mean of the
            # sub-token probabilities
            sent_probs.append(exp(sent_logprob))
            para_probs.append(exp(para_logprob))
#            sent_probs.append(sent_sms_all[nsubs-1][nsubs-1][sub_id])
#            para_probs.append(para_sms_all[nsubs-1][nsubs-1][sub_id])
            embs.append(get_emb(toks_para, sub_ids))

            # Stop once the pool is full
            if len(distractors) >= DISTRACTOR_POOL:
                break
    if DEBUG_OUTPUT:
        print('Corresponding Text: ', text)
        print('Correct Answer: ', answer)
        print('Distractors created before annealing: ', distractors)
    #indices = anneal(sent_probs, para_probs, embs, emb_ans, number_of_distractors, distractors, answer)
    #distractors = [distractors[i] for i in indices]
    #distractors += [''] * (number_of_distractors - len(distractors))
        
    return sent_probs, para_probs, embs, emb_ans, distractors

In [None]:
def create_distractors(text, answer):
    '''Return the three distractors selected for the blank in ``text``.

    The candidate pool is built by candidates() and the final three entries
    are chosen from it by anneal().'''
    pool = candidates(text, answer)
    sent_probs, para_probs, embs, emb_ans, pool_words = pool
    picked = anneal(sent_probs, para_probs, embs, emb_ans, pool_words, 3, answer)
    return [pool_words[i] for i in picked]

In [None]:
def score_positions(text):
    '''Score how suitable the masked position in ``text`` is for a blank.

    The score is the sum of the DISTRACTOR_POOL highest values of
    log(p_sentence) - ALPHA * log(p_passage) over the vocabulary, with both
    distributions restricted to word_mask.'''
    sentences = nltk_sent_toker.tokenize(text)
    hits = [n for n, sent in enumerate(sentences) if '<mask>' in sent]
    pos = hits[0]
    window = sentences[pos]

    before = sentences[:pos]
    after = sentences[pos + 1:]

    # Grow the sentence context, alternating between the previous and the next
    # sentence, until it has at least MIN_SENT_WORDS space-separated words.
    turn = 0
    while len(window.split(' ')) < MIN_SENT_WORDS and (before or after):
        if turn % 2 == 0 and before:
            window = before.pop() + ' ' + window
        elif after:
            window = window + ' ' + after.pop(0)
        turn += 1

    if DEBUG_OUTPUT:
        print(window)

    para_ids = toker.encode(text)
    sent_ids = toker.encode(window)

    para_dist = get_softmax_logits(para_ids, 1)[0]
    sent_dist = get_softmax_logits(sent_ids, 1)[0]

    sent_term = (sent_dist * word_mask + e).log()
    para_term = (para_dist * word_mask + e).log()
    contrast = sent_term - ALPHA * para_term

    best = T.topk(contrast, DISTRACTOR_POOL, dim=0)
    return T.sum(best.values)

In [None]:
def mask(word, cdgp = False):
    '''Replace the core of ``word`` with a mask placeholder.

    Leading and trailing punctuation is kept; everything in between is replaced
    by '[MASK]' when ``cdgp`` is true and by '<mask>' otherwise. A word that
    consists only of punctuation (or is empty) becomes a single placeholder.'''
    placeholder = '[MASK]' if cdgp else '<mask>'
    strp = word.strip(punctuation)
    if not strp:
        # str.replace with an empty pattern would insert the placeholder
        # between every character, producing several masks for one word.
        return placeholder
    return word.replace(strp, placeholder)

In [None]:
def insert_answer(distractors, answer):
    '''Insert the correct answer at a random position among the distractors.

    distractors: list of distractor strings; it is modified in place
    answer: the correct answer

    Returns (choices, letter): the same list with the answer inserted, and the
    label looked up in idx2ans for the position of the answer. idx2ans is a
    module-level mapping that is not defined in this part of the file.'''
    # Cap the position at the list length: list.insert() clamps a larger index
    # to the end, so with fewer than three distractors an uncapped index could
    # name a position the answer is not actually at.
    idx = random.randint(0, min(3, len(distractors)))
    distractors.insert(idx, answer)
    return distractors, idx2ans[idx]

# English stop words; they are never chosen as blanks.
stop_words = set(stopwords.words('english'))
# These two assignments override the values set in the configuration cell
# near the top of the file (DEBUG_OUTPUT = True, DISTRACTOR_POOL = 32).
DEBUG_OUTPUT=False
DISTRACTOR_POOL = 128

def choose_and_blank(text, count):
    '''Pick ``count`` words of ``text`` to blank out and build answer choices.

    Every word position is scored with score_positions(); positions are then
    taken greedily from the highest score down, skipping capitalized words,
    words containing digits, repeats and stop words, and zeroing the scores of
    the four positions on either side of each accepted blank.

    Returns (blanked_text, choices, letters): the text with each chosen word
    replaced by '_', and per blank (in text order) the list of choices and the
    label of the correct one.

    NOTE: a function with the same name is defined again later in this file;
    when the file is run top to bottom, that later definition replaces this one.'''
    words = text.split()
    scores = []
    # Score each position by masking that word and leaving the rest intact
    for i, word in enumerate(words):
        masked = mask(words[i])
        t = ' '.join(words[:i]+[masked]+words[i+1:])
        scores.append(float(score_positions(t)))
    print(scores)
    blanks = []
    answers = set()
    # NOTE(review): rejected and used positions are set to 0, which only takes
    # them out of contention while the remaining scores are positive.
    while(len(blanks) < count):
        tk = T.topk(T.tensor(scores), 1)
        idx = tk.indices[0]
        strp = words[idx].strip(punctuation)
        if words[idx] == 'a(n' or words[idx][0].isupper() or any(char.isdigit() for char in strp) or words[idx] in answers or words[idx].lower() in stop_words:
            scores[idx] = 0
        else:
            blanks.append(idx)
            answers.add(strp)
            # Keep neighbouring positions from being chosen as well
            for i in range(max(idx-4,0),min(idx+5,len(scores))):
                scores[i] = 0
    dists = []
    answers = []
    # Generate distractors for each blank, in text order
    for blank in sorted(blanks):
        masked = mask(words[blank])
        t = ' '.join(words[:blank]+[masked]+words[blank+1:])
        strp = words[blank].strip(punctuation)
        ds = create_distractors(t, strp)
        ds, letter = insert_answer(ds, strp)
        dists.append(ds)
        answers.append(letter)
        print(ds, letter)
    # Replace every chosen word by '_' (surrounding punctuation is kept)
    for blank in sorted(blanks):
        masked = mask(words[blank])
        words[blank] = masked.replace('<mask>', '_')
    return ' '.join(words), dists, answers

In [None]:
# Same three assignments as in the cell that defines insert_answer; repeated
# here so this cell can be run on its own.
stop_words = set(stopwords.words('english'))
DEBUG_OUTPUT=False
DISTRACTOR_POOL = 128
    
def choose_blanks(text, count):
    '''Choose up to ``count`` word positions of ``text`` to blank out.

    Every position is scored with score_positions() and squashed into (0, 1)
    with a sigmoid, so that 0 can mark positions that are no longer available.
    Positions are then taken greedily from the highest score down, skipping
    empty or capitalized words, words with digits or internal punctuation,
    repeats and stop words. Neighbours closer than MIN_DIST positions to an
    accepted blank are excluded. MIN_DIST is a module-level constant that is
    not defined in this part of the file.

    Returns the chosen positions as a sorted list of ints. Fewer than ``count``
    positions are returned when the text does not contain enough suitable words.'''
    words = text.split()
    scores = []
    for i in range(len(words)):
        masked = mask(words[i])
        t = ' '.join(words[:i]+[masked]+words[i+1:])
        scores.append(float(score_positions(t)))
    scores = (T.tensor(scores)/1000).sigmoid()
    blanks = []
    blank_scores = []
    answers = set()
    # Characters that disqualify a word when they occur inside it
    inner_punct = punctuation.replace("'", '').replace('-', '')
    while len(blanks) < count and scores.numel() > 0:
        tk = T.topk(scores, 1)
        if not tk.values[0] > 0:
            # Every position has been used or rejected. Stop rather than
            # append zero-score (possibly duplicate or empty) blanks.
            break
        idx = int(tk.indices[0])
        strp = words[idx].strip(punctuation)
        if (
            len(strp) == 0 or
            strp == 'a(n' or # no a(n)
            strp[0].isupper() or # no Capitalized words
            any(char.isdigit() for char in strp) or # no digits
            any(char in inner_punct for char in strp) or # no internal punctuation
            strp in answers or # no repeats
            strp.lower() in stop_words): # no stopwords
            scores[idx] = 0
        else:
            blanks.append(idx)
            # Copy the value: scores[idx] itself is zeroed just below
            blank_scores.append(float(scores[idx]))
            answers.add(strp)
            for i in range(max(idx-MIN_DIST+1,0),min(idx+MIN_DIST,len(scores))):
                scores[i] = 0
    print(len(blanks), list(zip(blanks, blank_scores)))
    return sorted(blanks)

def create_distractors_for_blanks(text, blanks, cdgp=False):
    '''Build the multiple-choice items for the given blank positions.

    text: the full passage
    blanks: indices into text.split() of the words to blank out
    cdgp: when true, '[MASK]' is used as the placeholder and distractors come
        from create_distractors_cdgp() (not defined in this part of the file);
        otherwise '<mask>' and create_distractors() are used

    Returns (blanked_text, choices, letters): the text with every chosen word
    replaced by '_' (surrounding punctuation kept), and per blank the list of
    choices including the answer and the label of the correct choice.'''
    words = text.split()
    placeholder = '[MASK]' if cdgp else '<mask>'
    dists = []
    answers = []
    for blank in blanks:
        masked = mask(words[blank], cdgp)
        t = ' '.join(words[:blank]+[masked]+words[blank+1:])
        strp = words[blank].strip(punctuation)
        ds = create_distractors_cdgp(t, strp) if cdgp else create_distractors(t, strp)
        ds, letter = insert_answer(ds, strp)
        dists.append(ds)
        answers.append(letter)
    # Blank out each position exactly once. mask() must produce the same
    # placeholder that is replaced here, otherwise the '_' substitution
    # silently does nothing in cdgp mode.
    for blank in sorted(set(blanks)):
        masked = mask(words[blank], cdgp)
        words[blank] = masked.replace(placeholder, '_')
    return ' '.join(words), dists, answers

def choose_and_blank(text, count):
    '''Choose ``count`` blank positions in ``text`` and build the
    multiple-choice items for them.

    Returns whatever create_distractors_for_blanks() returns for the positions
    picked by choose_blanks().'''
    return create_distractors_for_blanks(text, choose_blanks(text, count))


In [None]:
# Example run: choose five blanks in a sample passage and generate the answer
# choices for each of them.
text = """A sad little boy was in search of happiness and wanted to meet God. On his way, he saw an elderly woman sitting in a park watching some birds. The boy sat down next to her. He opened his bag to take a drink. He noticed that lady looked hungry, so he offered her a piece of cake. She accepted and smiled at him. Her smile was so wonderful that he wanted to see it again. Then he offered her a can of coke. Once again she smiled at him. The boy was pleased! They stayed there all afternoon, eating and drinking without saying a word. As it began to grow dark, the boy got up to leave, but before he had gone no more than a few steps, he turned around, ran back to the old woman and gave her a big hug. She gave him her biggest smile. When the boy arrived home, his mother was surprised by the look of joy on his face. She asked, "What has made you so happy today?" He replied, " I had lunch with God. She's got the most beautiful smile in the world!" And when the old woman returned to her home, she told her son that she had lunch with God. Too often we overlook the power of a touch, a smile, a kind word, a listening ear or the smallest act of caring. However, all of these have the possible power to turn a life around."""
choose_and_blank(text, 5)

In [None]:
def plot_emb(probs, embs, emb_ans, words, ans, ids):
    '''Plot the candidate words in a 2-D PCA projection of their embeddings.

    probs: list of per-candidate probabilities; they control label opacity
    embs: list of candidate embedding tensors
    emb_ans: embedding tensor of the correct answer
    words: list of candidate strings, aligned with probs and embs
    ans: the correct answer (drawn in magenta inside a box)
    ids: indices of the selected distractors; the answer and the selected
        distractors are connected pairwise by lines

    The figure is written to "myImagePDF.pdf" and then shown.'''
    # The answer goes in front, so index 0 is the answer and candidate j sits
    # at index j + 1 in all three aligned lists.
    words = [ans]+words
    probs = [np.max(probs)]+probs
    embs = T.stack([emb_ans]+embs).cpu().numpy()
    pca = PCA(n_components=2)
    new_values = pca.fit(embs).transform(embs)

    ofst = np.min(probs)
    rnge = np.max(probs)-ofst

    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values]

    plt.figure(figsize=(24, 16))

    for i in range(len(x)):
        is_answer = (i == 0)
        if is_answer:
            alpha = 1
        elif rnge == 0:
            # all probabilities are equal: use one opacity for every label
            alpha = 0.9
        else:
            # probs was shifted together with words above, so probs[i] is the
            # probability that belongs to words[i]
            alpha = 0.9*(probs[i]-ofst)/rnge
        plt.text(x[i], y[i], words[i], ha='center', va='center', size='large', c='magenta' if is_answer else 'black', alpha=alpha, bbox=dict(edgecolor='magenta',facecolor='white', alpha=0.5) if is_answer else None)

        plt.scatter(x[i],y[i],c='white',s=500)

    # Shift the selected indices by one to account for the prepended answer
    ids = [0]+[sel+1 for sel in ids]
    print(ids)
    n=len(ids)
    data = []
    # One line segment per pair among the answer and the selected distractors
    for i in range(n):
        for j in range(i+1, n):
            id_i = ids[i]
            id_j = ids[j]
            data.append((x[id_i], x[id_j]))
            data.append((y[id_i], y[id_j]))
    plt.plot(*data, alpha=0.25 ,color = 'black')

    plt.savefig("myImagePDF.pdf", format="pdf", bbox_inches="tight")

    plt.show()

In [None]:
# test pool generation
# Turns debug printing back on and sets up one passage with a single <mask>
# plus its correct answer. The commented-out text/answer pairs below are
# alternative test cases.
DEBUG_OUTPUT = True
MAX_SUBWORDS = 3
text = """The Healing Jim and his wife, Connie, were shocked by the loss of their four-month-old son--Joshua, whose life was taken by SIDS--sudden infant death syndrome. Thirty hours ago Jim drove to the baby-sitter's home to pick up Joshua. It was a routine trip, like the one he made five days every week. He arrived, and little Joshua could not be awakened from his nap. The next few hours were a time of life and death: the racing ambulance, swift-moving doctors and nurses....but 12 hours later, at children's Hospital, though the doctors had exhausted all  attempts, little Joshua was gone. Yes, they wanted all of Joshua's usable organs to be donated. That was not a difficult decision for Jim and Connie, a loving and giving couple. The next morning dawned and many things had to be arranged. Telephone calls and funeral plans. At one point Jim realized he needed a haircut. When Jim settled into the chair at the barber's, he began to reflect on the past hours, trying to make some sense of it all. Why had Joshua, their first-born, the child they had waited so long for, been taken so soon....he had barely begun his life....The question kept coming, and the pain in Jim's heart just enveloped him. While talking with the barber, Jim mentioned the organ donations, looking at his watch: "They are transplanting one of his heart valves right now." The <mask> stopped and stood motionless. Finally she spoke, but it was only a whisper. "You're not going to believe this....but about an hour ago the customer sitting in this chair wanted me to hurry so she could get to Children's Hospital. She left here so full of joy....her prayers had been answered. Today her baby granddaughter is receiving a desperately needed transplant--a heart transplant." Jim's healing began."""
answer = 'hairdresser'

#text = """Most animals, including snakes and fish, yawn, but it is only contagious in humans and chimps and, according to a recent study, dogs. The researchers, from the University of London's Birbeck College, put 29 <mask> in a room with a yawning man and found that 21, or 72%, also started to yawn. They said the skill may allow the pet to build stronger bonds with their owners."""
#answer = 'dogs'

#text = """Mr. Frieden had this to say: "We won't be able to check travelers for fever when they leave or when they arrive. We won't be able, as we do presently, to take a detailed history to see if they were <mask> when they arrive. When they arrive, we wouldn't be able to impose quarantine as we now can if they have high-risk contact." """
#answer = 'exposed'

#text = """It is a time-proven fact that smile is a language. It is a universal language understood by the people of every nation, and the commonest way to show our good will perfectly without saying anything. One day I was shopping in a small town in California. It was my misfortune to be served by a clerk who seemed most unfriendly and not at all concerned about my intended purchase. I bought nothing, and walked angrily out of the store. My anger grew with each step. Outside, standing at the corner, was a young man in his early twenties. His expressive eyes met and held mine, and in the next instant a beautiful, amazing smile covered his face. I gave in immediately. The power of that shining smile drove away all my anger, and I found the muscles in my own face happily responding. "Beautiful day, isn't it?" I said. Then, suddenly something inside me sent me turning back. "I really owe you a debt,” I said softly. His smile deepened, but he made no attempt to answer. A Mexican woman nearby stepped forward and said, "Carlos can't speak English," she volunteered. "Shall I tell him something?" At that moment I felt changed. Carlos' smile had made a big person of me. "Yes," my <mask> was enthusiastic and sincere. "Tell him 'Thank you!'" "Thank you?" The woman seemed slightly puzzled. "Just tell him that," I insisted. "Surely, he'll understand." What a smile! Although I have never seen that young man again, I'll never forget the lesson he taught me. From then on, I became smile-conscious. I practice it diligently, anywhere and everywhere, with everybody. This action on my part would always draw a good-natured smile in return."""
#answer = 'reply'

#text = """Have you ever heard the story of the four-minute mile? For years people believed that it is impossible for a human being to run a mile in less than four minutes until Roger Banister proved it wrong in 1954. Within one year, 37 runners broke the belief barrier. And the year after that, 300 other runners did the same thing. What happens if you put an animal in a pond? Any animal, big or small, will swim its way through. What happens when someone, who does not know how to swim, falls in deep waters? You <mask>. If an animal who has not learned swimming could escape by swimming, why not you? Because you believe you will drown while the animal does not. These cases show the power of beliefs. There is no other more powerful force in directing human behavior than belief. Our beliefs have the power to create and to destroy. In a way it is our beliefs that determine how much we'll be able to realize our potential. So pay attention to some of your beliefs. Do you believe you are weak in mathematics? Do you believe that other people dislike you? Do you believe life is full of problems? Belief is not mysterious, however. It's nothing but the generalization of a past incident. As a kid, if a dog bit you, you believed all dogs to be dangerous. To change certain behavior, identify the beliefs associated with it. Change those beliefs and a new pattern is automatically created."""
#answer = 'drown'

#text = """Two months ago, there was a serious earthquake in my country. Many people were hurt and lost their homes during the disaster. In my school, we decided to organize a 5-kilometer run to collect money for the people. Students signed up for the run and asked their relatives and neighbors to support them. These people agreed to give some money--50 cents or a dollar, for example, for every kilometer that the students completed. Joe was my classmate. He was the heaviest student in my class because he seldom did any exercise and he ate plenty of junk food. He never walked to school. He always took a bus. When he was asked <mask> he was going to take part in the run, he said he would think about it. A few students laughed and I think Joe felt embarrassed. I felt a bit sorry for him. The next day, as I was riding to school, I saw Joe walking in the street. I stopped and asked him why he was walking. He said he was training to take part in the run. "Good for you, Joe!" I said. Later, I told my friends about Joe. Most of them just laughed and said that they didn't think he would complete the run. I wasn't sure, but I decided that I would help Joe. So, for the two weeks before the run, I was his trainer. He walked to school for a few days . Then he started to run slowly. On the day of the run, Joe lined up with the other students. The race began and soon Joe was left behind. Well, it took him hours to finish and he didn't expect to catch up with the other students, but he tried to do his best. And finally he completed the 5-kilometer run. Everyone was very happy and said, "Well done, Joe!" What is more, Joe collected more money than any other student! What a good lesson we should learn from."""
#answer = 'if'

#text = """Wishing to encourage her young son's progress on the piano, a mother took the small boy to a Paderewski concert. After they were seated, the mother saw a friend in the audience and walked down the aisle to greet her. Seizing the opportunity to explore the wonders of the concert hall, the little boy rose and eventually explored his way through a door <mask> "NO ADMITTANCE." When the house's lights dimmed and the concert was about to begin, the mother returned to her seat and discovered that her son was missing. Suddenly, the curtains parted and spotlights focused on the impressive Steinway on stage. In horror, the mother saw her little boy sitting at the keyboard, innocently picking out "Twinkle, Twinkle Little Star." At that moment, the great piano master made his entrance, quickly moved to the piano, and whispered in the boy's ear, "Don't quit." "Keep playing." Then leaning over, Paderewski reached down with his left hand and began filling in a bass part. Soon his right arm reached around to the other side of the child and he added a running obbligato. Together, the old master and the young novice transformed a frightening situation into a wonderfully creative experience. The audience was mesmerized. That's the way it is with God. What we can finish on our own is hardly noteworthy. We try our best, but the results aren't exactly flowing music. But with the hand of the Master, our life's work truly can be beautiful."""
#answer = 'marked'

#text = """Eating chocolate is a great thing for most of us kids. If we want to eat it, we should pay for it. But there is a job that needs someone to taste chocolate every day. Isn't it wonderful? Laura Fagan is a 29-year-old British girl. She tastes chocolate, desserts and cakes every day and she can be paid well for this! Every day Laura Fagan tries different kinds of desserts for her supermarket, Teseo. She needs to try as many as 20 desserts a day. Also, she always travels for work. She needs to travel to different cities to try different desserts. Although the job seems to be great, it is hard. Usually, Laura begins trying desserts as early as 8 a.m., and is still trying new desserts at 6 p.m. before she goes back home. Fortunately, the job hasn't made her too fat. She only worries about her teeth. "Of course I was afraid of becoming fat when I started the job, so I try to <mask> as often as possible in the gym. The main problem is my teeth. I don't think my dentists would be happy if I told them what I do, so I try to brush my teeth as often as possible." Laura loves her job although it is hard. "I can learn about new trends in food," she said."""
#answer = 'exercise'

#text="""What kind of homes will we live in in the future? Nobody can be sure, but scientists are working out new ideas now. Some scientists are thinking about building whole cities under huge glass domes. Of course advanced heating and cooling systems will be necessary to control the weather in the domes. Therefore, there will never be any rain or snow, and the temperature will always be comfortable. Perhaps everyone will live in vertical cities -- high rises that are so large that they can contain all the necessities of life. Since vertical cities will use less land than flat cities, and provide homes for more people, they will be practical for small countries that have a large population. <mask> idea that will be helpful to small countries is the floating city. Monaco has already built homes, stores, and offices on the water of the Mediterranean Sea. There are some people who think that we will go back to living in caves. But the caves of the future will be very different from the caves of the Stone Age. Farms and parks will be on the land over the cave city. When people want to go to the country or to a park, a short ride in a lift will take them there."""
#answer="Another"

#text = """I was born in the Great Depression to a carpenter father and a home keeping mother. Our family had a large garden, raised two pigs for their meat, and had a cow for her milk. When Dad lost his work, we started a laundry in our home. We hired up to seven girls at one time. Dad also rented 12 acres of nearby land to earn a few extra dollars. I was lucky to have good teachers to help in my learning. Following graduation, I spent two years in the army, allowing me to see Europe and build on my experiences. Then I returned home, got a job in the local bank and married my sweetheart. I was on my way to a normal lifestyle, but I was called back into the army when they built the Berlin Wall. That turned out to be a blessing too. As I hadn't worked long enough at the bank to begin health insurance, our oldest son, Ken, was born in an army hospital, which cost only $8.25. Following a year at Fort Chaffee, Arkansas, I returned home to work at the bank again. I worked my way up through the ranks to become Chairman, President, and the CEO of the bank before retiring in 1997. I was blessed to have had good employers and employees. I've been blessed to have a nice wife and wonderful children and grandchildren. I'm truly thankful to all I've met along the way. Today, I look <mask> on those grand experiences and see they were my possibilities to grow. I found many opportunities. They taught me to save for a rainy day. They taught me to help those less fortunate than me. They taught me to put others first, if I wanted to be successful in life."""
#answer = 'back'

text = """A lady once wrote a long story. She <mask> it to a famous editor. After a few weeks the editor returned the story to her. The lady was angry. She wrote back to the editor: "Dear Sir, Yesterday you sent back a story of mine. How do you know that the story is not good? You did not read it. Before I sent you the story, I pasted together pages 18, 19 and 20. This was a test to see whether you would read the story. When the story came back yesterday, the pages were still pasted together. Is this the way you read all the stories that are sent to you?" The editor wrote back: "Dear Madam, when I have an egg for breakfast, I don't have to eat the whole egg in order to discover that it is bad." """
answer = 'sent'

text="""My  position  was to pass out water to the runners. I remember being so excited to see all the different kinds of  runners  who passed by and   grabbed  a cup of water. The next year I signed up for the race  and gave it a  <mask>  .\nThe first 10,000 m race was quite an experience. I jogged, I walked, I jogged and I walked. At times, I didn't  know  if I could finish.\nAt one point near the end, a 70-year-old man ran past me very  fast  , and I felt embarrassed that I was 50 years younger than him and I couldn't even keep up with him."""
answer = 'shot'

#text = """I was born in the Great Depression to a carpenter father and a home keeping mother. Our family had a large garden, raised two pigs for their meat, and had a cow for her milk. When Dad lost his work, we started a laundry in our home. We hired up to seven girls at one time. Dad also rented 12 acres of nearby land to earn a few extra dollars. I was lucky to have good teachers to help in my learning. Following graduation, I spent two years in the army, allowing me to see Europe and build on my experiences. Then I returned home, got a job in the local bank and married my sweetheart. I was on my way to a normal lifestyle, but I was called back into the army when they built the Berlin Wall. That turned out to be a blessing too. As I hadn't worked long enough at the bank to begin health insurance, our oldest son, Ken, was born in an army hospital, which cost only $8.25. Following a year at Fort Chaffee, Arkansas, I returned home to work at the bank again. I worked my way up through the ranks to become Chairman, President, and the CEO of the bank before retiring in 1997. I was blessed to have had good employers and employees. I've been blessed to have a nice wife and wonderful children and <mask>. I'm truly thankful to all I've met along the way. Today, I look back on those grand experiences and see they were my possibilities to grow. I found many opportunities. They taught me to save for a rainy day. They taught me to help those less fortunate than me. They taught me to put others first, if I wanted to be successful in life."""
#answer = 'grandchildren'

MAX_SUBWORDS=1
DISTRACTOR_POOL = 32
DEBUG_OUTPUT=True
CONTEXTUAL_EMBEDDING_LAYERS = [12]
MIN_SENT_WORDS = 7
ALPHA = 0.3

sent_probs, para_probs, embs, emb_ans, distractors = candidates(text, answer)

In [None]:
# Smoke-test the annealing step on the candidate pool produced by the
# `candidates(text, answer)` call in the previous cell.

# How much to weigh embedding distance vs. word likelihoods (increasing this
# value decreases plausibility while increasing diversity). Earlier value: 4.
SIM_ANNEAL_EMB_WEIGHT = 0.25
# How much to weight probability of tokens in the sentence-context model.
CORRECTNESS_SENT = 1
# Earlier value: .25
PLAUSIBILITY_DISTR_TO_ANS = 1
# NOTE(review): presumably read as a global by anneal() — confirm.
BETA=1
# NOTE(review): the literal 3 looks like the number of distractors to select — confirm against anneal().
indices = anneal(sent_probs, para_probs, embs, emb_ans, distractors, 3, answer)
# Notebook display: the distractor strings at the selected indices.
[distractors[x] for x in indices]

In [None]:
plot_emb((T.as_tensor(sent_probs).log()-ALPHA*T.as_tensor(para_probs).log()).cpu().numpy(), embs, emb_ans, distractors, answer, indices)

In [None]:
# Demo cell: run choose_and_blank() on a complete (unblanked) passage.

# How much to weigh embedding distance vs. word likelihoods (increasing this
# value decreases plausibility while increasing diversity). Earlier value: 4.
SIM_ANNEAL_EMB_WEIGHT = 0.5
# How much to weight probability of tokens in the sentence-context model.
CORRECTNESS_SENT = 1
# Earlier value: .25
PLAUSIBILITY_DISTR_TO_ANS = 1
MAX_SUBWORDS = 1
DEBUG_OUTPUT = True

text = """I was born in the Great Depression to a carpenter father and a home keeping mother. Our family had a large garden, raised two pigs for their meat, and had a cow for her milk. When Dad lost his work, we started a laundry in our home. We hired up to seven girls at one time. Dad also rented 12 acres of nearby land to earn a few extra dollars. I was lucky to have good teachers to help in my learning. Following graduation, I spent two years in the army, allowing me to see Europe and build on my experiences. Then I returned home, got a job in the local bank and married my sweetheart. I was on my way to a normal lifestyle, but I was called back into the army when they built the Berlin Wall. That turned out to be a blessing too. As I hadn't worked long enough at the bank to begin health insurance, our oldest son, Ken, was born in an army hospital, which cost only $8.25. Following a year at Fort Chaffee, Arkansas, I returned home to work at the bank again. I worked my way up through the ranks to become Chairman, President, and the CEO of the bank before retiring in 1997. I was blessed to have had good employers and employees. I've been blessed to have a nice wife and wonderful children and grandchildren. I'm truly thankful to all I've met along the way. Today, I look back on those grand experiences and see they were my possibilities to grow. I found many opportunities. They taught me to save for a rainy day. They taught me to help those less fortunate than me. They taught me to put others first, if I wanted to be successful in life."""
# NOTE(review): the second argument looks like the number of blanks to create
# (another cell passes len(d['answers']) here) — confirm against choose_and_blank().
choose_and_blank(text, 10)

In [None]:
import json
# Mapping between answer letters and option positions, in both directions.
ans2idx = {'A':0, 'B':1, 'C':2, 'D':3}
idx2ans = {0:'A', 1:'B', 2:'C', 3:'D'}
DEBUG_OUTPUT = False
DISTRACTOR_POOL=128
MAX_SUBWORDS = 1

# Test-set manifest: each key is a sub-directory of 'cloth-test/' and each value
# lists the JSON files read from it (paths are built as 'cloth-test/<key>/<file>'
# by the processing loops in later cells).
# NOTE(review): the directory keys ('high1', 'middle1', ...) do not match the
# numeric prefixes of the file names they contain — presumably intentional
# grouping; confirm against the data layout on disk.
files = {
    'high1':[
        'high1-3734.cdgp-m.json',
        'high2-3680.cdgp-m.json',
    ],
    'high2':[
        'high3-4041.cdgp-m.json',
        'high4-3780.cdgp-m.json'
    ],
    'high3':[
        'high5-3946.cdgp-m.json',
        'high6-4009.cdgp-m.json'
    ],
    'middle1':[
        'middle1-2914.cdgp-m.json',
        'middle2-2894.cdgp-m.json',
        'middle3-2991.cdgp-m.json',
        'middle4-2829.cdgp-m.json'
    ],
    'middle2':[
        'middle5-3010.cdgp-m.json',
        'middle6-2874.cdgp-m.json',
        'middle7-2699.cdgp-m.json',
        'middle8-2727.cdgp-m.json'
    ],
    'middle3':[
        'middle9-2989.cdgp-m.json',
        'middle10-2865.cdgp-m.json',
        'middle11-2915.cdgp-m.json',
        'middle12-2958.cdgp-m.json'
    ]
}

In [None]:
from transformers import pipeline
generator = pipeline("text2text-generation", model='Text2Text/text2text-t5-base/checkpoint-72000', device=0) 

In [None]:
from random import shuffle

def t5multi(text, answer):
    """Generate distractors for one cloze blank with the T5 text2text pipeline.

    The ``<mask>`` marker in *text* is rewritten to ``_`` and the prompt
    ``"<text> [SEP] <answer>"`` is passed to the module-level ``generator``
    pipeline. The generated text is split on whitespace into candidate words.

    Args:
        text: Passage containing a ``<mask>`` placeholder for the blank.
        answer: The correct word for the blank.

    Returns:
        A list of three words in random order (with repeats when the model
        generated fewer than three words), or an empty list when the model
        generated no words at all.
    """
    prompt = text.replace('<mask>', '_') + ' [SEP] ' + answer
    ds = generator(prompt)[0]['generated_text'].split()
    # A single doubling only turns one generated word into two, which would
    # leave the caller with fewer than three distractors; keep doubling until
    # there are at least three (an empty list cannot be padded and stays empty).
    while ds and len(ds) < 3:
        ds *= 2
    shuffle(ds)
    return ds[:3]

In [None]:
text="""My  _  was to pass out water to the runners. I remember being so excited to see all the different kinds of  runners  who passed by and   grabbed  a cup of water. The next year I signed up for the race  and gave it a  <mask>  .\nThe first 10,000 m race was quite an experience. I jogged, I walked, I jogged and I walked. At times, I didn't  know  if I could finish.\nAt one point near the end, a 70-year-old man ran past me very  fast  , and I felt embarrassed that I was 50 years younger than him and I couldn't even keep up with him."""
answer = 'shot'

t5multi(text, answer)

## Add nCloze and t5-Multi distractors

In [None]:
# Settings for this run.
# NOTE(review): presumably read as globals by choose_and_blank() /
# create_distractors(), which are defined elsewhere in this notebook — confirm.
DISTRACTOR_POOL = 32
ALPHA = 0.3
BETA = 0.3
MIN_SENT_WORDS = 7
MIN_DIST = 7

# For every test file: add nCloze blanks/options chosen from the full passage,
# plus nCloze and T5 distractors for the blanks already chosen by CDGP, and
# write everything to '<stem>.all.json'.
for block, fs in files.items():
    for file in fs:
        fn = 'cloth-test/%s/%s'%(block,file)
        # Drop the last two dot-separated parts of the path to get the output stem.
        base = '.'.join(fn.split('.')[:-2])
        print(base)
        # Context manager so the input handle is closed even if parsing fails.
        with open(fn) as f:
            d = json.load(f)

        # Rebuild the complete passage: fill each '_' blank, in order, with
        # the correct option for that blank.
        t = d['article']
        for i, a in enumerate(d['answers']):
            answer = d['options'][i][ans2idx[a]]
            t = t.replace('_', answer, 1)
        newtext, dists, answers = choose_and_blank(t, len(d['answers']))
        d['article-ncloze'] = newtext
        d['options-ncloze'] = dists
        d['answers-ncloze'] = answers
        print(newtext)

        dists = []
        distsT5 = []
        answers = []
        answersT5 = []
        for i, a in enumerate(d['answers-cdgp-m']):
            # Build a passage in which only blank i is masked and every other
            # blank is filled with its correct option.
            t = d['article-cdgp-m']
            for j, opts in enumerate(d['options-cdgp-m']):
                if j == i:
                    t = t.replace('_', '<mask>', 1)
                else:
                    answer = opts[ans2idx[d['answers-cdgp-m'][j]]]
                    t = t.replace('_', answer, 1)
            answer = d['options-cdgp-m'][i][ans2idx[a]]

            ds = create_distractors(t, answer)
            # Insert the answer at a random position. The upper bound is capped
            # by the list length: list.insert() clamps an out-of-range index to
            # the end, which would make the recorded letter disagree with the
            # real position whenever fewer than three distractors came back.
            idx = random.randint(0, min(3, len(ds)))
            ds.insert(idx, answer)
            dists.append(ds)
            answers.append(idx2ans[idx])

            ds = t5multi(t, answer)
            # Same bounded random insertion for the T5 distractors.
            idx = random.randint(0, min(3, len(ds)))
            ds.insert(idx, answer)
            distsT5.append(ds)
            answersT5.append(idx2ans[idx])

        d['options-ncloze-m'] = dists
        d['answers-ncloze-m'] = answers
        d['options-t5multi-m'] = distsT5
        d['answers-t5multi-m'] = answersT5

        with open(base + '.all.json', 'w') as out:
            out.write(json.dumps(d))


In [None]:
DEBUG_OUTPUT = False
DISTRACTOR_POOL=32
MAX_SUBWORDS = 1

# Sweep over (ALPHA, BETA) settings; for each one, regenerate the nCloze blanks
# for every test passage and write them to a file named after the setting.
for a in [0.3]:
    for b in [0.3]:
        print("a: %f, b: %f"%(a, b))

        ALPHA = a
        BETA = b

        for block, fs in files.items():
            for file in fs:
                fn = 'cloth-test/%s/%s'%(block,file)
                # Drop the last two dot-separated parts of the path to get the stem.
                base = '.'.join(fn.split('.')[:-2])
                print(base)
                with open(base+'.ncloze.json') as f:
                    d = json.load(f)

                # Rebuild the complete passage by filling every '_' blank with
                # its correct option. The loop variable is deliberately not
                # called `a`, so the hyperparameter above is not clobbered.
                t = d['article']
                for i, label in enumerate(d['answers']):
                    answer = d['options'][i][ans2idx[label]]
                    t = t.replace('_', answer, 1)
                newtext, dists, answers = choose_and_blank(t, len(d['answers']))
                d['article-ncloze-c'] = newtext
                d['options-ncloze-c'] = dists
                d['answers-ncloze-c'] = answers

                # The 'p'/'d' slots of the file name carry the two swept values.
                with open(base + '.ncloze-c.p%f.d%f.json'%(a, b), 'w') as o:
                    o.write(json.dumps(d))


## Hyperparameter tuning

In [None]:
def prepare(d):
    """Turn one multi-blank cloze record into single-blank examples.

    For each blank of ``d['article']`` (blanks are written as ``_``), a copy of
    the passage is produced in which that blank becomes ``<mask>`` and every
    other blank is filled with its correct option, looked up through the
    module-level ``ans2idx`` letter-to-position map.

    Args:
        d: Record with keys ``'article'``, ``'options'`` (one option list per
            blank) and ``'answers'`` (one answer letter per blank).

    Returns:
        A list with one ``(passage, distractors, answer)`` tuple per blank,
        where ``distractors`` is that blank's option list without the answer.
    """
    examples = []
    for blank_idx, label in enumerate(d['answers']):
        passage = d['article']
        for opt_idx, choices in enumerate(d['options']):
            if opt_idx != blank_idx:
                # Any other blank is filled with its own correct option.
                fill = choices[ans2idx[d['answers'][opt_idx]]]
            else:
                fill = '<mask>'
            # Blanks are consumed left to right, one per option list.
            passage = passage.replace('_', fill, 1)

        choices = d['options'][blank_idx]
        correct_pos = ans2idx[label]
        correct = choices[correct_pos]
        wrong = choices[:correct_pos] + choices[correct_pos + 1:]
        examples.append((passage, wrong, correct))
    return examples

In [None]:
#import os
#import json
#import csv
#from random import random, shuffle

# Mapping between answer letters and option positions, in both directions.
ans2idx = {'A':0, 'B':1, 'C':2, 'D':3}
idx2ans = {0:'A', 1:'B', 2:'C', 3:'D'}

# Collect (masked passage, distractors, answer) examples from the CLOTH
# validation split.
data = []
for levl in ['middle', 'high']:
    prfx = './CLOTH/valid/%s/'%(levl)
    # os.listdir() returns names in arbitrary order; sort them so that the
    # strided samples taken from `data` in later cells are reproducible.
    for file in sorted(os.listdir(prfx)):
        # Context manager so one handle is not leaked per file.
        with open(prfx + file) as f:
            d = json.load(f)
        # Passages containing the literal '(1)' are left out.
        # NOTE(review): presumably these use numbered blanks instead of '_' — confirm.
        if '(1)' in d['article']:
            print("Skipping %s..."%file)
        else:
            data.extend(prepare(d))


In [None]:
data[0]

In [None]:
sms_t = sms

In [None]:
# For every 10th example in `data`, collect:
#   sms      - a (paragraph-context, sentence-context) pair of get_softmax_logits() outputs
#   embs_val - get_emb() outputs for each distractor, followed by the answer
sms = []
embs_val = []
i = 0
for pair in data[::10]:
    # Progress counter; prints every 1000 processed examples.
    i += 1
    if not i % 1000:
        print(i)
    (text, ds, answer) = pair
    # NOTE(review): `doc` is not used below (the only line that read it is
    # commented out) — confirm whether this nlp() call is still needed.
    doc = nlp(text)
    #sents = [sentence.text for sentence in doc.sentences]
    sents = nltk_sent_toker.tokenize(text)
    # Index of the first sentence containing the blank; raises IndexError if
    # no sentence contains '<mask>'.
    msk_snt_idx = [i for i in range(len(sents)) if '<mask>' in sents[i]][0]
    just_masked_sentence = sents[msk_snt_idx]
    
    prv_snts = sents[:msk_snt_idx]
    nxt_snts = sents[msk_snt_idx+1:]
    
    # If the masked sentence has fewer than MIN_SENT_WORDS space-separated
    # words, first prepend the preceding sentence when there is one.
    if len(just_masked_sentence.split(' ')) < MIN_SENT_WORDS and len(prv_snts):
        just_masked_sentence = ' '.join([prv_snts.pop(), just_masked_sentence])
    
    # While still too short, grow the context: a random draw decides between
    # prepending the nearest previous sentence and appending the nearest next
    # one. An iteration may add nothing (draw >= 0.5 with no next sentence
    # left); the loop then simply draws again.
    while len(just_masked_sentence.split(' ')) < MIN_SENT_WORDS and (len(prv_snts) or len(nxt_snts)):
        if T.rand(1) < 0.5 and len(prv_snts):
            just_masked_sentence = ' '.join([prv_snts.pop(), just_masked_sentence])
        elif len(nxt_snts):
            just_masked_sentence = ' '.join([just_masked_sentence, nxt_snts.pop(0)])
    
    #tiled = just_masked_sentence
    #while len(tiled) < len(text):
    #    tiled += ' ' + just_masked_sentence
    #just_masked_sentence = tiled
    
    #print(just_masked_sentence)
    # Encode the whole passage and the (possibly extended) masked sentence.
    toks_para = toker.encode(text)
    toks_sent = toker.encode(just_masked_sentence)
    # NOTE(review): presumably the softmax over the vocabulary at the masked
    # position — confirm against get_softmax_logits().
    sm_para = get_softmax_logits(toks_para, n_masks = 1, sub_ids = [])
    sm_sent = get_softmax_logits(toks_sent, n_masks = 1, sub_ids = [])
    sms.append((sm_para, sm_sent))
    
    # Embeddings for each distractor and then the answer; [1:-1] drops the
    # first and last encoded ids (presumably the special tokens — confirm).
    es = []
    for dist in ds:
        #print(dist, toker.encode(dist)[1:-1])
        es.append(get_emb(toks_para, toker.encode(dist)[1:-1]))
    es.append(get_emb(toks_para, toker.encode(answer)[1:-1]))
    embs_val.append(es)

print(sms[:5])

In [None]:
embs_val

In [None]:
# Sum of negated cosine similarities over every unordered pair of embeddings
# within each question, then averaged over the number of questions.
tot = 0
for i, question_embs in enumerate(embs_val):
    for j in range(len(question_embs)):
        later = question_embs[j]
        for k in range(j):
            earlier = question_embs[k]
            tot += -cos(later, earlier)

tot /= len(embs_val)

In [None]:
tot

In [None]:
# Sweep BETA over `bs` on every 10th validation example. Per value, accumulate
# the negated pairwise cosine similarity among the selected distractors plus
# the answer (totals) and the number of selected distractors that also appear
# among the reference distractors (correct).
ALPHA = 0.3
DISTRACTOR_POOL = 32
DEBUG_OUTPUT = False
bs = [.1, 0.3, 1, 3, 10]
totals = dict.fromkeys(bs, 0)
correct = dict.fromkeys(bs, 0)

for datum in data[::10]:
    text, answer = datum[0], datum[2]
    sent_probs, para_probs, embs, emb_ans, distractors = candidates(text, answer)

    # The answer embedding goes last so it can be addressed as len(embs)-1.
    embs += [emb_ans]
    for b in bs:
        BETA = b

        indices = anneal(sent_probs, para_probs, embs, emb_ans, distractors, 3, answer)
        indices += [len(embs)-1]
        for j in range(1, len(indices)):
            emb_j = embs[indices[j]]
            for k in range(j):
                totals[b] += - cos(emb_j, embs[indices[k]])
        # Drop the trailing answer index before mapping back to words.
        dists = [distractors[x] for x in indices[:-1]]
        for dist in dists:
            if dist not in datum[1]:
                continue
            correct[b] += 1

for b in bs:
    print(b, totals[b]/10)

correct

In [None]:
i = 0
for datum in data[::100]:
    i += 1
print(i)

In [None]:
print(data[2])

In [None]:
len(sms)

In [None]:
# Small constant added before log() so masked-out (zero) probabilities do not
# produce -inf.
e=1e-10

# `sms` was collected from data[::10], so the reference distractors must be
# read with the same stride; indexing data[i] directly would pair sms[i] with
# an unrelated example.
sampled = data[::10]

# For each ALPHA, compute the mean reciprocal rank of the reference
# distractors under the score log p_sent - ALPHA * log p_para.
for a in [0.1, 0.3, 1, 3, 10]:
    mrr = 0
    for (sm_para, sm_sent), datum in zip(sms, sampled):
        # Named `scores` so the cell does not shadow the score() function.
        scores = (sm_sent[0]*word_mask+e).log() - a * (sm_para[0]*word_mask+e).log()
        # argsort alone returns the sorted order of token ids, not ranks.
        # Applying it twice on the descending order gives each token id its
        # 0-based rank; +1 makes the best-scoring token rank 1.
        ranks = T.argsort(T.argsort(scores, dim=0, descending=True), dim=0).squeeze()+1
        for dist in datum[1]:
            # Only the first sub-token of the distractor is looked up.
            tid = toker.encode(dist)[1]
            mrr += 1./ranks[tid]
    # Three reference distractors per example.
    mrr /= 3*len(sms)
    print(a, mrr)


## Distractor scoring

In [None]:
def score(text, answer, dists):
    """Score a set of distractors for one masked passage.

    Args:
        text: Passage containing a ``<mask>`` placeholder.
        answer: The correct word for the blank.
        dists: Distractor words to score.

    Returns:
        A tuple ``(phis, Phis, delta)``:
        ``phis`` holds, per distractor, the log of the mean sentence-context
        value at the distractor's token ids; ``Phis`` holds the negated log of
        the corresponding paragraph-context mean; ``delta`` is the sum of
        ``1 - cos`` over all unordered pairs of the distractor embeddings plus
        the answer embedding.

    Raises:
        IndexError: if no sentence of *text* contains ``<mask>``.
    """
    embs = []
    
    phis = []
    Phis = []

    # NOTE(review): `doc` is not used below (the only line that read it is
    # commented out) — confirm whether this nlp() call is still needed.
    doc = nlp(text)
    #sents = [sentence.text for sentence in doc.sentences]
    sents = nltk_sent_toker.tokenize(text)
    # Index of the first sentence containing the blank.
    msk_snt_idx = [i for i in range(len(sents)) if '<mask>' in sents[i]][0]
    just_masked_sentence = sents[msk_snt_idx]

    prv_snts = sents[:msk_snt_idx]
    nxt_snts = sents[msk_snt_idx+1:]

    # If the masked sentence has fewer than MIN_SENT_WORDS space-separated
    # words, first prepend the preceding sentence when there is one.
    if len(just_masked_sentence.split(' ')) < MIN_SENT_WORDS and len(prv_snts):
        just_masked_sentence = ' '.join([prv_snts.pop(), just_masked_sentence])

    # While still too short, a random draw decides between prepending the
    # nearest previous sentence and appending the nearest next one, so the
    # sentence context (and therefore phis) is not deterministic for short
    # sentences.
    while len(just_masked_sentence.split(' ')) < MIN_SENT_WORDS and (len(prv_snts) or len(nxt_snts)):
        if T.rand(1) < 0.5 and len(prv_snts):
            just_masked_sentence = ' '.join([prv_snts.pop(), just_masked_sentence])
        elif len(nxt_snts):
            just_masked_sentence = ' '.join([just_masked_sentence, nxt_snts.pop(0)])

    #tiled = just_masked_sentence
    #while len(tiled) < len(text):
    #    tiled += ' ' + just_masked_sentence
    #just_masked_sentence = tiled

    #print(just_masked_sentence)
    toks_para = toker.encode(text)
    toks_sent = toker.encode(just_masked_sentence)
    # NOTE(review): presumably the softmax over the vocabulary at the masked
    # position — confirm against get_softmax_logits().
    sm_para = get_softmax_logits(toks_para, n_masks = 1, sub_ids = [])
    sm_sent = get_softmax_logits(toks_sent, n_masks = 1, sub_ids = [])

    for dist in dists:
        # [1:-1] drops the first and last encoded ids (presumably the special
        # tokens — confirm); a multi-token distractor is averaged over its ids.
        tok_dist = toker.encode(dist)[1:-1]
        phis.append(float(sm_sent[0][tok_dist].mean().log()))
        # Note the sign: the paragraph-context term is stored negated.
        Phis.append(float(-sm_para[0][tok_dist].mean().log()))
        #print(dist, toker.encode(dist)[1:-1])
        embs.append(get_emb(toks_para, tok_dist))
    
    # The answer embedding is appended last, so the pairwise sum below covers
    # distractor-distractor and distractor-answer pairs alike.
    embs.append(get_emb(toks_para, toker.encode(answer)[1:-1]))
    delta = 0.
    for i in range(len(embs)):
        for j in range(i):
            delta += 1. - float(cos(embs[i], embs[j]))

    return phis, Phis, delta

In [None]:
np.sum([0, 1,2, 3])

In [None]:
# Condition name -> key of the article text (with '_' blanks) that the
# condition's options and answers refer to.
cnd2art = {
    'cloth':'article',
    'cdgp-m':'article-cdgp-m',
    't5multi-m':'article-cdgp-m',
    'ncloze-m':'article-cdgp-m',
    'ncloze':'article-ncloze'
}

with open('compiled.json') as f:
    d = json.load(f)

labl=['A', 'B', 'C', 'D']

# Write one row per distractor to scores-dist.tsv and one row per question to
# scores-ques.tsv. Context managers make sure both files are flushed and
# closed even if scoring fails part-way through.
with open('scores-dist.tsv', 'w') as od, open('scores-ques.tsv', 'w') as oq:
    for pasg in d:
        print(pasg)
        for cond in cnd2art:
            print('  ' + cond)
            # The original CLOTH fields carry no suffix; every other condition
            # stores its fields under '<field>-<condition>'.
            key = ('' if cond == 'cloth' else '-'+cond)
            anss = d[pasg]['answers' + key]
            opts = d[pasg]['options' + key]
            arti = d[pasg][cnd2art[cond]]
            for i, ans in enumerate(anss):
                qkey = '%s-%s-%d'%(pasg, cond, i)
                t = arti
                aidx = ans2idx[ans]
                dists = opts[i][:aidx]+opts[i][aidx+1:]
                # Mask blank i and fill every other blank with its answer.
                for j, opt in enumerate(opts):
                    if j == i:
                        t = t.replace('_', '<mask>', 1)
                    else:
                        t = t.replace('_', opt[ans2idx[anss[j]]], 1)
                # Score only the '<br/><br/>'-separated segment that holds the
                # blank. Every segment is searched (not just the first two),
                # and the whole text is kept if none contains the mask.
                t = next((seg for seg in t.split('<br/><br/>') if '<mask>' in seg), t)
                phis, Phis, delta = score(t, opts[i][aidx], dists)
                # Letters of the distractor positions, in option order.
                labls = labl[:aidx]+labl[aidx+1:]
                for j in range(len(phis)):
                    od.write('\t'.join([qkey+'-'+labls[j], str(phis[j]), str(Phis[j])])+'\n')
                oq.write('\t'.join([qkey, str(np.sum(phis)), str(np.sum(Phis)), str(delta)])+'\n')


# CDGP

In [None]:
from tqdm import tqdm
import os
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import numpy as np
import fasttext
import nltk
from nltk.tokenize import word_tokenize
import json
from huggingface_hub import hf_hub_download

In [None]:
# Global variables for the CDGP baseline
# Model id passed to BertTokenizer / BertForMaskedLM .from_pretrained().
CSG_MODEL = "AndyChiang/cdgp-csg-bert-cloth"
# Path passed to fasttext.load_model().
DS_MODEL = "./cdgp-ds-fasttext.bin"
# Used both as the fill-mask pipeline's top_k (size of the candidate set) and
# as the number of distractors returned by generate_dis().
# NOTE(review): with both set to the same value the final ranking can only
# reorder the candidates, never filter them — confirm this is intended.
TOP_K = 3
# NOTE(review): not referenced in the visible part of this notebook.
STOP_WORDS = ["[MASK]", "[SEP]", "[PAD]", "[CLS]"]
# Weights of the four candidate scores combined in generate_dis():
# s0 confidence, s1 word-embedding, s2 sentence-embedding, s3 POS match.
WEIGHT = {"s0": 0.6, "s1": 0.15, "s2": 0.15, "s3": 0.1}
# WEIGHT = {"s0": 0.25, "s1": 0.25, "s2": 0.25, "s3": 0.25}
# NOTE(review): not referenced in the visible part of this notebook.
QUESTION_LIMIT = 1000

In [None]:
# Load the CSG (candidate set generator) tokenizer and masked-LM weights
print(f"Load CSG model at {CSG_MODEL}...")
tokenizer = BertTokenizer.from_pretrained(CSG_MODEL)
csg_model = BertForMaskedLM.from_pretrained(CSG_MODEL)

# Create an unmasker: a fill-mask pipeline returning TOP_K predictions per call
unmasker = pipeline('fill-mask', tokenizer=tokenizer, model=csg_model, top_k=TOP_K)

# Load the DS (distractor selector) fastText model from disk
print(f"Load DS model at {DS_MODEL}...")
ds_model = fasttext.load_model(DS_MODEL)

In [None]:

# Generate distractors of one question
def generate_dis(unmasker, ds_model, sent, answer):
    """Generate distractors for a single ``[MASK]`` question.

    Candidates come from ``unmasker(sent + " [SEP] " + answer)`` and are ranked
    by a weighted sum (module-level ``WEIGHT``) of four scores:

    * s0 - min-max normalised confidence reported by the unmasker;
    * s1 - 1 minus the normalised cosine similarity between the answer and
      candidate word vectors;
    * s2 - 1 minus the normalised cosine similarity between the sentence
      vectors of *sent* filled with the answer and with the candidate;
    * s3 - 1.0 when the candidate gets the same POS tag as the answer at the
      mask position, else 0.0.

    Args:
        unmasker: Callable returning predictions that carry ``"token_str"``
            and ``"score"`` entries.
        ds_model: Object providing ``get_word_vector`` and
            ``get_sentence_vector``.
        sent: Text containing the ``[MASK]`` token.
        answer: Correct word for the mask.

    Returns:
        The ``TOP_K`` highest-scoring candidate words, best first.
    """
    # Answer relating: condition the prediction on the correct answer.
    prompt = sent + " [SEP] " + answer

    # Candidate set: predicted tokens with spaces removed; empty ones skipped.
    words = []
    confidences = []
    for pred in unmasker(prompt):
        token = pred["token_str"].replace(" ", "")
        if not token:
            continue
        words.append(token)
        confidences.append(pred["score"])

    # s0: confidence score.
    s0 = min_max_y(confidences)

    # s1: word-embedding dissimilarity to the answer.
    answer_vec = ds_model.get_word_vector(answer)
    word_sims = [similarity(answer_vec, ds_model.get_word_vector(w)) for w in words]
    s1 = [1-v for v in min_max_y(word_sims)]

    # s2: sentence-embedding dissimilarity to the correctly filled sentence.
    gold_sent = sent.replace('[MASK]', answer)
    gold_vec = ds_model.get_sentence_vector(gold_sent)
    filled_sents = [sent.replace('[MASK]', w) for w in words]
    sent_sims = [
        similarity(gold_vec, ds_model.get_sentence_vector(filled))
        for filled in filled_sents
    ]
    s2 = [1-v for v in min_max_y(sent_sims)]

    # s3: POS match. The tokenizer splits "[MASK]" into "[", "MASK", "]";
    # after removing the first "[" and "]" the index of "MASK" is used as the
    # token position of the blank in the filled sentences.
    tokens = word_tokenize(sent)
    for bracket in ("[", "]"):
        tokens.remove(bracket)
    mask_at = tokens.index("MASK")

    answer_tag = nltk.pos_tag(word_tokenize(gold_sent))[mask_at][1]
    s3 = [
        1.0 if nltk.pos_tag(word_tokenize(filled))[mask_at][1] == answer_tag else 0.0
        for filled in filled_sents
    ]

    # Weighted final score, ranked from highest to lowest (stable for ties).
    scored = [
        (w, WEIGHT["s0"]*a + WEIGHT["s1"]*b + WEIGHT["s2"]*c + WEIGHT["s3"]*d)
        for w, a, b, c, d in zip(words, s0, s1, s2, s3)
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return [w for w, _ in ranked[:TOP_K]]

def CDGP(text, answer):
    """Run generate_dis() on *text*/*answer* with the module-level models."""
    distractors = generate_dis(unmasker, ds_model, sent=text, answer=answer)
    return distractors

# Cosine similarity
def similarity(v1, v2):
    """Return the cosine similarity of *v1* and *v2*.

    When either vector has zero norm the cosine is undefined; 1 is returned
    in that case instead of dividing by zero.
    """
    norm_a, norm_b = np.linalg.norm(v1), np.linalg.norm(v2)
    if norm_a != 0 and norm_b != 0:
        return np.dot(v1, v2) / (norm_a * norm_b)
    # Denominator would be zero.
    return 1

# Min–max normalization
def min_max_y(raw_data):
    """Rescale ``raw_data`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        raw_data: sequence of numbers.

    Returns:
        A new list with one normalized value per input value. An empty input
        yields an empty list. When every value is identical (so the range is
        zero and the usual formula would divide by zero) every entry is 0.0.
    """
    if len(raw_data) == 0:
        return []

    # Compute the bounds once instead of once per element.
    lowest = min(raw_data)
    span = max(raw_data) - lowest

    # All values equal: there is no spread to normalize over.
    if span == 0:
        return [0.0 for _ in raw_data]

    return [(d - lowest) / span for d in raw_data]

In [None]:
# Quick manual check of CDGP on a single '[MASK]' example.
# Long sample passage; it is overwritten by the short sentence assigned right
# below, so only that short sentence is actually passed to CDGP.
text = """Perhaps because I was city kid, my exposure to wildlife was limited. That changed when I moved to the wooded hills of Oregon [MASK] years later. For the first time, I met animal communities. One evening, a nursing raccoon with four kids appeared. She extended her tiny paw as if asking for some food. I was attracted by their cuteness, so I instantly put out a serving of fresh cat food and water. She returned the next evening. And the next. All was well until the wildlife began behaving wildly. The raccoons started crying noisily. They could be heard throughout the entire valley. A few days later, our homeowners association newsletter arrived in the mail. Among the usual announcements of garage sales came a gentle reminder that feeding the wildlife was not a(n) suitable thing to do. My face became red with embarrassment as I read the letter. I'd been found out! I was now identified as the trouble maker! I went downstairs to discuss the matter with my husband. "I'm not surprised that the association has come up with a policy about it. They must have gotten complaints," he said. "OK, I'm going to stop feeding the animals," I said. Although I told myself that the wildlife around me would survive without cat food, I felt guilty. Late that night, I walked slowly into the kitchen for a snack. Then a scene outside attracted my attention: There, on the hillside, was my neighbor. She was feeding two deer in the cold."""
# Short example that replaces the passage above.
text = "They would never [MASK] to see their family again."
# Correct word for the '[MASK]' slot.
answer = 'want'
ds = CDGP(text, answer)
# Bare expression: displays the generated distractors as the notebook cell output.
ds

# Methods to Choose and Preprocess Chosen Target Term(s) from a Full Text

In [None]:
# def create_spaced_multiple_questions_with_choices_from_sentences(text, index_of_starting_word=20, number_of_distractors=4, interval_between_words=INTERVAL_BETWEEN_WORDS, max_number_of_questions = MAX_QUESTIONS_BY_N):
#
# NOTE(review): this is the only live statement in this otherwise
# commented-out cell; it binds a module-level ``blanked_texts`` dict.
# Presumably it was left uncommented by accident -- confirm nothing else in
# the notebook reads this global before commenting it out.
blanked_texts = {}
#     number_of_questions = 0
#     text = text.strip()
    
#     # Create list of all words with punctuation
#     all_words = text.split(' ')
#     full_masked_words = all_words.copy()
#     for j in range(index_of_starting_word, len(all_words), interval_between_words):
        
#         # Create new list of full words and replace ONLY ONE target term in it
#         words = all_words.copy()
        
#         # Replace target term with a mask
#         target_term = words[j]
#         # Check for punctuation and then replace target term (without punctuation) with a mask
#         if target_term.strip(punctuation) != target_term:
        
#             # Only mask the part of the word not connected to punctuation
#             split_list = re.split("(" + str(target_term.strip(punctuation)) + ")", target_term)
#             for i in range(len(split_list)):
#                 if split_list[i] == target_term.strip(punctuation):
#                     target_term = split_list[i]
#                     split_list[i] = toker.mask_token
#                     break
#             words[j] = ''.join(split_list)
#             # Replace and accumulate ALL target terms in full_masked_words
#             full_masked_words[j] = ''.join(split_list)
#         else:
#             words[j] = toker.mask_token
#             # Replace and accumulate ALL target terms in full_masked_words
#             full_masked_words[j] = toker.mask_token
            
#         number_of_questions += 1
        
        
#         blanked_text = ' '.join(words)
#         blanked_text = blanked_text + " "
#         blanked_text = blanked_text.replace('nothingtoseehere. ', '')
#         blanked_text = blanked_text.strip()
#         list_of_choices = create_distractors(blanked_text, target_term, number_of_distractors)
#         print('Distractors chosen after annealing:', list_of_choices)
            
#         # Insert target term (correct answer) in front
#         list_of_choices.insert(0, target_term)
        
#         blanked_texts[number_of_questions] = list_of_choices
#         if len(blanked_texts) >= max_number_of_questions:
#             break
        
#     full_masked_text = ' '.join(full_masked_words)
#     return pd.Series([number_of_questions, blanked_texts, full_masked_text])

In [None]:
def create_multiple_questions_with_choices_from_sentences_by_distractors(row, number_of_distractors=3):
    """Build the answer choices for every masked word of one passage.

    For each space-separated token of ``row['masked_text']`` that contains
    ``"<mask>"``, the token at the same position in ``row['text']`` is taken
    as the target term. A copy of the full text is made in which only that
    term (without surrounding punctuation) is replaced by ``toker.mask_token``,
    and distractors are generated for it.

    Args:
        row: mapping / DataFrame row with ``'text'`` (original passage) and
            ``'masked_text'`` (same passage with ``<mask>`` placeholders).
            The two are assumed to align token-for-token when split on single
            spaces -- TODO confirm against how ``masked_text`` is produced.
        number_of_distractors: forwarded to ``anneal`` together with the
            target term.

    Returns:
        ``pd.Series([number_of_questions, blanked_texts])`` where
        ``blanked_texts`` maps the 1-based question number to its list of
        choices, with the target term (correct answer) inserted first.
    """
    blanked_texts = {}
    number_of_questions = 0
    text = row['text'].strip()
    all_masks_text = row['masked_text']

    # Create list of all words with punctuation
    all_words = text.split(' ')
    all_masked_words = all_masks_text.split(' ')
    for word_index, word in enumerate(all_masked_words):

        # Only for masked words
        if "<mask>" not in word:
            continue

        # Create new list of full text and replace ONLY ONE target term in it
        all_words_copy = all_words.copy()
        target_term = all_words_copy[word_index]

        # Check for punctuation and then replace target term (without punctuation) with a mask
        bare_term = target_term.strip(punctuation)
        if bare_term != target_term:
            # Only mask the part of the word not connected to punctuation.
            # The term is escaped so that regex metacharacters inside it
            # (e.g. "a(n" or "U.S") are matched literally instead of raising
            # re.error or splitting at the wrong place.
            split_list = re.split("(" + re.escape(bare_term) + ")", target_term)
            for i, piece in enumerate(split_list):
                if piece == bare_term:
                    target_term = piece
                    split_list[i] = toker.mask_token
                    break
            all_words_copy[word_index] = ''.join(split_list)
        else:
            all_words_copy[word_index] = toker.mask_token

        number_of_questions += 1
        one_mask_text = ' '.join(all_words_copy).strip()

        # NOTE(review): the result of create_distractors is concatenated with a
        # tuple and passed to anneal as ONE argument; this presumably relies on
        # create_distractors returning a tuple -- confirm against its definition.
        list_of_choices = anneal(create_distractors(one_mask_text, target_term) + (number_of_distractors, target_term))
        print('Distractors chosen after annealing:', list_of_choices)

        # Insert target term (correct answer) in front
        list_of_choices.insert(0, target_term)

        blanked_texts[number_of_questions] = list_of_choices
    return pd.Series([number_of_questions, blanked_texts])

# Generate Cloze Tasks with Adaptation Texts

In [None]:
# Import dataframe
# Reads the source CSV from the current working directory; the next cells
# use its "pmid", "text" and "masked_text" columns.
df = pd.read_csv("Cloth_Text_Distractors.csv")

In [None]:
# Frame used as input for creating the Turker Task: only the columns that
# are needed ("pmid", "text", "masked_text") are kept from df.
turker_input_df = df[["pmid", "text", "masked_text"]]

In [None]:
# Keep only the first row. The explicit copy detaches the result from the
# frame it was sliced from, so the assignments below (and the columns added
# later) modify an independent frame instead of a slice, which would raise
# pandas' SettingWithCopyWarning.
turker_input_df = turker_input_df.head(1).copy()

# Override the masked passage of the row labelled 0 with a hand-written
# version that contains a single <mask>.
turker_input_df.loc[0, 'masked_text'] = """
The Healing Jim and his wife, Connie, were shocked by the loss of their four-month-old son--Joshua, whose life was taken by SIDS--sudden infant death syndrome. Thirty hours ago Jim drove to the baby-sitter's home to pick up Joshua. It was a routine trip, like the one he made five days every week. He arrived, and little Joshua could not be awakened from his nap. The next few hours were a time of life and death: the racing ambulance, swift-moving doctors and nurses....but 12 hours later, at children's Hospital, though the doctors had exhausted all  attempts, little Joshua was gone. Yes, they wanted all of Joshua's usable organs to be donated. That was not a difficult decision for Jim and Connie, a loving and giving couple. The next morning dawned and many things had to be arranged. Telephone calls and funeral plans. At one point Jim realized he needed a haircut. When Jim settled into the chair at the barber's, he began to reflect on the past hours, trying to make some sense of it all. Why had Joshua, their first-born, the child they had waited so long for, been taken so soon....he had barely begun his life....The question kept coming, and the pain in Jim's heart just enveloped him. While talking with the barber, Jim mentioned the organ donations, looking at his watch: "They are transplanting one of his heart valves right now." The hairdresser stopped and stood motionless. Finally she spoke, but it was only a whisper. "You're not going to believe this....but about an hour ago the customer sitting in this chair wanted me to hurry so she could get to Children's Hospital. She left here so full of joy....her prayers had been answered. Today her baby granddaughter is receiving a desperately needed transplant--a heart <mask>." Jim's healing began.
"""

# turker_input_df.loc[1, 'masked_text'] = """
# In 1977, a dead author of detective stories saved the life of a 19-month-old baby in a most unusual way. The author was Agatha Christie, one of the most successful writers of detective stories in the world. In June 1977, a baby girl became seriously ill in Qatar, near Saudi Arabia. Doctors were unable to <mask> the cause of her illness, so she was flown to London and admitted to Hammersmith Hospital, where specialist help was <mask>. She was then only half-conscious and on the "Dangerously Ill" list. A team of doctors hurried to examine the baby only to discover that they, too, were puzzled by the very unusual symptoms. While they were discussing the baby's case, a nurse asked to speak to them. "Excuse me," said nurse Marsha Maitland, "but I think the baby is suffering from thallium poisoning." "What makes you think that?" Dr. Brown asked. "Thallium poisoning is extremely rare." "A few days ago, I was reading a novel called by Agatha Christie," Nurse Maitland explained. "In the book, somebody uses thallium poison, and all the symptoms are described. They are exactly the same as the baby's." "You're very observant and you may be right," another doctor said. "We'll carry out some tests and find out whether it's thallium or not." The tests showed that the baby had indeed been poisoned by thallium, a rare metal used in making optical glass. Once they knew the cause of illness, the doctors were able to give the correct treatment. The baby soon recovered and was sent back to Qatar. Inquiries showed that the poison might have come from an insecticide used in Qatar.
# """

In [None]:
number_of_choices = NUM_OF_CHOICES

# Create columns with output of cloze_question_distractor_generator method.
# Each choice list holds the correct answer plus the distractors, so the
# number of distractors is derived from number_of_choices instead of relying
# on the function's default (which only matches when NUM_OF_CHOICES is 4).
turker_input_df[['number_of_questions', 'full_nth_info']] = turker_input_df.apply(
    lambda row: create_multiple_questions_with_choices_from_sentences_by_distractors(
        row, number_of_distractors=number_of_choices - 1
    ),
    axis=1,
)

# Create blanked version of masked_text texts
turker_input_df['blanked_text'] = turker_input_df['masked_text'].apply(lambda x: x.replace("<mask>", "_"))

# Find new maximum of questions
MAX_QUESTIONS_BY_N = turker_input_df['number_of_questions'].max()

# Create new dataframe that is just lists of all the answer choices
# (one column per question number, keyed 1..N by the dict in 'full_nth_info')
all_answer_choices_df = turker_input_df['full_nth_info'].apply(lambda x: pd.Series(x))

# Create column names to replace old column names
column_names_to_add = ['answer_choices_word' + str(i) for i in all_answer_choices_df.columns]
keys_to_add = list(range(1, MAX_QUESTIONS_BY_N+1))

# Rename dataframe
renamed_all_answer_choices_df = all_answer_choices_df.rename(columns=dict(zip(keys_to_add, column_names_to_add)))

#turker_input_df[list(renamed_all_answer_choices_df.columns)] = renamed_all_answer_choices_df
for chosen_word_number in range(MAX_QUESTIONS_BY_N):
    # Column-name pieces shared by every column created for this question
    word_suffix = '_word' + str(chosen_word_number+1)
    answer_choices_column = 'answer_choices' + word_suffix

    # Fill all empty values with lists
    renamed_all_answer_choices_df[answer_choices_column] = renamed_all_answer_choices_df[answer_choices_column].apply(
        lambda d: d if isinstance(d, list) else [''] * number_of_choices
    )

    # Add renamed column of answer choices to turker_input_df
    turker_input_df[answer_choices_column] = renamed_all_answer_choices_df[answer_choices_column]

    # Expand each list of choices into one column per position; position 0 is
    # the correct answer (it was inserted first when the list was built).
    all_answers_df = pd.DataFrame(renamed_all_answer_choices_df[answer_choices_column].tolist(), index=renamed_all_answer_choices_df.index)
    turker_input_df['correct_answer' + word_suffix] = all_answers_df[0]
    # NOTE: the choice_* columns are appended consecutively on purpose; the
    # shuffle below addresses them as one contiguous positional slice.
    for i in range(number_of_choices):
        turker_input_df['choice_' + str(i+1) + word_suffix] = all_answers_df[i]


# Reset index (so that iterrows() labels equal positional row numbers below)
turker_input_df = turker_input_df.reset_index(drop=True)

for chosen_word_number in range(MAX_QUESTIONS_BY_N):
    # Shuffle the answers of this question independently for every row
    word_suffix = '_word' + str(chosen_word_number+1)
    columns_to_shuffle = ["choice_" + str(x+1) + word_suffix for x in range(number_of_choices)]
    start_columms_indices_to_shuffle = turker_input_df.columns.get_loc("choice_1" + word_suffix)
    end_column_index_to_shuffle = start_columms_indices_to_shuffle + len(columns_to_shuffle)
    for index, row in turker_input_df[columns_to_shuffle].iterrows():
        # A fresh seed is drawn from `random` for every row
        shuffled_choices = row.sample(frac=1, random_state=random.randint(1, 500)).reset_index(drop=True)
        turker_input_df.iloc[index, start_columms_indices_to_shuffle:end_column_index_to_shuffle] = shuffled_choices

# Create new series that is just lists of all the texts
# (the text fragments before, between and after the <mask> placeholders)
masked_text_split_list_series = turker_input_df['masked_text'].apply(lambda x: x.split('<mask>'))

# Create dataframe that is just the texts 
all_text_df = pd.DataFrame(masked_text_split_list_series.tolist(), index=masked_text_split_list_series.index)

for chosen_word_number in range(MAX_QUESTIONS_BY_N):
    # Fragment that precedes the blank of this question
    turker_input_df['text_for_word' + str(chosen_word_number+1)] = all_text_df[chosen_word_number]

# Fragment that follows the last blank
if MAX_QUESTIONS_BY_N > 0:
    turker_input_df['text_for_wordfinal'] = all_text_df[MAX_QUESTIONS_BY_N]

# Fill empty rows with empty spaces 
turker_input_df = turker_input_df.fillna('')

turker_input_df.to_csv('Cloth_Text_nCloze_Distractors.csv', index=False)