# load data 

In [1]:
def splitTolenWords(text, nwords):
    words = text.split(" ")
    count = int(len(words)/nwords)
    sentences = [" ".join(words[(i*nwords):((i+1)*nwords)]) for i in range(count)]
    return(sentences)

In [2]:
with open("data_splits/trumpGeneratorTextTrain.txt", encoding = "utf-8") as f:
    gentrain = f.read()
    
with open("data_splits/trumpGeneratorTextValid.txt", encoding = "utf-8") as f:
    genvalid = splitTolenWords(f.read(), 100)

# load gpt-2 tokenizer 

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokens = tokenizer(gentrain)["input_ids"]

Token indices sequence length is longer than the specified maximum sequence length for this model (351805 > 1024). Running this sequence through the model will result in indexing errors


# create token - word relations 

In [4]:
import pandas as pd

def formdb(tokens, ngrams):
    grams = pd.DataFrame()
    n = 2
    for i in range(len(tokens) - n):
        i = tokens[i:i+n]

    grams = [tokens[i:i+n] for i in range(len(tokens) - n)]
    grams_mem = pd.DataFrame(grams)
    return grams_mem

In [5]:
grams_mem = formdb(tokens, 2)

# create function to sample sequences 

In [6]:
import numpy as np

def filter_db(db, tokens, iteration = 0):
    n = db.shape[1]
    tokens = tokens[-n:]
    filt = db
    for i in range(n-1 - iteration):
        filt= filt[filt[i+iteration]==tokens[i+iteration]]
    return(filt)

def temperature(dat, temp = 1):
    res = np.exp(dat*temp)
    return res/res.sum()

def top_k(dat, k = 0):
    if k == 0: 
        return dat
    else:
        return dat[:(k)]
    
def top_p(dattemp, p = 1):
    sump = 0
    #print(len(dattemp))
    for i in range(len(dattemp)):
        sump += dattemp.iloc[i]
        if sump > p:
            break

    res = dattemp[:(i+1)]
    res = res/sum(res)
    return res


def sample(tokens, db, temp = 1, k = 0, p = 1):
    n = db.shape[1]
    for i in range(n-1):
        dat = filter_db(db, tokens, i)[n-1].value_counts(normalize = True)
        if len(dat) != 0 :
            dat = temperature(dat,temp)  
            dat = top_k(dat,k)
            dat = top_p(dat,p)
            word = dat.to_frame().sample(weights = dat.to_frame()[n-1]).index[0]
            break
    if len(dat) == 0:
        dat = db[n-1].value_counts(normalize = True)
        #print(dat)
        word = dat.to_frame().sample(weights = dat.to_frame()[n-1]).index[0]
        #print(word)
    return(word)

In [7]:
def generator(db, ntokens, context_str, temp = 0.7, k = 5, p = 0.6):
    ls = tokenizer(context_str)["input_ids"]
    for i in range(ntokens):
        ls.append(sample(ls, db, temp = temp, k = k, p = p))
    return(tokenizer.decode(ls))

In [8]:
context = "something just happened"
len(generator(grams_mem, 130, context).split(" "))

108

# load metrics 

In [9]:
import spacy
import en_core_web_sm
from nltk.translate.bleu_score import sentence_bleu
from scipy.stats import ttest_ind

def unnameTextWithM(text):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    text = text
    for ent in reversed(doc.ents):
        text = text[:ent.start_char] + "<M>" + text[ent.end_char:] 
    return text

def bleu_scores(string, valid_data):
    valid = [i.split(" ") for i in valid_data]
    string = string.split(" ")[:100]
    bleu1 = sentence_bleu(valid, string ,weights = (0.25,0.25,0.25,0.25))
    return (bleu1)


In [10]:
import random
def randcontext(text,lensen):
    gentrainwords = text.split(" ")
    ran = len(gentrainwords)-lensen
    x = random.randint(0,ran)
    text = " ".join(gentrainwords[x:(x+lensen)])
    return text

In [11]:
def evaluate(genvalid, genstr):
    strM = unnameTextWithM(genstr)
    b = bleu_scores(strM, genvalid)
    return (b, )

# gridsearch params on validation dataset

In [12]:
import tqdm

In [13]:
ngrams = [2,3,4,5]
temp = [0.2,0.4,0.6,0.8,1]
k = [5,10,20,50]
p = [0.2,0.6,0.8,1.0]

results = []

bar = tqdm.tqdm(total = len(ngrams)* len(temp) * len(k) * len(p))
for n in ngrams:
    for t in temp:
        for ki in k:
            for pi in p:
                #print(n,t,ki,pi)
                grams_mem = formdb(tokens, n)
                for i in range(10):
                    context = randcontext(gentrain,4)
                    strM = unnameTextWithM(generator(grams_mem, 130, context, temp = t, k = ki, p = pi))
                    b1 = bleu_scores(strM, genvalid)
                    results.append((((n,t,ki,pi),)+(b1,)))
                bar.update(1)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 320/320 [1:13:15<00:00, 13.36s/it]

In [14]:
results

[((2, 0.2, 5, 0.2), 2.422233260848818e-78),
 ((2, 0.2, 5, 0.2), 2.9733476227162733e-78),
 ((2, 0.2, 5, 0.2), 2.1745627492888135e-78),
 ((2, 0.2, 5, 0.2), 7.647640113249902e-155),
 ((2, 0.2, 5, 0.2), 0.06969152894039435),
 ((2, 0.2, 5, 0.2), 2.715237836048041e-78),
 ((2, 0.2, 5, 0.2), 0.09438911178038879),
 ((2, 0.2, 5, 0.2), 8.170202920075766e-155),
 ((2, 0.2, 5, 0.2), 0.07266624295241954),
 ((2, 0.2, 5, 0.2), 7.928119541282821e-155),
 ((2, 0.2, 5, 0.6), 0.08692120729941627),
 ((2, 0.2, 5, 0.6), 0.08017210986150819),
 ((2, 0.2, 5, 0.6), 0.0772649474264478),
 ((2, 0.2, 5, 0.6), 9.562522836930583e-155),
 ((2, 0.2, 5, 0.6), 2.318533773622696e-78),
 ((2, 0.2, 5, 0.6), 2.8020643606258e-78),
 ((2, 0.2, 5, 0.6), 0.07285932254387488),
 ((2, 0.2, 5, 0.6), 2.2183842328187483e-78),
 ((2, 0.2, 5, 0.6), 2.2832337628940824e-78),
 ((2, 0.2, 5, 0.6), 8.00233779990827e-155),
 ((2, 0.2, 5, 0.8), 1.9449880514994073e-78),
 ((2, 0.2, 5, 0.8), 2.871377811812501e-78),
 ((2, 0.2, 5, 0.8), 1.9813587107196492e-

In [15]:
resultspd = pd.DataFrame(results)

In [17]:
results = resultspd.groupby([0]).mean().reset_index()
results[results[1] == max(results[1])]

Unnamed: 0,0,1
176,"(4, 0.4, 5, 0.2)",0.058539


## generate test samples

In [18]:
def save_to_file(string, path):
    with open(path, 'w', encoding = "utf-8") as output_file:
        output_file.write(string)

In [22]:
grams_mem = formdb(tokens, 4)
samples = []
path = "generators/Ngram_texts/sampling_texts/"
filename = "4gram" 
for i in range(100): 
    context = randcontext(gentrain,4)
    string = generator(grams_mem, 130, context, temp = 0.4, k = 5, p = 0.2)
    save_path = path + filename + " " +  str(i)+ " " + ".txt"
    save_to_file(string, save_path)