In [13]:
import numpy as np
import torch
import pandas as pd
from transformers import PreTrainedTokenizerFast
import re
import spacy
# Shared spaCy English pipeline; used below for part-of-speech tags and for
# walking each token's syntactic children.
nlp = spacy.load("en_core_web_sm")

In [14]:
# Load the uncased BERT tokenizer; the visible cells only read its vocabulary.
# NOTE(review): several of these keyword arguments look like per-call encoding
# options rather than loading options -- confirm they have any effect here.
tokenizer_bert = PreTrainedTokenizerFast.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
    return_offsets_mapping=True,
    max_length=512,
    truncate=True,
    add_special_tokens=False,
    return_token_type_ids=False,
    return_attention_mask=False,
)

In [15]:
# Token -> id mapping re-ordered by ascending token id.
vocab_sorted = dict(sorted(tokenizer_bert.vocab.items(), key=lambda entry: entry[1]))

## Picking adjectives

In [16]:
# Keep only vocabulary entries consisting of two or more lowercase ASCII
# letters; anything starting with another character (e.g. '[' or '#') is dropped.
words = [token for token in vocab_sorted if re.match('[a-z]{2,}$', token)]
len(words)

21719

In [17]:
# Tag every candidate word in isolation and collect, in vocabulary order,
# the first 1000 words tagged NOUN and the first 2000 tagged ADJ.
nouns = []
adjs = []
for word in words:
    if len(nouns) >= 1000 and len(adjs) >= 2000:
        break  # both lists are full, so the remaining words cannot change them
    # Run the spaCy pipeline once per word instead of once per attribute access.
    first_token = nlp(word)[0]
    if first_token.pos_ == 'NOUN' and len(nouns) < 1000:
        nouns.append(first_token.text)
    elif first_token.pos_ == 'ADJ' and len(adjs) < 2000:
        adjs.append(first_token.text)

## Finding gradable adjectives

In [18]:
from collections import defaultdict
import textacy
import textacy.datasets
# textacy's Capitol Words dataset handle; its records() are scanned below for
# adjective occurrences. download() is called first so the data is available locally.
cw = textacy.datasets.CapitolWords()
cw.download()

In [19]:
# Accumulators for the corpus scan: every matching token (duplicates kept)
# and the set of distinct surface forms.
adjectives_encountered, unique_adjectives_encountered = [], set()

In [20]:
# Scan the corpus once and record every token whose surface form is one of
# the candidate adjectives. The spaCy tokens themselves are stored (not just
# their text) because a later cell inspects each token's syntactic children.
#
# `adjs` is a list at this point; a set makes each membership test O(1)
# instead of a linear scan of up to 2000 strings per token.
adj_lookup = set(adjs)
for text, record in cw.records():
    processed = nlp(text)

    # Single pass over the document; the matches feed both accumulators.
    matches = [token for token in processed if token.text in adj_lookup]
    adjectives_encountered.extend(matches)
    unique_adjectives_encountered.update(token.text for token in matches)

In [21]:
# (number of matching tokens, number of distinct adjectives among them)
len(adjectives_encountered),len(unique_adjectives_encountered)

(312797, 1459)

In [22]:
# Per-adjective occurrence counters; unseen adjectives read as 0.
gradable, non_gradable = defaultdict(int), defaultdict(int)

In [23]:
# Degree modifiers whose presence marks an adjective occurrence as graded.
modifiers = [
    'somewhat',
    'very',
    'really',
    'extremely',
    'rather',
]

In [24]:
# An occurrence is counted under `gradable` when at least one of the token's
# children has a text listed in `modifiers`; otherwise under `non_gradable`.
for adj in adjectives_encountered:
    has_modifier = any(child.text in modifiers for child in adj.children)
    counter = gradable if has_modifier else non_gradable
    counter[adj.text] += 1

In [25]:
# Will map each adjective to a two-item list: [gradable count, non-gradable count].
combined = defaultdict(list)

In [26]:
# Pair up both counters for every distinct adjective seen in the corpus.
for adj in unique_adjectives_encountered:
    combined[adj] = [gradable[adj], non_gradable[adj]]

In [34]:
# Score each adjective by the percentage of its occurrences that carried a
# degree modifier, keeping only adjectives seen more than 100 times.
# NOTE(review): the score is already multiplied by 100, so the `0.5` cut-off
# means 0.5 percent, not 50 percent -- confirm this is the intended threshold.
# NOTE(review): this rebinds `adjs` (previously the candidate list), so earlier
# cells that read `adjs` cannot be re-run after this one.
adjs = defaultdict(list)
for adj, (n_modified, n_plain) in combined.items():
    total = n_modified + n_plain
    gradability_score = round((n_modified / total) * 100, 3)
    if total <= 100 or gradability_score <= 0.5:
        continue
    adjs[adj] = gradability_score

In [39]:
# Keep the 200 adjectives with the highest gradability score, best first.
# NOTE(review): `adjs` changes from a mapping to a list here, so this cell
# cannot be run twice in a row.
adjs = sorted(adjs, key=adjs.get, reverse=True)[:200]

In [684]:
# Persist the selected adjectives, one per line.
with open('gradable_adjectives.txt', 'w') as out_file:
    out_file.writelines("%s\n" % item for item in adjs)

## Generating sentences

In [41]:
# Build a singular ("is") and a plural ("are") template sentence for every
# noun/adjective combination; agreement errors are filtered out later by
# perplexity rather than prevented here.
sentences = []
for noun in nouns:
    subject = 'The '+noun
    for adj in adjs:
        sentences.extend((subject+' is '+adj+'.', subject+' are '+adj+'.'))

In [42]:
# Two sentences ("is"/"are") are generated per noun/adjective pair.
len(sentences)

400000

In [43]:
# Peek at the first few generated sentences.
sentences[:10]

['The time is simple.',
 'The time are simple.',
 'The time is disappointed.',
 'The time are disappointed.',
 'The time is difficult.',
 'The time are difficult.',
 'The time is helpful.',
 'The time are helpful.',
 'The time is brief.',
 'The time are brief.']

## Filtering by GPT-2 perplexity

In [50]:
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model_id = 'gpt2'
model_gpt = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# Switch to evaluation mode so dropout layers are disabled; otherwise repeated
# scoring of the same sentence is not guaranteed to give the same perplexity.
model_gpt.eval()
tokenizer_gpt = GPT2Tokenizer.from_pretrained(model_id)

In [51]:
def process_gpt(sentence):
    """Score ``sentence`` with the GPT-2 language model.

    The sentence is tokenized, prefixed with a start marker, converted to ids
    and passed to ``model_gpt`` as both the input and ``lm_labels``. The scalar
    the model returns is exponentiated and returned.

    Args:
        sentence: Text to score.

    Returns:
        ``exp`` of the scalar returned by the model (a NumPy float).
        Presumably this scalar is the mean token cross-entropy, which makes
        the result the sentence perplexity -- confirm against the model's
        documentation.

    NOTE(review): "[CLS]" is a BERT marker and may not be in GPT-2's
    vocabulary; check which id the tokenizer assigns to it.
    """
    token_strings = ["[CLS]"] + tokenizer_gpt.tokenize(sentence)
    id_list = tokenizer_gpt.convert_tokens_to_ids(token_strings)
    # Wrap in an outer list to get a batch of size one.
    input_ids = torch.tensor([id_list], dtype=torch.long).to(device)
    with torch.no_grad():
        loss = model_gpt(input_ids, lm_labels=input_ids).item()
    return np.exp(loss)

In [52]:
# Score every generated sentence; one model call per sentence.
pairs = {sentence: process_gpt(sentence) for sentence in sentences}

In [53]:
# One row per sentence with its score, then show them from most to least plausible.
df = (
    pd.DataFrame.from_dict(pairs, orient='index')
    .reset_index()
    .rename(columns={"index": "sentence", 0: "perplexity"})
)
df.sort_values(by='perplexity', ascending=True)

Unnamed: 0,sentence,perplexity
158400,The reason is simple.,32.092901
216000,The answer is simple.,33.160138
230401,The rules are simple.,35.773233
126000,The plan is simple.,36.188385
102800,The idea is simple.,37.175624
...,...,...
36899,The wasn are richest.,18833.300436
116693,The wouldn are rural.,19602.368611
37081,The wasn are junior.,20574.691978
37093,The wasn are rural.,21252.049206


In [699]:
# Export the full ranking and its 10,000 lowest-perplexity rows.
# NOTE(review): these are absolute, machine-specific paths -- consider a
# configurable output directory so the notebook runs elsewhere.
df_ranked = df.sort_values(by='perplexity', ascending=True)
df_ranked.to_csv('/home/lisa/hobbies/modifiers_all.csv')
df_ranked.head(10000).to_csv('/home/lisa/hobbies/modifiers_top10k.csv')

## Producing negative vs positive sentence pairs

In [54]:
# The 10,000 sentences with the lowest perplexity.
ten_k = df.sort_values(by='perplexity').iloc[:10000]

In [55]:
# Turn each "The <noun> is|are <adj>." sentence into an affirmative and a
# negated variant with a [MASK] slot directly before the adjective.
pos_10k = []
neg_10k = []
for sentence in ten_k['sentence'].values:
    # Split into [The, <noun>, is|are, "<adj>."]. Named `parts` rather than
    # `words` so the vocabulary word list defined earlier is not overwritten.
    parts = sentence.split(' ')
    negated_verb = "isn't" if parts[2] == 'is' else "aren't"
    pos_10k.append(' '.join(parts[:3] + ['[MASK]'] + parts[3:]))
    neg_10k.append(' '.join(parts[:2] + [negated_verb, '[MASK]'] + parts[3:]))

In [None]:
# Write the affirmative and negated sentences to parallel files (line i of one
# corresponds to line i of the other). The files are opened in write mode so
# that re-running this cell replaces them instead of appending a second copy,
# which would duplicate lines and could break the line-by-line alignment.
with open('10k_aff.txt', 'w') as f:
    f.writelines(sentence + '\n' for sentence in pos_10k)

with open('10k_neg.txt', 'w') as f:
    f.writelines(sentence + '\n' for sentence in neg_10k)