# Recursive Ekphrasis Gym

## Processing `(adj,noun)` pairs

In [1]:
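# One-time download of the (adj, noun) counts; uncomment to fetch.
# -O is needed: without it wget saves the file as "adj2nouns_tuples_with_count.json?raw=true",
# which the next cell cannot open.
#! wget -O adj2nouns_tuples_with_count.json "https://github.com/kbooten/ekphrasisgym/blob/main/adj2nouns_tuples_with_count.json?raw=true"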

In [2]:
import json

# Load the raw (adjective, noun) co-occurrence records.
# The encoding is given explicitly: without it open() falls back to the platform's
# locale encoding, which is not guaranteed to be UTF-8 (e.g. on Windows).
with open('adj2nouns_tuples_with_count.json', 'r', encoding='utf-8') as f:
  adj2noun_tuples_with_count = json.load(f)

In [3]:
# Peek at the first few records; each one is [[adj, noun, num], count]
# (num is 1 or 2 -- presumably singular/plural; confirm against the data source).
adj2noun_tuples_with_count[:4]

[[['instant', 'disaster', 1], 5],
 [['high', 'heaven', 1], 277],
 [['good', 'speed', 1], 596],
 [['wild', 'boars', 2], 433]]

In [4]:
# Every [adj, noun, num] triple whose noun is "sherbet" (the counts are dropped).
[triple for triple, _count in adj2noun_tuples_with_count if triple[1] == "sherbet"]

[['little', 'sherbet', 1],
 ['iced', 'sherbet', 1],
 ['rich', 'sherbet', 1],
 ['exquisite', 'sherbet', 1],
 ['excellent', 'sherbet', 1],
 ['turkish', 'sherbet', 1],
 ['frozen', 'sherbet', 1],
 ['nice', 'sherbet', 1],
 ['pink', 'sherbet', 1],
 ['cool', 'sherbet', 1],
 ['drunk', 'sherbet', 1]]

Using WordNet to make sure a token is a word.

In [5]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/kyle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kyle/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# Sanity check: a non-empty result means WordNet knows this token as a noun.
wn.synsets("sherbet",pos=wn.NOUN)

[Synset('sherbert.n.01')]

In [7]:
from collections import defaultdict

# Maps "noun~num" -> list of adjectives, with one list entry per observed occurrence.
noun2adjs = defaultdict(list)

In [8]:
min_count = 4  # pairs observed fewer times than this are skipped as too rare

for (adj, noun, num), count in adj2noun_tuples_with_count:
  if count < min_count:
    continue
  # keep only purely alphabetic, all-lowercase tokens
  if not (adj.isalpha() and noun.isalpha() and adj.islower() and noun.islower()):
    continue
  # the noun must be known to WordNet as a noun
  if not wn.synsets(noun, pos=wn.NOUN):
    continue
  # key is "noun~num"; the adjective is repeated once per observation so that
  # later frequency counts can be read straight off the list
  noun2adjs[f"{noun}~{num}"].extend([adj] * count)

Get rid of values that are the same as the key, i.e. adjectives identical to the noun itself (the key without its `~num` suffix).

In [9]:
# Drop any adjective that is identical to the noun it describes.
# Keys look like "pig~1" while the values are bare adjectives, so the "~num"
# suffix has to be stripped from the KEY before comparing. (Stripping it from
# the value and comparing against the full key can never match.)
for key,values in noun2adjs.items():
  bare_noun = key.split("~")[0]
  noun2adjs[key] = [v for v in values if v!=bare_noun]

In [10]:
# Spot-check one entry: adjectives appear once per observed occurrence.
noun2adjs["pig~1"][:3]

['hungry', 'hungry', 'hungry']

Sometimes a value is just a single letter. Filter out very short words (fewer than three characters).

In [11]:
# Keep only adjectives of three or more characters.
for noun_key in list(noun2adjs):
  noun2adjs[noun_key] = [word for word in noun2adjs[noun_key] if len(word) > 2]

### Rank by TF-IDF

In [12]:
# One "document" per noun key. Despite the name these are lists, so repeated
# adjectives are preserved.
sets_of_words = [*noun2adjs.values()]

In [13]:
# Number of "documents" (one adjective list per noun key); used as the numerator of the IDF term.
total_number_of_sets = len(sets_of_words)
total_number_of_sets

25092

In [14]:
from collections import defaultdict

# adjective -> number of noun keys whose adjective list contains it (document frequency)
word2doc_count = defaultdict(int)

In [15]:
# Document frequency: in how many adjective lists does each adjective occur?
# Reset first so that re-running this cell cannot double-count. Doubled counts
# would silently halve every TF-IDF weight computed afterwards.
word2doc_count.clear()
for word_list in sets_of_words:
  for word in set(word_list):  # each adjective counts once per list
    word2doc_count[word] += 1

In [16]:
# Sanity check that a non-adjective never made it in. .get() is used instead of
# indexing because indexing a defaultdict would insert 'eat' as a new key.
word2doc_count.get('eat', 0)

0

In [17]:
# "noun~num" -> list of (adjective, tf-idf weight) pairs, highest weight first
noun2adjs_and_weights = {}

In [18]:
from collections import Counter

max_adjs_per_noun = 20  # how many top-ranked adjectives to keep for each noun

# Score every adjective of every noun:
#   tf  = share of the noun's adjective occurrences that are this adjective
#   idf = (number of noun keys) / (number of noun keys that have this adjective)
# Note that idf is a plain ratio here, not log-scaled.
for key,words in noun2adjs.items():
  if not words: ## no empty sets
    continue
  n_words = len(words)
  # Counter tallies all adjectives in one pass, instead of a words.count() scan
  # for every unique adjective.
  adjs_and_weights = [
    (w, (n/n_words) * (total_number_of_sets/word2doc_count[w]))
    for w,n in Counter(words).items()
  ]
  # Highest weight first; equal weights are ordered alphabetically so that the
  # cutoff below does not depend on hash-randomised set ordering.
  adjs_and_weights.sort(key=lambda pair: (-pair[1], pair[0]))
  noun2adjs_and_weights[key]=adjs_and_weights[:max_adjs_per_noun]

Make dictionary more concise.

```
dict[key] = {
  "num":"1", ## or "2" (stored as a string)
  "verbs":[("fulsome",11.45),...] ## the adjectives; the output key is named "verbs"
}
```

First, when the same noun form appears under both the singular (`~1`) and the plural (`~2`) tag, keep only the tag that has more results (more likely to be correct).

In [19]:
# Distinct noun forms with the "~num" tag stripped off.
all_keys_without_number = list({tagged.split("~")[0] for tagged in noun2adjs_and_weights})

In [20]:
# When a noun form exists under both tags, keep the better-attested one.
# The ranked lists are capped (top 20), so for common nouns their lengths tie
# and comparing them alone would always drop "~1". The raw number of adjective
# occurrences in noun2adjs is therefore used as a tie-breaker.
for k in all_keys_without_number:
  singular, plural = k+"~1", k+"~2"
  if singular in noun2adjs_and_weights and plural in noun2adjs_and_weights:
    # .get() avoids creating keys in the defaultdict
    singular_evidence = (len(noun2adjs_and_weights[singular]), len(noun2adjs.get(singular, ())))
    plural_evidence = (len(noun2adjs_and_weights[plural]), len(noun2adjs.get(plural, ())))
    if singular_evidence > plural_evidence:
      del noun2adjs_and_weights[plural]
    else:
      # an exact tie still keeps "~2", as before
      del noun2adjs_and_weights[singular]

In [21]:
# Re-key by bare noun: "laugh~1" becomes "laugh", and the number tag moves into
# the entry. "num" stays a string because it comes straight from split().
# NOTE(review): the adjective list is stored under "verbs"; presumably consumers
# of the JSON read that key, so confirm before renaming it.
noun2adjs_and_weights_reformatted = {}

for tagged_noun, ranked_adjs in noun2adjs_and_weights.items():
  bare, number_tag = tagged_noun.split("~")
  noun2adjs_and_weights_reformatted[bare] = {"num": number_tag, "verbs": ranked_adjs}

In [22]:
# Spot-check one entry of the final structure.
noun2adjs_and_weights_reformatted["laugh"]

{'num': '1',
 'verbs': [('mirthless', 18.81848862802641),
  ('unforced', 3.6818782098312544),
  ('forced', 3.4235007915974824),
  ('hearty', 3.3002829892044865),
  ('hysteric', 3.0682318415260457),
  ('sneering', 3.0572738706634524),
  ('rippling', 2.761408657373441),
  ('unmirthful', 2.761408657373441),
  ('ringing', 2.680665714175387),
  ('grating', 2.09197625558594),
  ('vexed', 2.0710564930300808),
  ('derisive', 2.0016560109003247),
  ('scornful', 1.9082905355832722),
  ('screechy', 1.8409391049156272),
  ('unconvincing', 1.8409391049156272),
  ('embarrassed', 1.678503301540719),
  ('loud', 1.5284741931902024),
  ('hysterical', 1.3716801173881144),
  ('gleeful', 1.3648341639891721),
  ('merry', 1.3386096845194424)]}

In [23]:
# Persist the final mapping to the parent of the working directory.
# The (adjective, weight) tuples are written as JSON arrays.
with open('../noun2adjs_and_weights.json','w') as out_file:
  json.dump(noun2adjs_and_weights_reformatted, out_file)

In [24]:
# from google.colab import files
# files.download('/content/noun2adjs_and_weights.json')

***