# Recursive Ekphrasis Gym

## Processing `(noun,verb)` pairs

In [1]:
# One-time setup: uncomment to fetch the (noun, verb) count data that the next cell opens.
#! wget https://github.com/kbooten/ekphrasisgym/raw/main/noun2verb_tuples_with_count.json

In [2]:
import json

# Load the raw records; each one has the shape [[noun, verb, num], count].
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 everywhere).
with open('noun2verb_tuples_with_count.json', 'r', encoding='utf-8') as f:
  noun2verb_tuples_with_count = json.load(f)

In [3]:
# Peek at the first four records: [[noun, verb, num], count].
noun2verb_tuples_with_count[:4]

[[['body', 'require', 1], 69],
 [['art', 'ransom', 1], 2],
 [['eyes', 'teach', 2], 14],
 [['time', 'bring on', 1], 6]]

In [4]:
# Every [noun, verb, num] triple recorded for the noun "mouse" (counts are ignored here).
[triple for triple, _count in noun2verb_tuples_with_count if triple[0] == "mouse"]

[['mouse', 'have', 1],
 ['mouse', 'tread', 1],
 ['mouse', 'eat', 1],
 ['mouse', 'give', 1],
 ['mouse', 'begin', 1],
 ['mouse', 'find', 1],
 ['mouse', 'marry', 1],
 ['mouse', 'repay', 1],
 ['mouse', 'see', 1],
 ['mouse', 'build', 1],
 ['mouse', 'worry', 1],
 ['mouse', 'poke', 1],
 ['mouse', 'transform', 1],
 ['mouse', 'fret', 1],
 ['mouse', 'think', 1],
 ['mouse', 'make', 1],
 ['mouse', 'help', 1],
 ['mouse', 'deserve', 1],
 ['mouse', 'show', 1],
 ['mouse', 'take', 1],
 ['mouse', 'say', 1],
 ['mouse', 'leave', 1],
 ['mouse', 'bring', 1],
 ['mouse', 'enter', 1],
 ['mouse', 'feast', 1],
 ['mouse', 'prefer', 1],
 ['mouse', 'recollect', 1],
 ['mouse', 'sniff', 1],
 ['mouse', 'publish', 1],
 ['mouse', 'answer', 1],
 ['mouse', 'consume', 1],
 ['mouse', 'lead', 1],
 ['mouse', 'gnaw', 1],
 ['mouse', 'visit', 1],
 ['mouse', 'run up', 1],
 ['mouse', 'suspend', 1],
 ['mouse', 'nibble', 1],
 ['mouse', 'notice', 1],
 ['mouse', 'recommend', 1],
 ['mouse', 'hear', 1],
 ['mouse', 'gird up', 1],
 ['mous

Use WordNet to check that a token is a real word: below, a verb is kept only if WordNet has at least one verb synset for its first word.

In [5]:
import nltk

# Download the corpora used by the WordNet reader before importing it.
for corpus_name in ('wordnet', 'omw-1.4'):
  nltk.download(corpus_name)

from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/kyle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kyle/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# Sanity check: a known verb should come back with at least one verb synset.
wn.synsets("wiggle",pos=wn.VERB)

[Synset('jiggle.v.01')]

In [7]:
from collections import defaultdict

# Maps "noun~num" keys to a list of verbs; the list is filled in by the loop
# in the next cell, where each verb is repeated once per observed count.
noun2verbs = defaultdict(list)

In [8]:
min_count = 4  # pairs seen fewer times than this are treated as too rare to keep

for (noun, verb, num), count in noun2verb_tuples_with_count:
  # Multi-word verbs such as "bring on" are validated by their first word only.
  first_word = verb.split(" ")[0]
  if count < min_count:
    continue
  # Both tokens must be purely alphabetic and lowercase.
  looks_like_words = all(
    token.isalpha() and token.islower() for token in (noun, first_word)
  )
  if not looks_like_words:
    continue
  # Skip anything WordNet does not recognise as a verb.
  if not wn.synsets(first_word, pos=wn.VERB):
    continue
  # The key carries noun and number together (e.g. "pig~1"); repeating the
  # verb `count` times keeps its frequency in a plain list.
  noun2verbs[f"{noun}~{num}"].extend([verb] * count)

Get rid of values (verbs) that are the same as the noun in their key (keys have the form `noun~num`).

In [9]:
# Remove verbs that are identical to the noun they are filed under.
# Keys look like "noun~num" while values are bare verbs, so the noun has to be
# taken from the key; splitting the verb on "~" and comparing it with the whole
# key can never match and would filter nothing.
for key,values in noun2verbs.items():
  noun = key.split("~")[0]
  noun2verbs[key] = [v for v in values if v!=noun]

In [10]:
# Spot check: a verb appears once for every time the pair was counted.
noun2verbs["pig~1"][:3]

['eat', 'eat', 'eat']

Sometimes a value is just a letter.  Filter out really short words.

In [11]:
# Keep only verbs of three or more characters; every existing key is
# overwritten with its filtered list.
noun2verbs.update({
  noun_key: [verb for verb in verb_list if len(verb) >= 3]
  for noun_key, verb_list in noun2verbs.items()
})

### Rank by TF-IDF

In [12]:
# One verb list per noun key. Despite the name these are lists, not sets:
# a verb occurs in them once for every time it was counted.
sets_of_words = list(noun2verbs.values())

In [13]:
# Number of noun keys ("documents"); used as the numerator of the IDF term.
# Every key is counted, including any whose verb list is empty.
total_number_of_sets = len(sets_of_words)
total_number_of_sets

12714

In [14]:
from collections import defaultdict

# Document frequency: for each verb, the number of noun keys it appears under.
word2doc_count = defaultdict(int)

In [15]:
# Each noun key contributes at most one to a verb's document count,
# so duplicates within a list are collapsed before counting.
for verb_list in sets_of_words:
  for verb in set(verb_list):
    word2doc_count[verb] += 1

In [16]:
# Spot check: how many noun keys have "eat" among their verbs.
word2doc_count['eat']

369

In [17]:
# Maps "noun~num" keys to a ranked list of (verb, weight) pairs.
noun2verbs_and_weights = {}

In [18]:
from collections import Counter

max_verbs_per_noun = 20  # only this many top-ranked verbs are kept per noun key

for key,words in noun2verbs.items():
  if not words: ## no empty sets
    continue
  # Count every verb in a single pass rather than rescanning the whole list
  # with words.count() once per distinct verb.
  verb_counts = Counter(words)
  verbs_and_weights = []
  for w,occurrences in verb_counts.items():
    tf = occurrences/len(words)
    # Note: the "idf" here is the raw ratio N/df; no logarithm is applied.
    idf = total_number_of_sets/word2doc_count[w]
    verbs_and_weights.append((w,tf*idf))
  # Highest weight first, ties broken alphabetically. This keeps the ranking
  # and the cut-off below identical on every run, instead of depending on the
  # arbitrary iteration order of a set of strings.
  verbs_and_weights.sort(key=lambda pair:(-pair[1],pair[0]))
  noun2verbs_and_weights[key]=verbs_and_weights[:max_verbs_per_noun]

Make dictionary more concise.

```
dict[key] = {
  "num":"1", ## or "2" (stored as a string)
  "verbs":[("animadvert",11.45),...]
}
```

First, only keep singular *or* plural of noun, whichever has more results (more likely to be correct).

In [19]:
# Distinct bare nouns, i.e. each key with its "~num" suffix removed.
all_keys_without_number = list({tagged.partition("~")[0] for tagged in noun2verbs_and_weights})

In [20]:
# When a noun has both a "~1" and a "~2" entry (presumably singular and
# plural), keep only the form with more distinct verbs.
for k in all_keys_without_number:
  key_1, key_2 = k+"~1", k+"~2"
  if ((key_1 in noun2verbs_and_weights) and (key_2 in noun2verbs_and_weights)):
    # Measure on the full verb lists in noun2verbs. The ranked lists are cut
    # off at 20 entries, so comparing their lengths ties for every noun with
    # 20 or more verbs in both forms, and a tie always discards "~1".
    if len(set(noun2verbs[key_1]))>len(set(noun2verbs[key_2])):
      del noun2verbs_and_weights[key_2]
    else:
      del noun2verbs_and_weights[key_1]

In [21]:
# Re-key by bare noun and store the number tag next to the ranked verbs.
# The tag is split back out of the key, so it remains a string.
noun2verbs_and_weights_reformatted = {}

for tagged_noun, ranked_verbs in noun2verbs_and_weights.items():
  noun, num = tagged_noun.split("~")
  noun2verbs_and_weights_reformatted[noun] = {"num": num, "verbs": ranked_verbs}

In [22]:
# Spot check the final structure for one noun.
noun2verbs_and_weights_reformatted["bell"]

{'num': '1',
 'verbs': [('toll', 140.55199550309163),
  ('clang out', 40.498032602585724),
  ('tinkle', 35.73355817875211),
  ('clang', 30.016188870151773),
  ('ring out', 29.68634064080944),
  ('sound out', 28.586846543001688),
  ('peal forth', 21.440134907251263),
  ('jangle', 21.440134907251263),
  ('peal out', 17.866779089376056),
  ('boom out', 17.152107925801012),
  ('toll out', 16.67566048341765),
  ('shrill', 14.293423271500844),
  ('ring', 14.13099800705197),
  ('peal', 9.188629245964828),
  ('summon', 6.856194902589835),
  ('chime', 5.717369308600338),
  ('sound', 5.410127499960599),
  ('boom', 4.466694772344014),
  ('send forth', 4.083835220428813),
  ('signal', 3.062876415321609)]}

In [23]:
# Persist the final mapping one directory above the notebook.
with open('../noun2verbs_and_weights.json', 'w') as out_file:
  json.dump(noun2verbs_and_weights_reformatted, out_file)

In [24]:
# Colab-only convenience, left disabled: uncomment to download the generated file.
# NOTE(review): this path ('/content/...') differs from the '../noun2verbs_and_weights.json'
# path written by the cell above — confirm which one is intended before enabling.
# from google.colab import files
# files.download('/content/noun2verbs_and_weights.json')

***