# Recursive Ekphrasis Gym

## Processing `(possessor,possessed)` pairs

In [1]:
#! wget https://raw.githubusercontent.com/kbooten/ekphrasisgym/main/possessor2possessed_tuples_with_count.json

In [2]:
import json

# JSON is UTF-8 by specification; name the encoding explicitly so the read
# does not depend on the platform's default locale encoding.
with open('possessor2possessed_tuples_with_count.json', 'r', encoding='utf-8') as f:
  possessor2possessed_tuples_with_count = json.load(f)

In [3]:
# Preview the raw records: each one is [[possessor, possessed], count].
possessor2possessed_tuples_with_count[:4]

[[['mother', 'child'], 122],
 [['heart', 'workings'], 2],
 [['youth', 'livery'], 6],
 [['summer', 'flower'], 7]]

In [4]:
# Number of records loaded, before any filtering.
len(possessor2possessed_tuples_with_count)

475435

In [5]:
# Discard pairs whose count is below `min_count`.
min_count = 2

frequent_pairs = []
for pair, c in possessor2possessed_tuples_with_count:
  if c >= min_count:
    frequent_pairs.append((pair, c))
possessor2possessed_tuples_with_count = frequent_pairs

In [6]:
# Number of records left after applying the min_count threshold.
len(possessor2possessed_tuples_with_count)

141844

Use WordNet to keep only pairs whose possessed token is a known noun.

In [7]:
import nltk
# Fetch the WordNet corpus and the omw-1.4 package; NLTK reports them as
# already up to date when they are present locally.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/kyle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kyle/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
# Example lookup: the noun synsets WordNet knows for "hat". An empty list is
# what the later filter treats as "not a noun".
wn.synsets("hat",pos=wn.NOUN)

[Synset('hat.n.01'), Synset('hat.n.02')]

In [9]:
from collections import defaultdict

# Maps each possessor to a list of possessed words; the list holds repeats,
# one entry per counted occurrence of the pair.
possessor2possessed = defaultdict(list)

In [10]:
# Rebuild the mapping from scratch so that re-running this cell does not
# append every possessed word a second time.
possessor2possessed = defaultdict(list)

# Cache WordNet answers: the same possessed word recurs under many possessors,
# so each distinct word is looked up only once.
_noun_lookup = {}

for (possessor, possessed), count in possessor2possessed_tuples_with_count:
  if possessed not in _noun_lookup:
    # A word counts as a noun when WordNet returns at least one noun synset.
    _noun_lookup[possessed] = bool(wn.synsets(possessed, pos=wn.NOUN))
  if _noun_lookup[possessed]:
    # Repeat the word `count` times so its frequency in the list equals the
    # count recorded for the pair.
    possessor2possessed[possessor].extend([possessed] * count)

In [11]:
# Spot-check one possessor; words repeat because each was added `count` times.
possessor2possessed["tongue"][:5]

['tune', 'tune', 'tip', 'tip', 'tip']

In [12]:
# "sake" lists itself as a possessed word; such self-pairs are removed below.
possessor2possessed["sake"][:3]

['sake', 'sake', 'sake']

Get rid of values that are the same as the key.

In [13]:
# Remove every possessed word that is identical to its own possessor.
for possessor in list(possessor2possessed):
  kept = []
  for possessed in possessor2possessed[possessor]:
    if possessed != possessor:
      kept.append(possessed)
  possessor2possessed[possessor] = kept

In [14]:
# Confirm the self-pairs are gone.
possessor2possessed["sake"][:3]

[]

Sometimes a value is just a single letter. Filter out words shorter than three characters.

In [15]:
# Keep only possessed words longer than two characters. The items are
# materialised first so the mapping is not iterated while it is updated.
possessor2possessed.update(
  (possessor, [word for word in words if len(word) > 2])
  for possessor, words in list(possessor2possessed.items())
)

### Rank by TF-IDF

In [16]:
# One entry per possessor. Despite the name, each entry is a list that may
# contain repeats and may be empty after the filtering steps.
sets_of_words = [*possessor2possessed.values()]

In [17]:
# Number of possessor lists, used as the numerator of the idf ratio.
# NOTE(review): this also counts lists emptied by the earlier filters — confirm that is intended.
total_number_of_sets = len(sets_of_words)
total_number_of_sets

6883

In [18]:
from collections import defaultdict

# Maps each word to the number of possessor lists it appears in.
word2doc_count = defaultdict(int)

In [19]:
# Start from zero so that re-running this cell does not double the counts
# (which would shrink every idf value computed afterwards).
word2doc_count = defaultdict(int)

# Document frequency: a word is counted once per possessor list it occurs in,
# however many times it repeats inside that list.
for words in sets_of_words:
  for word in set(words):
    word2doc_count[word] += 1

In [20]:
# Example: number of possessor lists in which 'friend' appears at least once.
word2doc_count['friend']

184

In [21]:
# Maps each possessor to its top-ranked list of (possessed word, weight) pairs.
possessor2possessed_and_weights = {}

In [22]:
# Score every possessed word under each possessor with a TF-IDF-style weight
# and keep the highest-scoring ones.
#   tf  = share of the possessor's list taken up by the word
#   idf = total_number_of_sets / number of possessor lists containing the word
#         (a plain ratio; no logarithm is applied)
top_n = 30  # how many possessed words to keep per possessor

for key, words in possessor2possessed.items():
  if not words:  # no empty sets
    continue
  # Tally occurrences in one pass instead of calling words.count() once per
  # unique word, which rescans the whole list each time.
  occurrences = defaultdict(int)
  for w in words:
    occurrences[w] += 1
  possessed_and_weights = []
  for w, n in occurrences.items():
    tf = n / len(words)
    idf = total_number_of_sets / word2doc_count[w]
    tfidf = tf * idf
    possessed_and_weights.append((w, tfidf))
  # Highest weight first; ties are broken alphabetically. Relying on the
  # iteration order of a set of strings made tie order (and the top_n cut)
  # differ between interpreter runs.
  possessed_and_weights.sort(key=lambda x: (-x[1], x[0]))
  possessor2possessed_and_weights[key] = possessed_and_weights[:top_n]

In [23]:
# Inspect the ranked (word, weight) pairs kept for one possessor.
possessor2possessed_and_weights['wolf']

[('howl', 38.84585537918871),
 ('feeder', 21.85079365079365),
 ('whelp', 12.746296296296297),
 ('cub', 10.925396825396826),
 ('teat', 10.925396825396826),
 ('litter', 9.559722222222222),
 ('howling', 7.283597883597884),
 ('maw', 6.555238095238096),
 ('den', 6.155153141068634),
 ('fangs', 5.700207039337474),
 ('pelt', 5.462698412698413),
 ('jaws', 4.013411078717201),
 ('forefoot', 3.641798941798942),
 ('muzzle', 3.213352007469654),
 ('breed', 3.1215419501133788),
 ('paw', 2.6220952380952385),
 ('lair', 2.4278659611992945),
 ('skin', 2.4111220580186097),
 ('mouth', 1.8952218982831228),
 ('teeth', 1.7572316572316573),
 ('brush', 1.638809523809524),
 ('hide', 1.5333890281258702),
 ('belly', 1.4567195767195769),
 ('snout', 1.4567195767195766),
 ('bark', 1.3656746031746032),
 ('route', 1.3656746031746032),
 ('tooth', 1.2415223665223665),
 ('tail', 1.2368373764600178),
 ('bite', 1.2139329805996473),
 ('throat', 1.2075438596491226)]

In [24]:
# Persist the ranked table as JSON in the parent of the working directory.
output_path = '../possessor2possessed_and_weights.json'
with open(output_path, 'w') as out_file:
  out_file.write(json.dumps(possessor2possessed_and_weights))

In [25]:
# from google.colab import files
# files.download('/content/possessor2possessed_and_weights.json')

***