# Recursive Ekphrasis Gym

## Processing `(possessor,possessed)` pairs

In [1]:
! wget https://raw.githubusercontent.com/kbooten/ekphrasisgym/main/possessor2possessed_tuples_with_count.json

--2022-08-12 18:59:31--  https://raw.githubusercontent.com/kbooten/ekphrasisgym/main/possessor2possessed_tuples_with_count.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10705892 (10M) [text/plain]
Saving to: ‘possessor2possessed_tuples_with_count.json’


2022-08-12 18:59:31 (158 MB/s) - ‘possessor2possessed_tuples_with_count.json’ saved [10705892/10705892]



In [2]:
import json

# Load the downloaded list of [[possessor, possessed], count] entries.
# The encoding is stated explicitly so the read does not depend on the
# platform's locale default.
with open('possessor2possessed_tuples_with_count.json','r',encoding='utf-8') as f:
  possessor2possessed_tuples_with_count = json.load(f)

In [3]:
# Peek at the first few entries: each is [[possessor, possessed], count].
possessor2possessed_tuples_with_count[:4]

[[['strength', 'abundance'], 3],
 [['honour', 'ring'], 2],
 [['tongue', 'tune'], 2],
 [['father', 'skill'], 16]]

Use WordNet to keep only the pairs whose possessed token is a recognized noun.

In [4]:
import nltk
# Download the NLTK data packages before the WordNet corpus reader is used.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [5]:
# Example lookup: a non-empty list means WordNet has noun senses for the token.
wn.synsets("hat",pos=wn.NOUN)

[Synset('hat.n.01'), Synset('hat.n.02')]

In [6]:
from collections import Counter, defaultdict

# Maps each possessor to the list of words it possesses; missing keys start
# out as an empty list.
possessor2possessed = defaultdict(list)

In [7]:
# Fill possessor2possessed, keeping only possessed tokens that WordNet lists as
# a noun. Each possessed word is repeated `count` times so the list preserves
# how often the pair was seen.
possessor2possessed.clear()  # start empty so re-running this cell does not append duplicates
is_noun = {}  # cache: the same possessed word recurs under many possessors
for (possessor, possessed), count in possessor2possessed_tuples_with_count:
  if possessed not in is_noun:
    is_noun[possessed] = bool(wn.synsets(possessed, pos=wn.NOUN))
  if is_noun[possessed]:
    possessor2possessed[possessor].extend([possessed] * count)

In [8]:
# Spot-check one possessor; repeated words reflect the pair counts.
possessor2possessed["tongue"][:5]

['tune', 'tune', 'tip', 'tip', 'tip']

In [9]:
# Spot-check a possessor whose list contains the possessor word itself.
possessor2possessed["sake"][:3]

['sake', 'sake', 'sake']

Get rid of values that are the same as the key.

In [10]:
# Remove every possessed word that is identical to its own possessor.
for possessor in possessor2possessed:
  kept = []
  for word in possessor2possessed[possessor]:
    if word != possessor:
      kept.append(word)
  possessor2possessed[possessor] = kept

In [11]:
# Re-check the same possessor after the self-references were removed.
possessor2possessed["sake"][:3]

['cause', 'return']

Sometimes a value is just a single letter.  Filter out really short words (fewer than three characters).

In [12]:
# Minimum number of characters a possessed word needs in order to be kept.
MIN_WORD_LENGTH = 3

# Discard tokens shorter than MIN_WORD_LENGTH (single letters and
# two-character strings).
for key,values in possessor2possessed.items():
  possessor2possessed[key] = [v for v in values if len(v)>=MIN_WORD_LENGTH]

### Rank by TF-IDF

In [13]:
# Treat each possessor's word list as one "document" for the TF-IDF statistics.
# NOTE(review): lists left empty by the earlier filters are still included
# here, so they count toward the document total -- confirm this is intended.
sets_of_words = [word_list for word_list in possessor2possessed.values()]

In [14]:
# Number of documents (possessor lists); used as the numerator of the idf ratio.
total_number_of_sets = len(sets_of_words)
total_number_of_sets

21450

In [15]:
from collections import defaultdict

# Document frequency per word: how many possessor lists contain the word.
word2doc_count = defaultdict(int)

In [16]:
# Count each word once per list it appears in, however often it repeats there.
for word_list in sets_of_words:
  for word in set(word_list):
    word2doc_count[word] += 1

In [17]:
# Example: number of possessor lists that contain "friend".
word2doc_count['friend']

392

In [18]:
# Result mapping: possessor -> list of (possessed word, tf-idf weight) pairs.
possessor2possessed_and_weights = {}

In [19]:
# Weight every possessed word of every possessor by tf * idf, where
#   tf  = share of the possessor's list taken up by the word
#   idf = total number of lists / number of lists containing the word
# (idf is a plain ratio here; no logarithm is applied).
for key, words in possessor2possessed.items():
  if not words:  # no empty sets
    continue
  # Counter tallies all words in one pass instead of one list.count() scan per
  # unique word, and yields words in first-seen order, so the output order no
  # longer depends on set iteration order.
  word_counts = Counter(words)
  list_length = len(words)
  possessor2possessed_and_weights[key] = [
    (w, (n / list_length) * (total_number_of_sets / word2doc_count[w]))
    for w, n in word_counts.items()
  ]

In [20]:
# Inspect the weighted possessed words for a single possessor.
possessor2possessed_and_weights['wolf']

[('turn', 0.12324044814708418),
 ('head', 1.4468713973852683),
 ('woes', 0.5508474576271186),
 ('lock', 1.136122881355932),
 ('lust', 0.52689756816507),
 ('ears', 0.13390766925742112),
 ('cub', 14.138418079096047),
 ('memory', 0.14600775985297118),
 ('track', 0.5026165742404124),
 ('hide', 1.2536528345996494),
 ('fangs', 7.609381158849034),
 ('movements', 0.14659650082012027),
 ('life', 0.03308092102219275),
 ('association', 0.4544491525423729),
 ('cry', 0.5607598797438019),
 ('face', 0.12694110406211534),
 ('surprise', 0.1903451947821457),
 ('brushes', 1.8177966101694916),
 ('son', 0.16779661016949152),
 ('tramp', 0.8867300537412153),
 ('teat', 15.581113801452783),
 ('milk', 0.6110240706452073),
 ('muzzle', 4.03954802259887),
 ('need', 0.2164043583535109),
 ('breath', 0.10661563696008748),
 ('foot', 0.17821535393818544),
 ('breasts', 0.7271186440677966),
 ('habits', 0.21901163977945684),
 ('ribs', 0.5386064030131826),
 ('dictionary', 2.1385842472582253),
 ('path', 0.1224105461393597),

In [21]:
# Write the possessor -> weighted-words mapping to a JSON file in the working directory.
with open('possessor2possessed_and_weights.json','w') as out_file:
  json.dump(possessor2possessed_and_weights, out_file)

In [22]:
from google.colab import files
# Offer the file for download using the same relative path it was written to,
# rather than assuming the working directory is /content.
files.download('possessor2possessed_and_weights.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

***