In [None]:
# Author: Tony Dong
# Date: 11/09/2020

# Environment Setup

In [None]:
!git clone https://github.com/huggingface/neuralcoref.git

!pip install -U spacy
!python -m spacy download en
!pip install -r neuralcoref/requirements.txt
!pip install -e neuralcoref/

In [None]:
!pip uninstall spacy
!pip install -U spacy

In [4]:
import neuralcoref
import spacy

In [238]:
# Load spaCy's small English pipeline; the neuralcoref component is attached to it below.
nlp = spacy.load('en_core_web_sm')

# Do not resolve 1st/2nd person pronouns
# NOTE(review): blacklist=True is presumably the switch that excludes those pronouns and
# greedyness presumably trades fewer vs. more coreference links -- confirm both against
# the neuralcoref documentation.
neuralcoref.add_to_pipe(nlp, blacklist=True, greedyness=0.5)

<spacy.lang.en.English at 0x7fe753d630b8>

In [239]:
# Example: run the pipeline on a short text and inspect what neuralcoref attaches to the Doc

doc1 = nlp('My sister has a dog. She loves her dog.')
# Clusters of mentions, each printed as "main mention: [all mentions]"
print(doc1._.coref_clusters)
# The same text with every mention replaced by the main mention of its cluster
print(doc1._.coref_resolved)

[My sister: [My sister, She, her], a dog: [a dog, her dog]]
My sister has a dog. My sister loves a dog.


# Function - read_poem_list

In [204]:
import nltk
# nltk.word_tokenize needs the Punkt tokenizer data. NLTK 3.9+ reads it from the
# 'punkt_tab' package while older releases read 'punkt', so fetch both.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [205]:
def read_poem_list(filepath):
    """Read a sample file and return its poems as space-padded token strings.

    The first line of the file (the "SAMPLE N" header) is dropped and the rest
    is split into poems on the "<|endoftext|>" separator line. Inside a poem,
    "|$|" marks a line break. Every line is tokenized with nltk.word_tokenize
    and its tokens are re-joined with single spaces.

    Args:
        filepath: Path of the sample file to read (assumed to be UTF-8 text).

    Returns:
        A list with one string per poem. Lines are joined with " \\n ", the
        string starts and ends with a space, and "``" is replaced by "''".
        A file that holds nothing but the header yields one blank poem.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read()

    # Get rid of the "SAMPLE N" header line. partition() gives an empty body
    # for a file without any line break, where indexing the result of
    # split("\n", 1) would raise IndexError.
    _header, _newline, body = text.partition("\n")
    poem_samples = body.split("\n<|endoftext|>\n")

    poems = []
    for sample in poem_samples:
        # Split each line before tokenizing
        lines = sample.replace("|$|", "\n").split("\n")
        tokenized_lines = [" ".join(nltk.word_tokenize(line)) for line in lines]
        poem = " \n ".join(tokenized_lines)
        # Ensure that every token is surrounded by white spaces
        poems.append(" " + poem.replace("``", "''") + " ")
    return poems

In [222]:
# Example: load one sample file and print every poem followed by an empty line

pm = read_poem_list("run1/samples-90000")
# print(len(pm))
# print(pm)
for i in pm:
    print(i)
    print()

 glue ? 
 they 're an x in their groove see with ease 
 the x 's a dixota 
 if you 'll guess what has meant 

 the blue of a cereal brew 
 is not always a lot of forchy 
 i can call it real blue 
 or it 's blue just as '' blue '' 

 when the blood pressure 's falling i think 
 that the latter will soon cease to stink 
 as i 'm feeling too good 
 for the pressure on blood 

 when a thing that 's essential is banned 
 it 's confusing and not in demand 
 it 's not even the facts 
 it 's the '' bosom of fox '' 

 that is nothing but being sarcastic 
 you get angry and rage but beware a 
 big bang ( not quite steady ) 
 which is causing you anguish 

 '' she 's a lady a no-no '' he said 
 since some others would call up instead 
 '' this is good '' i said '' no ! 
 she sure stands in the buff ! '' 

 as you sit drooping near the bedlamite 
 please ignore looking up in a fright 
 at misfortune you start ; 
 it 's your own pan of heart 

 '' i 'll be hugging and suck on the gas '' 
 said my b

# Thoughts

Problems:
1. Not good at resolving 2nd person coreference
2. Fewer coreferences != less coherent
3. Formal subject "it" (How to solve???)
4. Consider "this" "that" "these" "those" as pronouns?

Scoring:

**Range: $(-\infty, 0]$**
1. More coreference occurrences -> x
2. More objects -> x
3. **Pronouns without coreference -> As few as possible**

Solutions:
1. Use greedyness=1 to find all objects? -> x
2. Manually resolve 2nd person pronouns? -> x
3. **Remove all resolved coreferences, search the rest of the poem for any unresolved 3rd person pronouns**

In [223]:
# Example: run the coreference pipeline on every poem and print the clusters found in each

# One processed Doc per poem string in `pm`
poem_coref = [nlp(i) for i in pm]
for i in poem_coref:
    print(i._.coref_clusters)

[they: [they, their]]
[ the blue of a cereal brew 
 : [ the blue of a cereal brew 
 , it, it]]
[]
[it: [it, it]]
[]
[she: [she, she]]
[]
[my lover: [my lover, he]]
[we: [we, our]]
[]
[]
[]
[a vacation 
 of the building folks: [a vacation 
 of the building folks, they]]
[]
[]
[]
[]
[]
[our: [our, we, we, our]]
[our: [our, we, we, we, we ', we]]
[ my baby: [ my baby, my baby, he]]
[ my mother: [ my mother, she]]
[my kid cousin: [my kid cousin, he]]


In [208]:
# Example: flatten the coreference clusters of the last poem into a list of mention texts

# Pick the document explicitly instead of relying on the loop variable `i`
# left over from another cell.
example_doc = poem_coref[-1]
print(example_doc._.coref_clusters)
resolved = [
    mention.text
    for cluster in example_doc._.coref_clusters
    for mention in cluster.mentions
]
print(resolved)

[my kid cousin: [my kid cousin, he]]
['my kid cousin', 'he']


# Function - coref_score

In [209]:
import re

In [240]:
def coref_score(poem, verbose=None):
    """Score a poem by the number of 3rd person pronouns left without coreference.

    The poem is run through the module-level ``nlp`` pipeline, the text of every
    mention of every coreference cluster is removed from the poem once, and the
    3rd person pronouns still present afterwards are counted.

    Args:
        poem: Poem text in which every token is surrounded by spaces (the
            format returned by ``read_poem_list``). Matching is case-sensitive.
        verbose: When true, print each unresolved pronoun, the original poem
            and the poem after removal, for poems with a negative score. When
            None (the default), the module-level ``VERBOSE`` flag is used if it
            has been defined; otherwise nothing is printed.

    Returns:
        A non-positive int: minus the number of unresolved pronouns, so 0 is
        the best possible score.
    """
    if verbose is None:
        # Keep honouring the notebook-wide VERBOSE switch, but do not raise
        # NameError when this is called before VERBOSE has been assigned.
        verbose = globals().get("VERBOSE", False)

    origin_poem = poem
    # Pad each mention with spaces so that only whole tokens are removed.
    resolved = [
        " " + mention.text + " "
        for cluster in nlp(poem)._.coref_clusters
        for mention in cluster.mentions
    ]

    # Remove resolved coreference. Each mention text removes only its first
    # remaining occurrence; the single replacement space keeps the neighbouring
    # tokens separated.
    for mention_text in resolved:
        poem = poem.replace(mention_text, " ", 1)

    # Regex pattern matching with lookahead, so that two pronouns sharing one
    # separating space (" he she ") are both found.
    # All 3rd person pronouns
    pronoun_3p = r'(?=(( he )|( him )|( his )|( himself )|( she )|( her )|( hers )|( herself )|( it )|( its )|( itself )|( they )|( them )|( their )|( theirs )|( themself )|( themselves )))'

    # Find remaining unresolved 3rd person pronouns
    matches_3p = re.findall(pronoun_3p, poem)

    # Non-positive score, maximum at 0
    score = -len(matches_3p)

    if verbose and score < 0:
        for n in matches_3p:
            # findall returns one tuple of groups per match; group 0 of the
            # tuple is the whole pronoun including its surrounding spaces.
            print(n[0])
        print(origin_poem)
        print(poem)

    return score

# Function - test_coref

In [245]:
def test_coref(filepath):
    """Print the coreference score of every poem stored in ``filepath``.

    Each poem produces one "# (index)<TAB>Coref score = value" record. The
    index is printed before the poem is scored, so anything ``coref_score``
    prints itself shows up between the index and the score.
    """
    poem_index = 0
    for poem_text in read_poem_list(filepath):
        print("# (" + str(poem_index) + ")", end="\t")
        poem_score = coref_score(poem_text)
        print("Coref score = " + str(poem_score))
        poem_index += 1

In [246]:
# Example: print only the score of each poem

# VERBOSE is the module-level flag read by coref_score
VERBOSE = False
test_coref("run1/samples-90000")

# (0)	Coref score = 0
# (1)	Coref score = 0
# (2)	Coref score = 0
# (3)	Coref score = -1
# (4)	Coref score = 0
# (5)	Coref score = -1
# (6)	Coref score = -1
# (7)	Coref score = 0
# (8)	Coref score = 0
# (9)	Coref score = -1
# (10)	Coref score = -1
# (11)	Coref score = 0
# (12)	Coref score = 0
# (13)	Coref score = 0
# (14)	Coref score = 0
# (15)	Coref score = 0
# (16)	Coref score = 0
# (17)	Coref score = 0
# (18)	Coref score = 0
# (19)	Coref score = 0
# (20)	Coref score = 0
# (21)	Coref score = 0
# (22)	Coref score = 0


In [247]:
# Example: for poems with a negative score, also print the unresolved pronouns
# and the poem before and after the resolved mentions were removed

VERBOSE = True
test_coref("run1/samples-90000")

# (0)	Coref score = 0
# (1)	Coref score = 0
# (2)	Coref score = 0
# (3)	 it 
 when a thing that 's essential is banned 
 it 's confusing and not in demand 
 it 's not even the facts 
 it 's the '' bosom of fox '' 
 when a thing that 's essential is banned 
 's confusing and not in demand 
 's not even the facts 
 it 's the '' bosom of fox '' 
Coref score = -1
# (4)	Coref score = 0
# (5)	 he 
 '' she 's a lady a no-no '' he said 
 since some others would call up instead 
 '' this is good '' i said '' no ! 
 she sure stands in the buff ! '' 
 '' 's a lady a no-no '' he said 
 since some others would call up instead 
 '' this is good '' i said '' no ! 
 sure stands in the buff ! '' 
Coref score = -1
# (6)	 it 
 as you sit drooping near the bedlamite 
 please ignore looking up in a fright 
 at misfortune you start ; 
 it 's your own pan of heart 
 as you sit drooping near the bedlamite 
 please ignore looking up in a fright 
 at misfortune you start ; 
 it 's your own pan of heart 
Coref s

# Summary

Remaining problems:
3. Formal subject "it" (How to solve???)
4. Consider "this" "that" "these" "those" as pronouns?

Possible Solutions:
- Assign different penalty for different pronouns?
(Ex: "it" -0.2, "he"/"she"/"they" -1.0, "these" -0.5, etc.)
(How to estimate these values from language model probabilities?)