## Expansion on Approach 2: "capture all the strings in quotes, but not shorter strings that are not really quotes, and the speaker of each quote."


In [2]:
# load all the stuff we'll need
import spacy
from spacy import displacy
from spacy.matcher import Matcher

# Load the small English pipeline once; the Matchers in this notebook are all
# built from this pipeline's vocabulary.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)
matcher = Matcher(nlp.vocab)

In [3]:
# Read the five sample articles (UTF-8) and bind them to text1..text5.
article_paths = [
    "data/5c1dbe1d1e67d78e2797d611.txt",
    "data/5c1dccbf1e67d78e279807d8.txt",
    "data/5c1de1661e67d78e27984d34.txt",
    "data/5c1e0b68795bd2a5d03a49a9.txt",
    "data/5c1efb3d1e67d78e279bd39a.txt",
]

article_texts = []
for article_path in article_paths:
    # The context manager closes each file as soon as its contents are read.
    with open(article_path, "r", encoding='utf-8') as handle:
        article_texts.append(handle.read())

text1, text2, text3, text4, text5 = article_texts

In [7]:
# Run each raw text through the spaCy pipeline to obtain a Doc per article.
doc1, doc2, doc3, doc4, doc5 = (
    nlp(raw_text) for raw_text in (text1, text2, text3, text4, text5)
)

docList = [doc1, doc2, doc3, doc4, doc5]

# Preview: the first 50 tokens of every document, followed by a divider line.
for doc in docList:
    print(doc[:50], "------", sep="\n")

 CTV Vancouver. 
 An Abbotsford, B.C. couple that has been waiting nearly two years to bring their newly adopted son home from Africa has learned that the Canadian government is not prepared to grant the child citizenship. 
 Kim and Clark Moran received a letter this
------
When I first walk into his store, a fantastical comic book shop lodged discreetly in Kitchener’s Frederick Mall, I spot him from the corner of my eye, the gracious gatekeeper who invites visitors into a world of fantasy and makes their dreams come true. No
------
OTTAWA — Conservative Leader Andrew Scheer says Prime Minister Justin Trudeau is the most divisive prime minister in the history of Canada. 
 Scheer is lashing out at Trudeau and the Liberal party for dismissing anyone who disagrees with them, particularly anyone who has questions about Canada’s
------
Open this photo in gallery A cyclist rides by a shuttered homes in the West Point Grey neighbourhood area of Vancouver on Oct. 29, 2015. Ben Nelms. 
 Last we

In [10]:
# Match quote to speaker
# Expansion: To capture speaker, make use of speech verbs

speech_verbs = ["told", "confirmed", "notes", "says", "said"]

# At most this many matches are kept from each document.
MAX_MATCHES_PER_DOC = 10

# Both variations start the same way: an opening quote mark, one or more
# alphabetic tokens, then any run of punctuation (which is what absorbs the
# closing quote mark).
# NOTE(review): the quote body only accepts IS_ALPHA tokens, so a quote with
# punctuation or digits in the middle will generally not match -- confirm
# that this restriction is intended.
pattern_test = [

    # Variation 1: "Quote" Speaker speechverb.
    # Example: "The sky is blue." Speaker confirmed.
    [
        # Quote
        {"ORTH": {"IN": ['"', "“"]}},
        {"IS_ALPHA": True, "OP": "+"},
        {"IS_PUNCT": True, "OP": "*"},
        # Speaker + speech verb, optional trailing words, closing period.
        {"POS": "PROPN", "OP": "+"},
        {"ORTH": {"IN": speech_verbs}},
        {"IS_ALPHA": True, "OP": "*"},
        {"ORTH": "."},
    ],

    # Variation 2: "Quote" speechverb Speaker.
    # Example: "The sky is blue." says Speaker.
    [
        # Quote
        {"ORTH": {"IN": ['"', "“"]}},
        {"IS_ALPHA": True, "OP": "+"},
        {"IS_PUNCT": True, "OP": "*"},
        # Speech verb, optional words/punctuation, speaker, closing period.
        {"ORTH": {"IN": speech_verbs}},
        {"IS_ALPHA": True, "OP": "*"},
        # NOTE(review): two identical optional-punctuation tokens in a row;
        # kept exactly as written, but the second one looks redundant.
        {"IS_PUNCT": True, "OP": "*"},
        {"IS_PUNCT": True, "OP": "*"},
        {"POS": "PROPN", "OP": "+"},
        {"ORTH": "."},
    ],
]

matcher = Matcher(nlp.vocab)
matcher.add("QUOTE_WITH_SPEAKERS", pattern_test, greedy="LONGEST")

# One list of (match_id, start, end) tuples per document, in docList order.
matchList = [matcher(doc) for doc in docList]
# Per-document names are kept for any other cell that refers to them.
matches1, matches2, matches3, matches4, matches5 = matchList

# Collect the matched spans from every document. Pairing each document with
# its own matches avoids a hard-coded document count and a manual index.
quoteSpeakerList = [
    doc[start:end]
    for doc, matches in zip(docList, matchList)
    for _match_id, start, end in matches[:MAX_MATCHES_PER_DOC]
]

# Print every quote together with its speaker; a divider follows each match.
for quote_with_speaker in quoteSpeakerList:
    print(quote_with_speaker)
    print("----------")



"If Canada doesnt grant him citizenship what happens They send him back to Nigeria to an orphanage They take him from us even though he is legally our son" Kim said.
----------
"Honestly it feels like we are living our worst nightmare right now" Kim told CTV News Friday.
----------
“If Alfonso wasnt publishing local comics I dont think Xander would be doing what hes doing.” says Wises dad Darrin.
----------
“Im a comic book creator myself.” notes Espinos.
----------
“Every publisher rejected me.” says Cambridge cartoonist Andre Campbell.
----------
“People who have those legitimate concerns deserve to be spoken to with respect and have their concerns taken into account not brushed aside with insulting labels,” Scheer said.
----------
“I actually think Justin Trudeaus approach to label people who have legitimate concerns with his issues as being unCanadian and intolerant that is very dangerous,” said Scheer.
----------
“Canadians clearly rejected Stephen Harpers divisive approach in the

### Isolating Quotes

In [12]:
# Quote-only pattern: a quote mark, one or more alphabetic tokens, optional
# punctuation, then another quote mark. Straight and curly marks are accepted
# at both ends.
pattern_quote = [
    {"ORTH": {"IN": ['"',"“", '”']}},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": {"IN": ['"','“', '”']}},
]

matcherQuote = Matcher(nlp.vocab)
matcherQuote.add("QUOTES", [pattern_quote], greedy = "LONGEST")

# Run the quote matcher over each quote-plus-speaker span and print only the
# quoted part; start/end offsets index into the span that was matched.
for quote_with_speaker in quoteSpeakerList:
    for _, begin, stop in matcherQuote(quote_with_speaker):
        print(quote_with_speaker[begin:stop])
        print("-----------")


"If Canada doesnt grant him citizenship what happens They send him back to Nigeria to an orphanage They take him from us even though he is legally our son"
-----------
"Honestly it feels like we are living our worst nightmare right now"
-----------
“If Alfonso wasnt publishing local comics I dont think Xander would be doing what hes doing.”
-----------
“Im a comic book creator myself.”
-----------
“Every publisher rejected me.”
-----------
“People who have those legitimate concerns deserve to be spoken to with respect and have their concerns taken into account not brushed aside with insulting labels,”
-----------
“I actually think Justin Trudeaus approach to label people who have legitimate concerns with his issues as being unCanadian and intolerant that is very dangerous,”
-----------
“Canadians clearly rejected Stephen Harpers divisive approach in the last election which is the same approach the Conservatives are relying on now,”
-----------
"Its a positive thing for the workers at S

### Isolating Speakers

In [13]:
# GOAL: grab the speaker that follows (or precedes) the speech verb after each quote.
# Open questions:
#   - How can just the name be captured, and only after the quote?
#   - How can a non-speaker such as "CTV NEWS" be told apart from a speaker?
# Known limitation: this does not work yet. For the sample below it prints only
# "says Wises" instead of the full "says Wises dad Darrin".
# NOTE(review): presumably "dad" is not tagged PROPN, which ends the PROPN run
# early -- confirm by inspecting token.pos_ on the sample.

# SPEAKER + SPEECH VERB
speaker_then_verb = [{"POS": "PROPN", "OP": "+"}, {"ORTH": {"IN": speech_verbs}}]

# SPEECH VERB + SPEAKER
verb_then_speaker = [{"ORTH": {"IN": speech_verbs}}, {"POS": "PROPN", "OP": "+"}]

pattern_speaker = [speaker_then_verb, verb_then_speaker]

speakerMatcher = Matcher(nlp.vocab)
speakerMatcher.add("SPEAKER", pattern_speaker, greedy = "LONGEST")

# Try the matcher on a single collected quote (the third one).
sample_quote = quoteSpeakerList[2]
speakerMatches = speakerMatcher(sample_quote)
for _, begin, stop in speakerMatches:
    print(sample_quote[begin:stop])

# Show the full quote-plus-speaker span for comparison.
print(sample_quote)

says Wises
“If Alfonso wasnt publishing local comics I dont think Xander would be doing what hes doing.” says Wises dad Darrin.
