In [1]:
import spacy
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
import numpy as np
from nltk import Tree
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
### RELATION 
### Person leaves Country

### PATTERNS
### Person, lemma "leave", GPE
### Person, lemma "leave", GPE, Date
### Date, Person (pronoun), lemma "leave", GPE "for" GPE

In [3]:
# Load the spaCy English pipeline "en_core_web_md"; its tagger, parser and
# NER produce the POS tags, dependencies and entity types used by the
# patterns and the extractor below.
nlp = spacy.load("en_core_web_md")

In [4]:
# Sample sentences for the "person leaves country" extractor, one per line.
text = "\n".join([
    "Mr. Jonson is leaving America, but it’s only temporary.",
    "Ryan left Hungary on March 23.",
    "A few years ago, I left China for England with young children.",
])

In [5]:
# Parse every sentence, then show the entity view followed by the
# dependency view for each parsed sentence.
sentences = sent_tokenize(text)
docs = list(nlp.pipe(sentences))
for doc in docs:
    for view in ('ent', 'dep'):
        displacy.render(doc, style=view, jupyter=True)

In [6]:
# Rule-based token matcher that shares the pipeline's vocabulary.
# validate=True asks spaCy to check the structure of each pattern added.
matcher = Matcher(nlp.vocab, validate=True)

In [7]:
# Token-level patterns for the spaCy Matcher. Each dict describes one token;
# "OP" is a quantifier ("?" optional, "*" zero or more, "+" one or more).

# [date] [punct] <pronoun> [aux] leave <GPE> for <GPE> [date]
pattern_1 = [{"ENT_TYPE": "DATE", "OP": "*"},
             {"IS_PUNCT": True, "OP": "?"},
             {"POS": "PRON"},
             {"POS": "AUX", "OP": "?"},
             {"LEMMA": "leave"},
             {"ENT_TYPE": "GPE"},
             {"TEXT": "for"},
             {"ENT_TYPE": "GPE"},
             {"ENT_TYPE": "DATE", "OP": "?"}]

# <person> [aux] leave <GPE> <preposition> <date tokens...>
# The quantifier sits on the DATE token itself: a bare {"OP": "+"} would
# match any tokens and let the span run on past the date.
pattern_2 = [{"ENT_TYPE": "PERSON"},
             {"POS": "AUX", "OP": "?"},
             {"LEMMA": "leave"},
             {"ENT_TYPE": "GPE"},
             {"POS": "ADP"},
             {"ENT_TYPE": "DATE", "OP": "+"}]

# <person> [aux] leave <GPE>
pattern_3 = [{"ENT_TYPE": "PERSON"},
             {"POS": "AUX", "OP": "?"},
             {"LEMMA": "leave"},
             {"ENT_TYPE": "GPE"}]

In [8]:
# Register every pattern under its rule name. The second positional
# argument is the on-match callback slot, which is not used here.
for rule_name, rule_pattern in (
    ("Leave for", pattern_1),
    ("Leave_on_date", pattern_2),
    ("Leave", pattern_3),
):
    matcher.add(rule_name, None, rule_pattern)

In [9]:
def find_mathes(doc):
    """Extract a "person leaves country" relation from one parsed sentence.

    Runs the notebook-level ``matcher`` over ``doc``, keeps the longest
    matched span and reads the subject and date out of it. The departure
    and arrival places are read from the dependency tree around the verb
    "leave".

    NOTE(review): this function is added to the pipeline as a component but
    returns a list rather than the Doc, so it must remain the last
    component; ``nlp.pipe`` then yields these lists.

    Args:
        doc: a spaCy Doc carrying POS tags, a dependency parse and entities.

    Returns:
        ``[who, when, departure, arrival]`` -- four strings, each ``""``
        when the corresponding piece was not found.
    """
    spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
    # Longest match wins; on ties the first one reported is kept. With no
    # match at all the empty default makes every lookup below come up empty.
    longest = max(spans, key=len, default=())

    # Prefer a named person; fall back to a pronoun subject ("I", "he", ...).
    who_tokens = [token for token in longest if token.ent_type_ == "PERSON"]
    if not who_tokens:
        who_tokens = [token for token in longest if token.pos_ == "PRON"]
    # Join with spaces so multi-token names stay readable ("John Smith").
    who = " ".join(token.text for token in who_tokens)

    when = " ".join(token.text for token in longest if token.ent_type_ == "DATE")

    departure = ""
    arrival = ""
    for ent in doc.ents:
        verb = ent.root.head
        if verb.lemma_ != "leave":
            continue
        for child in verb.children:
            # Direct object of "leave" is the place being left.
            if child.dep_ == "dobj":
                departure = child.text
            # "leave X for Y": the object of "for" is the destination.
            if child.text == "for":
                for grandchild in child.children:
                    if grandchild.dep_ == "pobj":
                        arrival = grandchild.text

    return [who, when, departure, arrival]

In [10]:
# Put the extractor at the end of the pipeline. Re-running this cell (for
# example after editing the function) swaps the component in place instead
# of failing because the name is already registered.
if "find_mathes" in nlp.pipe_names:
    nlp.replace_pipe("find_mathes", find_mathes)
else:
    nlp.add_pipe(find_mathes)
print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'find_mathes']


In [11]:
# With the extractor as the last component, the pipeline yields one
# [who, when, departure, arrival] list per sentence.
relations = list(nlp.pipe(sent_tokenize(text)))
for who, when, departure, arrival in relations:
    print("Who: {}   When: {}    From: {}    To: {}".format(who, when, departure, arrival))

Who: Jonson   When:     From: America    To: 
Who: Ryan   When: March 23     From: Hungary    To: 
Who: I   When: A few years ago     From: China    To: England


# Part 2

In [12]:
# Read the whole book and split it into sentences. The context manager
# closes the file handle as soon as the text has been read.
with open('my_book.txt') as file_book:
    my_book = file_book.read()
sents = sent_tokenize(my_book)

In [13]:
# Rebind `nlp` to a freshly loaded "en_core_web_sm" pipeline. The earlier
# `nlp` object (with the custom extractor appended) is no longer referenced,
# so each sentence comes back as a Doc here.
# NOTE(review): the small model does not ship static word vectors -- confirm
# that its `.vector` values are what the clustering below should use.
nlp = spacy.load("en_core_web_sm")
sents_spacy = list(nlp.pipe(sents))

In [14]:
# Feature matrix for clustering: one vector per parsed sentence, in order.
X = [sent_doc.vector for sent_doc in sents_spacy]

In [15]:
# Fit k-means with 10 clusters on the sentence vectors; random_state is
# pinned so reruns produce the same clustering.
model = KMeans(n_clusters=10, random_state=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=1, tol=0.0001, verbose=0)

In [16]:
# Cluster label for each sentence vector, in the same order as X.
predictions = model.predict(X)

In [17]:
# Map each cluster label to the indices of the sentences assigned to it.
# The label range comes from the fitted model, so it stays in sync with
# n_clusters; every label gets a list even if no sentence landed in it.
cluster = {label: [] for label in range(model.n_clusters)}
# `sent_idx` rather than `id`, so the builtin is not shadowed notebook-wide.
for sent_idx, pred in enumerate(predictions):
    cluster[pred].append(sent_idx)

In [23]:
# Preview the first ten sentences assigned to cluster 1.
a = [sents_spacy[sent_idx] for sent_idx in cluster[1]]
print(a[:10])

[“Precisely., “Hum!, “Married!, “Mr., “Indeed!, II., “Alas!, In Victoria!, “‘What papers?, “7th.]


In [19]:
# Preview the first ten sentences assigned to cluster 2.
a = [sents_spacy[sent_idx] for sent_idx in cluster[2]]
print(a[:10])

[“Indeed, I should have thought a little more., I don’t know.”

“Quite so!, “What do you imagine that it means?”

“I have no data yet., “What do you make of that?” asked Holmes., I may want your help, and so may he., If not, I should much prefer to communicate with you alone.”

I rose to go, but Holmes caught me by the wrist and pushed me back into my chair., “You may say before this gentleman anything which you may say to me.”

The Count shrugged his broad shoulders., I am but thirty now.”

“It must be recovered.”

“We have tried and failed.”

“Your Majesty must pay., And she will do it., I know that she will do it.]


In [30]:
# Preview the first ten sentences assigned to cluster 6.
a = [sents_spacy[sent_idx] for sent_idx in cluster[6]]
print(a[:10])

[Come!, Well!, “Ha!, Great Scott!, “Oh, indeed!, “Oh, no, sir!, “Yes., “Oh, that!, Oh!, who?]


In [31]:
# Preview the first ten sentences assigned to cluster 8.
a = [sents_spacy[sent_idx] for sent_idx in cluster[8]]
print(a[:10])

[All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind., And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory., I had seen little of Holmes lately., His rooms were brilliantly lit, and, even as I looked up, I saw his tall, spare figure pass twice in a dark silhouette against the blind., He was at work again., His manner was not effusive., Just a trifle more, I fancy, Watson., You would certainly have been burned, had you lived a few centuries ago., Obviously they have been caused by someone who has very carelessly scraped round the edges of the sole in order to remove crusted mud from it., You have not observed.]
