In [25]:
import spacy
from spacy.matcher import Matcher

In [26]:
# Load spaCy's small English pipeline ("en_core_web_sm").
nlp = spacy.load("en_core_web_sm")

In [27]:
# Create a Matcher that shares the vocabulary of the loaded pipeline,
# define a single-token pattern for tokens that look like an e-mail address,
# and register that pattern with the matcher.
# NOTE(review): the key is spelled "EMAIL_ADRESS"; it is left as-is because it
# is a runtime string that the matcher reports back.
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADRESS", [pattern])

In [28]:
# Run the pipeline on a sample sentence and apply the matcher to the resulting doc.
doc = nlp("This is an email adress: wmattingly@aol.com")
matches = matcher(doc)

In [29]:
# Show the raw matcher results (tuples whose last two items are used below as token offsets).
print(matches)

[(2197859665807148658, 6, 7)]


In [33]:
# Look up the first item of the first match in the vocabulary to get its text form.
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADRESS


In [1]:
# Read the whole text file that the following cells analyse.
# The encoding is passed explicitly so that non-ASCII characters in the file
# are decoded the same way on every platform instead of relying on the
# locale-dependent default.
with open("wiki_mlk.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [2]:
# Display the text that was just read from the file.
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.

King participated in and led marches for blacks' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his famous 

In [4]:
# Import spaCy and load the small English pipeline ("en_core_web_sm").
import spacy
nlp = spacy.load("en_core_web_sm")

In [7]:
# Match every single token tagged as a proper noun (POS == "PROPN"), then
# print the total number of matches and the first 10 matches together with
# the text each one covers.
from spacy.matcher import Matcher

doc = nlp(text)

pattern = [{"POS": "PROPN"}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern])

matches = matcher(doc)
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

101
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 49, 50) King


In [8]:
# Adding the "+" operator lets the pattern cover one or more consecutive
# proper nouns instead of isolated tokens only. No greedy option is set here.
doc = nlp(text)

pattern = [{"POS": "PROPN", "OP": "+"}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern])

matches = matcher(doc)
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

174
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [9]:
# Same pattern as before, but registered with greedy="LONGEST" so that only
# the longest run of proper nouns is kept instead of every proper noun and
# every partial combination of them.
doc = nlp(text)

pattern = [{"POS": "PROPN", "OP": "+"}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

matches = matcher(doc)
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

60
(451313080118390996, 83, 88) Martin Luther King Sr.
(451313080118390996, 469, 474) Martin Luther King Jr. Day
(451313080118390996, 536, 541) Martin Luther King Jr. Memorial
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 128, 132) Southern Christian Leadership Conference
(451313080118390996, 247, 251) Director J. Edgar Hoover
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 325, 328) Nobel Peace Prize
(451313080118390996, 422, 425) James Earl Ray
(451313080118390996, 463, 466) Congressional Gold Medal


In [10]:
# Sort the matches by their start offset (second tuple item) so the proper
# nouns are listed in the order in which they appear in the text.
doc = nlp(text)

pattern = [{"POS": "PROPN", "OP": "+"}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

60
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 49, 50) King
(451313080118390996, 69, 71) Mahatma Gandhi
(451313080118390996, 83, 88) Martin Luther King Sr.
(451313080118390996, 89, 90) King
(451313080118390996, 113, 114) King
(451313080118390996, 117, 118) Montgomery


In [11]:
# Instead of proper nouns alone, the pattern now requires one or more proper
# nouns immediately followed by a token tagged as a verb.
doc = nlp(text)

pattern = [
    {"POS": "PROPN", "OP": "+"},
    {"POS": "VERB"},
]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

7
(451313080118390996, 49, 51) King advanced
(451313080118390996, 89, 91) King participated
(451313080118390996, 113, 115) King led
(451313080118390996, 167, 169) King helped
(451313080118390996, 247, 252) Director J. Edgar Hoover considered
(451313080118390996, 322, 324) King won
(451313080118390996, 485, 488) United States beginning


In [12]:
# Import the standard-library json module and parse the JSON file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default text encoding.
import json
with open("alice.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [13]:
# Pick data[0][2][0], which the notebook author identifies as the first
# paragraph of the first chapter, and display it.
text = data[0][2][0]
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'


In [15]:
# Replace every backtick with a straight single quote and display the result.
text = text.replace("`", "'")
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [16]:
# Pattern for a quoted passage: an opening single quote, one or more
# alphabetic tokens, zero or more punctuation tokens, and a closing single quote.
# The match key is still "PROPER_NOUN" even though the pattern targets quotations.
doc = nlp(text)

pattern = [
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

2
(451313080118390996, 47, 58) 'and what is the use of a book,'
(451313080118390996, 60, 67) 'without pictures or conversation?'


In [18]:
# The search is extended step by step: after the quoted passage the pattern
# now requires a verb whose lemma is "think" or "say", followed by one or
# more proper nouns.
speak_lemmas = ["think", "say"]
doc = nlp(text)

pattern = [
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
    {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},
    {"POS": "PROPN", "OP": "+"},
]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

1
(451313080118390996, 47, 60) 'and what is the use of a book,' thought Alice


In [19]:
# One more extension: after the speech verb and the proper noun(s), the
# pattern requires a second passage enclosed in single quotes.
speak_lemmas = ["think", "say"]
doc = nlp(text)

pattern = [
    # first quoted passage
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
    # speech verb and speaker
    {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},
    {"POS": "PROPN", "OP": "+"},
    # second quoted passage
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [22]:
# Go through every entry of data[0][2], replace backticks with single quotes
# and apply the matcher to it, printing the match count and up to 10 matches.
# This cell does not build a matcher itself: it uses whichever `matcher`
# an earlier cell left defined.
for paragraph in data[0][2]:
    text = paragraph.replace("`", "'")
    doc = nlp(text)
    matches = sorted(matcher(doc), key=lambda m: m[1])
    print(len(matches))
    for match_id, start, end in matches[:10]:
        print((match_id, start, end), doc[start:end])
    

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [23]:
# Register several patterns under one key so that more kinds of reported
# speech are found. The three patterns are assembled from shared fragments:
#   pattern1: quote, speech verb, speaker, quote
#   pattern2: quote, speech verb, speaker
#   pattern3: speaker, speech verb, quote
# `speak_lemmas` is expected to be defined by an earlier cell.
quoted = [
    {'ORTH': "'"},
    {'IS_ALPHA': True, "OP": "+"},
    {'IS_PUNCT': True, "OP": "*"},
    {'ORTH': "'"},
]
speech_verb = [{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}]
speaker = [{"POS": "PROPN", "OP": "+"}]

pattern1 = quoted + speech_verb + speaker + quoted
pattern2 = quoted + speech_verb + speaker
pattern3 = speaker + speech_verb + quoted

matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST')

for paragraph in data[0][2]:
    text = paragraph.replace("`", "'")
    doc = nlp(text)
    matches = sorted(matcher(doc), key=lambda m: m[1])
    print(len(matches))
    for match_id, start, end in matches[:10]:
        print((match_id, start, end), doc[start:end])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
1
(3232560085755078826, 0, 6) 'Well!' thought Alice
0
0
0
0
0
0
0
1
(3232560085755078826, 57, 68) 'which certainly was not here before,' said Alice
0
0
