# Textacy & Spacy
Librerias de procesado de NLP

In [4]:
'''
Versiones compatibles
Despues hay que reiniciar el entorno de ejecución
'''
# !pip install spacy
# !pip install textacy

# # Para siguientes ejecuciones, solo ejecutar esto y reiniciar despues el entorno
# !python -m spacy download en_core_web_lg
# !python -m spacy download es_core_news_lg


Collecting es-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.7.0/es_core_news_lg-3.7.0-py3-none-any.whl (568.0 MB)
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/568.0 MB ? eta -:--:--
     ------------

# Spacy
https://spacy.io/

NOTA: Recuerda reiniciar el entorno de ejecucion despues de la instalacion

In [5]:
import spacy

nlp = spacy.load('en_core_web_lg')

## Text basics
Veamos como trabajar cn estos primeros ejemplos con la libreria ´spacy´. Cosas que podemos hacer:
1. Tokenizar en frases
2. Tokenizar en palabras
3. Acceder a los atributos de cada token
4. Acceder a las entidades del texto
5. Visualizar las entidades del texto

In [6]:
# The text we want to examine
text = """
London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.
"""

doc = nlp(text)

print(doc)

for num, sentence in enumerate(doc.sents):
  print(num, sentence)



London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.

0 
London is the capital and most populous city of England and 
the United Kingdom.  
1 Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia.
2 It was founded by the Romans, who named it Londinium.



In [7]:
for word in doc[:20]:
  print(word.text, word.lemma_, word.pos_, word.is_stop)


 
 SPACE False
London London PROPN False
is be AUX True
the the DET True
capital capital NOUN False
and and CCONJ True
most most ADV True
populous populous ADJ False
city city NOUN False
of of ADP True
England England PROPN False
and and CCONJ True

 
 SPACE False
the the DET True
United United PROPN False
Kingdom Kingdom PROPN False
. . PUNCT False
    SPACE False
Standing stand VERB False
on on ADP True


In [8]:
type(doc)

spacy.tokens.doc.Doc

## Análisis sintáctico
Hacer los deberes

In [10]:
from spacy import displacy

doc2 = nlp("London is the capital and most populous city of England and the United Kingdom")
displacy.render(doc2, jupyter=True, style="dep")

## Entidades en texto

In [11]:
for entity in doc.ents:
  print(entity.text, entity.label_)

London GPE
England GPE
the United Kingdom GPE
the River Thames LOC
south east LOC
Great Britain GPE
London GPE
two millennia DATE
Romans NORP
Londinium GPE


In [14]:
# Doubts with labels
spacy.explain('NORP')

'Nationalities or religious or political groups'

In [15]:
displacy.render(doc, style='ent', jupyter=True)

## Sustitución de nombres
Ocultar nombres para GDPR

In [16]:
# Replace a token with "REDACTED" if it is a name
def replace_name_with_placeholder(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "GDPR"
    else:
        return token.text

# Loop through all the entities in a document and check if they are names
def scrub(text):
    doc = nlp(text)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_name_with_placeholder, doc)
    return " ".join(tokens)

s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence". In 1957, Noam Chomsky’s 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.
"""

print(scrub(s))


 In 1950 , GDPR published his famous article " Computing Machinery and Intelligence " . In 1957 , GDPR ’s 
 Syntactic Structures revolutionized Linguistics with ' universal grammar ' , a rule based system of syntactic structures . 



## Lematize

In [18]:
for w in doc:
  print(w.text, w.lemma_, w.pos_)


 
 SPACE
London London PROPN
is be AUX
the the DET
capital capital NOUN
and and CCONJ
most most ADV
populous populous ADJ
city city NOUN
of of ADP
England England PROPN
and and CCONJ

 
 SPACE
the the DET
United United PROPN
Kingdom Kingdom PROPN
. . PUNCT
    SPACE
Standing stand VERB
on on ADP
the the DET
River River PROPN
Thames Thames PROPN
in in ADP
the the DET
south south PROPN
east east PROPN

 
 SPACE
of of ADP
the the DET
island island NOUN
of of ADP
Great Great PROPN
Britain Britain PROPN
, , PUNCT
London London PROPN
has have AUX
been be AUX
a a DET
major major ADJ
settlement settlement NOUN

 
 SPACE
for for ADP
two two NUM
millennia millennium NOUN
. . PUNCT
It it PRON
was be AUX
founded found VERB
by by ADP
the the DET
Romans Romans PROPN
, , PUNCT
who who PRON
named name VERB
it it PRON
Londinium Londinium PROPN
. . PUNCT

 
 SPACE


## Stopwords

In [19]:
from spacy.lang.en.stop_words import STOP_WORDS

print(list(STOP_WORDS)[:39])

['ever', '‘ll', 'on', 'above', 'became', 'them', 'serious', '‘m', 'ours', 'beyond', 'off', 'does', 'along', 'while', 'nine', 'sometimes', 'neither', 'mine', 'something', '‘ve', 'less', '’re', 'yours', 'she', 'using', 'even', 'against', 're', 'me', '’s', 'six', 'bottom', '’ll', 'into', 'he', 'one', 'be', 'both', 'empty']


In [20]:
lista_clean = [palabra for palabra in doc if not palabra.is_stop and not palabra.is_punct]
print(lista_clean)

[
, London, capital, populous, city, England, 
, United, Kingdom,  , Standing, River, Thames, south, east, 
, island, Great, Britain, London, major, settlement, 
, millennia, founded, Romans, named, Londinium, 
]


# Español
#Español y entidades

In [21]:
nlp_es = spacy.load('es_core_news_lg')

text = '''Londres (en inglés, London, pronunciado /ˈlʌndən/ ( escuchar)) es la capital y mayor ciudad de Inglaterra y del Reino Unido.2​3​ Situada a orillas del río Támesis, Londres es un importante asentamiento humano desde que fue fundada por los romanos con el nombre de Londinium hace casi dos milenios.4​ El núcleo antiguo de la urbe, la City de Londres, conserva básicamente su perímetro medieval de una milla cuadrada. Desde el siglo XIX el nombre «Londres» también hace referencia a toda la metrópolis desarrollada alrededor de este núcleo.5​ El grueso de esta conurbación forma la región de Londres y el área administrativa del Gran Londres,6​ gobernado por el alcalde y la asamblea de Londres.7​
Londres es una ciudad global, uno de los centros neurálgicos en el ámbito de las artes, el comercio, la educación, el entretenimiento, la moda, las finanzas, los medios de comunicación, la investigación, el turismo o el transporte.8​ Es el principal centro financiero del mundo9​10​11​ y una de las áreas metropolitanas con mayor PIB.12​13​ Londres es también una capital cultural mundial,14​15​16​17​ la ciudad más visitada considerando el número de visitas internacionales18​ y tiene el mayor sistema aeroportuario del mundo según el tráfico de pasajeros.19​ Asimismo, las 43 universidades de la ciudad conforman la mayor concentración de centros de estudios superiores de toda Europa.20​ En el año 2012 Londres se convirtió en la única ciudad en albergar la celebración de tres Juegos Olímpicos de Verano.21​
En esta ciudad multirracial convive gente de un gran número de culturas que hablan más de trescientos idiomas distintos.22​ La Autoridad del Gran Londres estima que en 2015 la ciudad tiene 8,63 millones de habitantes,23​ que supone el 12,5 % del total de habitantes del Reino Unido.24​ El área urbana del Gran Londres, con 10 470 00025​ habitantes, es la segunda más grande de Europa, pero su área metropolitana, con una población estimada de entre 12 y 14 millones,26​27​ es la mayor del continente. Desde 1831 a 1925 Londres, como capital del Imperio británico, fue la ciudad más poblada del mundo.'''
doc = nlp_es(text)

# Para ver las entidades que ha detectado
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")


Londres (LOC)
London (LOC)
Inglaterra (LOC)
Reino Unido.2​3​ Situada (LOC)
Támesis (LOC)
Londres (LOC)
Londinium (LOC)
City de Londres (LOC)
Londres (LOC)
Londres (LOC)
Gran Londres,6​ (LOC)
Londres (LOC)
mundo9​10​11​ (LOC)
Londres (LOC)
Londres (LOC)
Juegos Olímpicos de Verano.21​
En esta ciudad multirracial (MISC)
La Autoridad del Gran Londres (MISC)
Reino Unido.24​ El área urbana del (LOC)
Gran Londres (LOC)
Europa (LOC)
millones,26​27​ (PER)
Londres (LOC)
Imperio británico (LOC)


In [22]:
# Doubts with labels
spacy.explain('MISC')

'Miscellaneous entities, e.g. events, nationalities, products or works of art'

## Palabras más frecuentes
En una página de Wikipedia

In [23]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py): started
  Building wheel for wikipedia (setup.py): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11704 sha256=3340527e42be43f70ea5c8b974c0920fa86bbffc1e79413bbced6b6aedb6ebc0
  Stored in directory: c:\users\ortiz\appdata\local\pip\cache\wheels\8f\ab\cb\45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [24]:
import wikipedia
wikipedia.set_lang("en")
pedro = wikipedia.page(title="Karim Benzema")
text = pedro.content
text[0:1000]

"Karim Mostafa Benzema (Arabic: كريم مصطفى بن زيما; born 19 December 1987) is a French professional footballer who plays as a striker for and captains Saudi Pro League club Al-Ittihad. Regarded as one of the best strikers of all time, he is a creative forward known for his technical skills, vision and versatility on the field, Benzema is Real Madrid's all-time second-highest goalscorer and top assist provider. He won 24 trophies with Real Madrid, including four La Liga, three Copa del Rey, and five UEFA Champions League titles.\nBorn in Lyon to parents of Algerian descent, Benzema began his career with hometown club Lyon in 2005, contributing sporadically to three Ligue 1 title wins. In 2008, he was named the league's Player of the Year and in the Team of the Year having finished as the league's top goalscorer and winning his fourth league title and first Coupe de France. In 2009, Benzema was the subject of a then-French record football transfer when he joined Real Madrid in a deal wor

In [25]:
nombres = [w.text for w in nlp(text) if not w.is_stop and not w.is_punct and w.pos_ == 'ADJ']
nombres[:10]

['French',
 'professional',
 'best',
 'creative',
 'technical',
 'highest',
 'Algerian',
 'fourth',
 'French',
 'worth']

In [26]:
from collections import Counter
word_freq = Counter(nombres)

word_freq.most_common(10)[1:]

[('final', 31),
 ('French', 30),
 ('new', 18),
 ('national', 17),
 ('international', 12),
 ('consecutive', 10),
 ('senior', 10),
 ('highest', 9),
 ('friendly', 9)]

## Texto Textacy Londres
No queremos saber cosas sobre Londres, de nuestro texto anterior

In [27]:
import spacy
import textacy.extract

#### CODE ####

# The text we want to examine
text = """London is the capital and most populous city of England and the United Kingdom. Standing on the 
River Thames in the south east of the island of Great Britain, London has been a major settlement for two millennia. 
London was founded by the Romans, who named it Londinium. London is a huge city. London has a lot of cute restaurants.
"""

doc = nlp(text)
statements = textacy.extract.semistructured_statements(doc, entity="London", cue='be')

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")


Here are the things I know about London:
 - [the, capital, and, most, populous, city, of, England, and, the, United, Kingdom]
 - [a, major, settlement, for, two, millennia]
 - [a, huge, city]


## Textacy con Wikipedia API
No queremos las palabras más frecuentes, queremos frases sobre Londres

In [28]:
wikipedia.set_lang("en")
london = wikipedia.page("London")
text = london.content

In [29]:
doc = nlp(text)

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, entity="London", cue='be')

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")

Here are the things I know about London:
 - [the, capital, and, largest, city, of, England]
 - [one, of, Europe, 's, most, economically, powerful, cities]
 - [the, most, visited, city, in, Europe]
 - [an, ancient, name]
 - [the, largest, town, in, England]
 - [the, focus, of, the, Peasants, ', Revolt, in, 1381.London]
 - [the, world, 's, largest, city]
 - [the, seat, of, the, Government, of, the, United, Kingdom]
 - [one, of, the, World, 's, Greenest, Cities, ", with, more, than, 40, per, cent, green, space, or, open, water]
 - [the, second, most, populous, metropolitan, area, in, Europe]
 - [the, 19th, largest, city, and, the, 18th, largest, metropolitan, region]
 - [the, world, 's, most, expensive, office, market]
 - [one, of, the, pre, -, eminent, financial, centres, of, the, world, as, the, most, important, location, for, international, finance]
 - [the, leading, financial, centre]
 - [one, of, the, world, 's, largest, retail, destinations, ,]
 - [one, of, the, leading, tourist, de