In [1]:
import spacy
import pytextrank

# example text
text = """Compatibility of systems of linear constraints over the set of natural numbers.
Criteria of compatibility of a system of linear Diophantine equations, strict inequations,
and nonstrict inequations are considered. Upper bounds for components of a minimal set of
solutions and algorithms of construction of minimal generating sets of solutions for all types
of systems are given. These criteria and the corresponding algorithms for constructing a minimal
supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."""

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")
# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

doc = nlp(text)
# examine the top-ranked phrases in the document
for phrase in doc._.phrases:
    print(phrase.text)
    print(phrase.rank, phrase.count)
    print(phrase.chunks)

mixed types
0.16913297519493323 1
[mixed types]
minimal generating sets
0.15718190861803338 1
[minimal generating sets]
systems
0.15706451042108485 3
[systems, systems, systems]
nonstrict inequations
0.14754717053336916 1
[nonstrict inequations]
strict inequations
0.13924933655642574 1
[strict inequations]
natural numbers
0.11617695958368299 1
[natural numbers]
linear Diophantine equations
0.11604222593721583 1
[linear Diophantine equations]
solutions
0.11339988816808184 3
[solutions, solutions, solutions]
linear constraints
0.10253057199702881 1
[linear constraints]
all the considered types systems
0.08793429413519571 1
[all the considered types systems]
a minimal set
0.08280312704561263 1
[a minimal set]
algorithms
0.08242870171999762 1
[algorithms]
construction
0.07382060769521386 1
[construction]
a system
0.0725449898974001 1
[a system]
Diophantine
0.07111006100211886 1
[Diophantine]
all types
0.06977165930609519 1
[all types]
a minimal
supporting set
0.06824144622770673 1
[a minim

In [2]:
#Echemos un vistazo a esta canalización ahora...
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'textrank']

In [3]:
# Podemos examinar la spaCytubería con mucho más detalle...
nlp.analyze_pipes(pretty=True)

[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'textrank': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  't

In [4]:
#A continuación, carguemos texto de un documento desde mi equipo :
from icecream import ic
import pathlib

text = pathlib.Path("C:/Users/Dellç/Desktop/Informatica.txt").read_text()
text


'La InformÃ¡tica es la rama de la IngenierÃ\xada que estudia el hardware, las redes de datos y el software necesarios para tratar informaciÃ³n de forma automÃ¡tica. Aunque pueda parecerte una definiciÃ³n muy abstracta, estamos seguros de que sabes mucho mÃ¡s de InformÃ¡tica de lo que crees. Y si no, sigue leyendo un poco mÃ¡s.\nSeguro que te suena quÃ© es el hardware. Y si no te suena, seguro que has utilizado hardware en muchas ocasiones sin saber que se llama asÃ\xad. El hardware son los ordenadores de sobremesa, los portÃ¡tiles, los tablets, los telÃ©fonos mÃ³viles, las impresoras, las consolas de videojuegos, los lectores de DVDs, los reproductores de mÃºsica, etcÃ©tera. Â¿A que sÃ\xad que sabÃ\xadas quÃ© es el hardware? Lo que quizÃ¡ no sabÃ\xadas es que estos aparatos estÃ¡n formados internamente por componentes electrÃ³nicos a los que tambiÃ©n se les llama hardware: Â¿te suenan el microprocesador, las tarjetas de memoria, las tarjetas grÃ¡ficas, los discos duros o los acelerÃ³me

In [5]:
#Longitud del texto cargado
doc = nlp(text)
len(doc)

258

In [6]:
#acceder al componente PyTextRank dentro de la spaCycanalización 
#usarlo para obtener más información para el procesamiento posterior del documento
tr = doc._.textrank
ic(tr.elapsed_time);

ic| tr.elapsed_time: 0.0


In [7]:
for phrase in doc._.phrases:
    ic(phrase.rank, phrase.count, phrase.text)
    ic(phrase.chunks)

ic| phrase.rank: 0.12084959385708814
    phrase.count: 2
    phrase.text: 'los lectores de DVDs'
ic| phrase.chunks: [los lectores de DVDs, los lectores de DVDs]
ic| phrase.rank: 0.12084933746609591
    phrase.count: 1
    phrase.text: 'los telÃ©fonos mÃ³viles'
ic| phrase.chunks: [los telÃ©fonos mÃ³viles]
ic| phrase.rank: 0.12039557687926086
    phrase.count: 1
    phrase.text: 'los reproductores de mÃºsica'
ic| phrase.chunks: [los reproductores de mÃºsica]
ic| phrase.rank: 0.12026071863463483
    phrase.count: 1
    phrase.text: 'los ordenadores de sobremesa'
ic| phrase.chunks: [los ordenadores de sobremesa]
ic| phrase.rank: 0.12012628961194063
    phrase.count: 6
    phrase.text: 'los'
ic| phrase.chunks: [los, los, los, los, los, los]
ic| phrase.rank: 0.1198230470372881
    phrase.count: 1
    phrase.text: 'los semÃ¡foros'
ic| phrase.chunks: [los semÃ¡foros]
ic| phrase.rank: 0.11892416238723205
    phrase.count: 2
    phrase.text: 'los portÃ¡tiles'
ic| phrase.chunks: [los portÃ¡tiles,

    phrase.text: 'Aunque pueda parecerte una'
ic| phrase.chunks: [Aunque pueda parecerte una]
ic| phrase.rank: 0.03508413039924136
    phrase.count: 1
    phrase.text: 'estamos'
ic| phrase.chunks: [estamos]
ic| phrase.rank: 0.03334302162271478
    phrase.count: 2
    phrase.text: 'quÃ'
ic| phrase.chunks: [quÃ, quÃ]
ic| phrase.rank: 0.029684891318027366
    phrase.count: 1
    phrase.text: 'seguro'
ic| phrase.chunks: [seguro]
ic| phrase.rank: 0.028606448553700946
    phrase.count: 1
    phrase.text: 'un'
ic| phrase.chunks: [un]
ic| phrase.rank: 0.02720005179766699
    phrase.count: 1
    phrase.text: 'informaciÃ³n de forma automÃ¡tica'
ic| phrase.chunks: [informaciÃ³n de forma automÃ¡tica]
ic| phrase.rank: 0.02620434031437227
    phrase.count: 1
    phrase.text: 'Y aunque'
ic| phrase.chunks: [Y aunque]
ic| phrase.rank: 0.019774743540194373
    phrase.count: 1
    phrase.text: 'de forma automÃ¡tica'
ic| phrase.chunks: [de forma automÃ¡tica]
ic| phrase.rank: 0.01967896744457015
    phrase

In [8]:
text = pathlib.Path("C:/Users/Dellç/Desktop/Informatica.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:10]:
    ic(phrase)


ic| phrase: Phrase(text='los lectores de DVDs',
                   chunks=[los lectores de DVDs, los lectores de DVDs],
                   count=2,
                   rank=0.12084959385708814)
ic| phrase: Phrase(text='los telÃ©fonos mÃ³viles',
                   chunks=[los telÃ©fonos mÃ³viles],
                   count=1,
                   rank=0.12084933746609591)
ic| phrase: Phrase(text='los reproductores de mÃºsica',
                   chunks=[los reproductores de mÃºsica],
                   count=1,
                   rank=0.12039557687926086)
ic| phrase: Phrase(text='los ordenadores de sobremesa',
                   chunks=[los ordenadores de sobremesa],
                   count=1,
                   rank=0.12026071863463483)
ic| phrase: Phrase(text='los',
                   chunks=[los, los, los, los, los, los],
                   count=6,
                   rank=0.12012628961194063)
ic| phrase: Phrase(text='los semÃ¡foros',
                   chunks=[los semÃ¡foros],
        