In [8]:
import flashtext
from nltk.tokenize import sent_tokenize

In [9]:
text = """An algorithm combining neural networks with fundamental parameters.
An algorithm combining neural networks with the fundamental parameters equations (NNFP) is proposed for making
corrections for non-linear matrix effects in x-ray fluorescence analysis. In the algorithm, neural networks were
applied to relate the concentrations of components to both the measured intensities and the relative theoretical
intensities calculated by the fundamental parameter equations. The NNFP algorithm is compared with the classical
theoretical correction models, including the fundamental parameters approach, the Lachance-Traill model, a
hyperbolic function model and the COLA algorithm. For an alloy system with 15 measured elements, in most cases,
the prediction errors of the NNFP algorithm are lower than those of the fundamental parameters approach, the
Lachance-Traill model, the hyperbolic function model and the COLA algorithm separately. If there are the serious
matrix effects, such as matrix effects among Cr, Fe and Ni, the NNFP algorithm generally decreased predictive
errors as compared with the classical models, except for the case of Cr by the fundamental parameters approach.
The main reason why the NNFP algorithm has generally a better predictive ability than the classical theoretical
correction models might be that neural networks can better calibrate the non-linear matrix effects in a complex
multivariate system.""".replace("\n", " ")


# TfIdf
tf–idf (also TF*IDF, TFIDF, TF–IDF, or Tf–idf), short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. tf–idf is one of the most popular term-weighting schemes today. A survey conducted in 2015 showed that 83% of text-based recommender systems in digital libraries use tf–idf.

Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields, including text summarization and classification.

One of the simplest ranking functions is computed by summing the tf–idf for each query term; many more sophisticated ranking functions are variants of this simple model

![image.png](attachment:image.png)

In [1]:
import pke

extractor = pke.unsupervised.TfIdf()        
extractor.load_document(input=text , language="en")       
extractor.candidate_selection()             
extractor.candidate_weighting()             
keyphrases = extractor.get_n_best(n=15)

KeyError: 'hinglish'

In [11]:
for i , (candidates, score) in  enumerate(keyphrases):
    print (f"Rank {i+1} Phrase: ({candidates}) with score ({score})")

Rank 1 Phrase: (fundamental parameters) with score (37.07945454008961)
Rank 2 Phrase: (the fundamental parameters) with score (35.89954545007467)
Rank 3 Phrase: (nnfp) with score (35.89954545007467)
Rank 4 Phrase: (matrix effects) with score (28.719636360059738)
Rank 5 Phrase: (the nnfp) with score (28.719636360059738)
Rank 6 Phrase: (the nnfp algorithm) with score (28.719636360059738)
Rank 7 Phrase: (nnfp algorithm) with score (28.719636360059738)
Rank 8 Phrase: (fundamental parameters approach) with score (21.539727270044803)
Rank 9 Phrase: (parameters approach) with score (21.539727270044803)
Rank 10 Phrase: (neural networks) with score (19.43192398051029)
Rank 11 Phrase: (the fundamental) with score (19.28990497563786)
Rank 12 Phrase: (neural) with score (18.379786357175114)
Rank 13 Phrase: (an algorithm combining) with score (14.359818180029869)
Rank 14 Phrase: (algorithm combining) with score (14.359818180029869)
Rank 15 Phrase: (algorithm combining neural) with score (14.3598181

In [12]:
for i, candidate in enumerate(extractor.candidates):
    
    # print out the candidate id, its stemmed form 
    print("candidate {}: {} (stemmed form)".format(i, candidate))
    
    # print out the surface forms of the candidate
    print(" - surface forms:", [ " ".join(u) for u in extractor.candidates[candidate].surface_forms])
    
    # print out the corresponding offsets
    print(" - offsets:", extractor.candidates[candidate].offsets)
    
    # print out the corresponding sentence ids
    print(" - sentence_ids:", extractor.candidates[candidate].sentence_ids)
    
    # print out the corresponding PoS patterns
    print(" - pos_patterns:", extractor.candidates[candidate].pos_patterns)

candidate 0: an algorithm (stemmed form)
 - surface forms: ['An algorithm', 'An algorithm']
 - offsets: [0, 9]
 - sentence_ids: [0, 1]
 - pos_patterns: [['DET', 'NOUN'], ['DET', 'NOUN']]
candidate 1: an algorithm combin (stemmed form)
 - surface forms: ['An algorithm combining', 'An algorithm combining']
 - offsets: [0, 9]
 - sentence_ids: [0, 1]
 - pos_patterns: [['DET', 'NOUN', 'VERB'], ['DET', 'NOUN', 'VERB']]
candidate 2: algorithm (stemmed form)
 - surface forms: ['algorithm', 'algorithm', 'algorithm', 'algorithm', 'algorithm', 'algorithm', 'algorithm', 'algorithm', 'algorithm']
 - offsets: [1, 10, 42, 73, 102, 123, 147, 171, 201]
 - sentence_ids: [0, 1, 2, 3, 3, 4, 4, 5, 6]
 - pos_patterns: [['NOUN'], ['NOUN'], ['NOUN'], ['NOUN'], ['NOUN'], ['NOUN'], ['NOUN'], ['PROPN'], ['PROPN']]
candidate 3: algorithm combin (stemmed form)
 - surface forms: ['algorithm combining', 'algorithm combining']
 - offsets: [1, 10]
 - sentence_ids: [0, 1]
 - pos_patterns: [['NOUN', 'VERB'], ['NOUN', 'V

 - pos_patterns: [['VERB', 'DET']]
candidate 141: includ the fundament (stemmed form)
 - surface forms: ['including the fundamental']
 - offsets: [83]
 - sentence_ids: [3]
 - pos_patterns: [['VERB', 'DET', 'ADJ']]
candidate 142: fundament paramet approach (stemmed form)
 - surface forms: ['fundamental parameters approach', 'fundamental parameters approach', 'fundamental parameters approach']
 - offsets: [85, 130, 191]
 - sentence_ids: [3, 4, 5]
 - pos_patterns: [['ADJ', 'NOUN', 'VERB'], ['ADJ', 'NOUN', 'VERB'], ['ADJ', 'NOUN', 'VERB']]
candidate 143: paramet approach (stemmed form)
 - surface forms: ['parameters approach', 'parameters approach', 'parameters approach']
 - offsets: [86, 131, 192]
 - sentence_ids: [3, 4, 5]
 - pos_patterns: [['NOUN', 'VERB'], ['NOUN', 'VERB'], ['NOUN', 'VERB']]
candidate 144: approach (stemmed form)
 - surface forms: ['approach', 'approach', 'approach']
 - offsets: [87, 132, 193]
 - sentence_ids: [3, 4, 5]
 - pos_patterns: [['VERB'], ['VERB'], ['VERB']]
c

 - sentence_ids: [6]
 - pos_patterns: [['NOUN']]
candidate 321: multivari system (stemmed form)
 - surface forms: ['multivariate system']
 - offsets: [231]
 - sentence_ids: [6]
 - pos_patterns: [['NOUN', 'NOUN']]


In [13]:
for i, sentence in enumerate(extractor.sentences):
    
    # print out the sentence id, its tokens, its stems and the corresponding Part-of-Speech tags
    print("sentence {}:".format(i))
    print(" - words: {} ...".format(' '.join(sentence.words[:15])))
    print(" - stems: {} ...".format(' '.join(sentence.stems[:15])))
    print(" - PoS: {} ...".format(' '.join(sentence.pos[:15])))

sentence 0:
 - words: An algorithm combining neural networks with fundamental parameters . ...
 - stems: an algorithm combin neural network with fundament paramet . ...
 - PoS: DET NOUN VERB ADJ NOUN ADP ADJ NOUN PUNCT ...
sentence 1:
 - words: An algorithm combining neural networks with the fundamental parameters equations ( NNFP ) is proposed ...
 - stems: an algorithm combin neural network with the fundament paramet equat ( nnfp ) is propos ...
 - PoS: DET NOUN VERB ADJ NOUN ADP DET ADJ NOUN NOUN PUNCT PROPN PUNCT AUX VERB ...
sentence 2:
 - words: In the algorithm , neural networks were applied to relate the concentrations of components to ...
 - stems: in the algorithm , neural network were appli to relat the concentr of compon to ...
 - PoS: ADP DET NOUN PUNCT ADJ NOUN AUX VERB PART VERB DET NOUN ADP NOUN ADP ...
sentence 3:
 - words: The NNFP algorithm is compared with the classical theoretical correction models , including the fundamental ...
 - stems: the nnfp algorithm is com

# TopicRank() : 
TopicRank relies on a graph-based topical representation of the input document, and uses a random walk algorithm derived from PageRank to estimate the importance of each topic (node). The most representative phrase candidates belonging to the highest-scored topics are then selected as keyphrases.

In [14]:
extractor = pke.unsupervised.TopicRank()
extractor.load_document(input=text , language="en")       
extractor.candidate_selection()             
extractor.candidate_weighting()             
keyphrases = extractor.get_n_best(n=15)

In [15]:
for score , candidates in  enumerate(keyphrases):
    print (score , candidates)

0 ('fundamental parameters', 0.12424600562881949)
1 ('classical theoretical correction models', 0.10407674071492379)
2 ('nnfp', 0.10318444927545534)
3 ('algorithm', 0.09999106256213616)
4 ('neural networks', 0.07976057504649132)
5 ('prediction errors', 0.06064748574382946)
6 ('lachance', 0.052995725482185424)
7 ('hyperbolic function model', 0.051823188988597356)
8 ('serious matrix effects', 0.033542483598842696)
9 ('components', 0.031531292689163604)
10 ('concentrations', 0.030447059704182322)
11 ('lower', 0.030073298040601673)
12 ('measured intensities', 0.02982044836046606)
13 ('relative theoretical intensities', 0.02919869464112776)
14 ('alloy system', 0.027918633694592743)


In [16]:
for i, sentence in enumerate(extractor.sentences):
    
    # print out the sentence id, its tokens, its stems and the corresponding Part-of-Speech tags
    print("sentence {}:".format(i))
    print(" - words: {} ...".format(' '.join(sentence.words[:15])))
    print(" - stems: {} ...".format(' '.join(sentence.stems[:15])))
    print(" - PoS: {} ...".format(' '.join(sentence.pos[:15])))

sentence 0:
 - words: An algorithm combining neural networks with fundamental parameters . ...
 - stems: an algorithm combin neural network with fundament paramet . ...
 - PoS: DET NOUN VERB ADJ NOUN ADP ADJ NOUN PUNCT ...
sentence 1:
 - words: An algorithm combining neural networks with the fundamental parameters equations ( NNFP ) is proposed ...
 - stems: an algorithm combin neural network with the fundament paramet equat ( nnfp ) is propos ...
 - PoS: DET NOUN VERB ADJ NOUN ADP DET ADJ NOUN NOUN PUNCT PROPN PUNCT AUX VERB ...
sentence 2:
 - words: In the algorithm , neural networks were applied to relate the concentrations of components to ...
 - stems: in the algorithm , neural network were appli to relat the concentr of compon to ...
 - PoS: ADP DET NOUN PUNCT ADJ NOUN AUX VERB PART VERB DET NOUN ADP NOUN ADP ...
sentence 3:
 - words: The NNFP algorithm is compared with the classical theoretical correction models , including the fundamental ...
 - stems: the nnfp algorithm is com

In [17]:
for i, candidate in enumerate(extractor.candidates):
    
    # print out the candidate id, its stemmed form 
    print("candidate {}: {} (stemmed form)".format(i, candidate))
    
    # print out the surface forms of the candidate
    print(" - surface forms:", [ " ".join(u) for u in extractor.candidates[candidate].surface_forms])
    
    # print out the corresponding offsets
    print(" - offsets:", extractor.candidates[candidate].offsets)
    
    # print out the corresponding sentence ids
    print(" - sentence_ids:", extractor.candidates[candidate].sentence_ids)
    
    # print out the corresponding PoS patterns
    print(" - pos_patterns:", extractor.candidates[candidate].pos_patterns)

candidate 0: algorithm (stemmed form)
 - surface forms: ['algorithm', 'algorithm', 'algorithm']
 - offsets: [1, 10, 42]
 - sentence_ids: [0, 1, 2]
 - pos_patterns: [['NOUN'], ['NOUN'], ['NOUN']]
candidate 1: neural network (stemmed form)
 - surface forms: ['neural networks', 'neural networks', 'neural networks', 'neural networks']
 - offsets: [3, 12, 44, 217]
 - sentence_ids: [0, 1, 2, 6]
 - pos_patterns: [['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN']]
candidate 2: fundament paramet (stemmed form)
 - surface forms: ['fundamental parameters', 'fundamental parameters', 'fundamental parameters', 'fundamental parameters']
 - offsets: [6, 85, 130, 191]
 - sentence_ids: [0, 3, 4, 5]
 - pos_patterns: [['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN']]
candidate 3: fundament paramet equat (stemmed form)
 - surface forms: ['fundamental parameters equations', 'fundamental parameter equations']
 - offsets: [16, 67]
 - sentence_ids: [1, 2]
 - pos_patterns: [['

In [18]:


def tokenize_sentences(text):
    sentences = [sent_tokenize(text)]
    sentences = [y for x in sentences for y in x]
    # Remove any short sentences less than 20 letters.
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
    return sentences

def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)

    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values
    return keyword_sentences

sentences = tokenize_sentences(summarized_text)
keyword_sentence_mapping = get_sentences_for_keyword(filtered_keys, sentences)
        
print (keyword_sentence_mapping)

NameError: name 'summarized_text' is not defined