# NER DATA Fetch

Recognize named entities in text using named-entity recognition (NER) and fetch the corresponding Wikipedia articles for them.

<h3>NER methods</h3>

In [None]:
import nltk

nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from pattern.text.en import singularize

# stop_words = set(stopwords.words('english'))

def get_pos(raw_text, pos_list):
    """Return the tokens of ``raw_text`` whose tag is listed in ``pos_list``.

    ``raw_text`` is split into sentences with ``sent_tokenize`` and each
    sentence is handed to ``tag_sentence``. For every item that call yields,
    element 0 is kept when element 1 is found in ``pos_list``. Results are
    returned as a list in the order they were encountered.

    NOTE(review): ``tag_sentence`` is not defined in the visible part of this
    notebook; presumably it returns ``(token, tag)`` pairs -- confirm.
    """
    return [
        pair[0]
        for sent in sent_tokenize(raw_text)
        for pair in tag_sentence(sent)
        if pair[1] in pos_list
    ]

In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import wikipedia
from wikipedia import PageError

# Point the `wikipedia` client at the English Wikipedia MediaWiki endpoint over
# HTTPS. A bare 'https://' has no host, so every request made through the
# library would fail before reaching Wikipedia.
wikipedia.API_URL = 'https://en.wikipedia.org/w/api.php'

# Load spaCy's small English pipeline once; later cells reuse it as `nlp`.
nlp = en_core_web_sm.load()

2021-12-19 00:37:48.303368: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-19 00:37:48.303410: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [12]:
# Run the spaCy pipeline on a sample sentence.
doc = nlp("Can you believe Pakistan beat Sri Lanka?")
# Entity labels that are worth a Wikipedia lookup; entities with any other
# label are ignored by the cells that use this list.
ner_list = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART']
# Filter on the label instead of mapping non-matching entities to "", so the
# printed list contains only (text, label) pairs and no empty placeholders.
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ner_list])

[('Pakistan', 'GPE'), ('Sri Lanka', 'GPE')]


In [None]:
# For every recognized entity with a label of interest, print the entity and
# the first 20 sentences of its Wikipedia summary.
for pos in doc.ents:
    if pos.label_ not in ner_list:
        continue
    print((pos.text, pos.label_))
    try:
        retrieved_summary = wikipedia.summary(pos.text, sentences=20)
    except (PageError, wikipedia.DisambiguationError):
        # Both "no such page" and "title is ambiguous" are reported the same
        # way, matching the message text.
        print("[wiki_scraper]: Topic does not exist / is ambiguous")
        # Skip the print below: on failure `retrieved_summary` is either
        # unbound or still holds the previous entity's text.
        continue

    print(retrieved_summary, "\n\n")

In [2]:
class KnowledgeSourceController:
    """In-memory cache of Wikipedia summaries keyed by topic string."""

    def __init__(self):
        # Maps topic -> summary text returned by wikipedia.summary.
        self.knowledge_dictionary = {}

    def retrieve_knowledge(self, topic):
        """Return the summary for ``topic``, fetching it on a cache miss.

        A cached topic is returned without any network call. Otherwise the
        first 20 sentences of the Wikipedia summary are fetched, stored under
        ``topic`` and returned.

        Returns:
            The summary text, or ``None`` when Wikipedia has no page for the
            topic or the title is ambiguous.
        """
        if topic in self.knowledge_dictionary:
            return self.knowledge_dictionary[topic]
        try:
            print("fetching data from wikipedia")
            # Query the requested topic itself, not a variable left over from
            # another notebook cell.
            retrieved_summary = wikipedia.summary(topic, sentences=20)
            print("data fetched successfully")
        except (PageError, wikipedia.DisambiguationError):
            print("[wiki_scraper]: Topic does not exist / is ambiguous")
            return None

        self.knowledge_dictionary[topic] = retrieved_summary
        print(f"len: {len(self.knowledge_dictionary)}")
        # Return the fresh summary so a miss and a hit give the same result.
        return retrieved_summary

    def pr(self):
        """Print the whole topic -> summary mapping."""
        print(self.knowledge_dictionary)

    def clear_knowledge(self):
        """Forget every cached summary."""
        # Rebind to a new dict (not a list) so later lookups and inserts in
        # retrieve_knowledge keep working.
        self.knowledge_dictionary = {}

In [3]:
# Create one controller instance; the following cells use it via `ksc`.
ksc = KnowledgeSourceController()

In [None]:
# Request the summary for "Formula One" through the controller.
ksc.retrieve_knowledge("Formula One")

In [None]:
# Request a second topic through the same controller.
ksc.retrieve_knowledge("The Football World Cup")

In [None]:
# Print the controller's stored topic/summary mapping.
ksc.pr()

In [None]:
# Reset the controller's stored knowledge.
ksc.clear_knowledge()

In [None]:
# Print the store again to inspect its state after the reset.
ksc.pr()