In [1]:
# Import dependencies
from elasticsearch import Elasticsearch
import wikipediaapi
from slugify import slugify
from pprint import pprint

In [15]:
client = Elasticsearch("localhost:9200")

In [17]:
client.indices.get_alias("_all")

{'pandemics': {'aliases': {}},
 'american-science-fiction-television-series': {'aliases': {}},
 'coronaviridae': {'aliases': {}},
 'natural-language-processing': {'aliases': {}},
 'american-comics-writers': {'aliases': {}},
 'marvel-comics': {'aliases': {}},
 'marvel-comics-editors-in-chief': {'aliases': {}},
 'presidents-of-the-united-states': {'aliases': {}},
 'machine-learning': {'aliases': {}},
 '21st-century-american-comedians': {'aliases': {}}}

In [18]:
client.indices.delete('_all')
client.indices.get_alias("_all")

{}

In [None]:
mapping = {
    "properties": {
        
            "text": {
                "type": "nested",
                "properties":{
                    "section_num": {"type":"integer"},
                    "section_title": {"type":"text"},
                    "section_content": {"type":"text"}
                }
            },
        
            "title": {
                "type": "text"
            },
        
            "source": {
                "type": "text"
            },
        
            "page_id": {
                "type": "long"
            },
            
        }
    }

In [None]:
categories = ['Presidents of the United States', 
              'Marvel Comics', 
              'American comics writers',
              'Marvel Comics editors-in-chief']

In [None]:
# Load wikipedia articles from the following categories:
categories2 = ['Machine learning',
              'Natural language processing',
              'Coronaviridae',
              '21st-century American comedians',
              'Pandemics']

In [None]:
class Document:
    
    def __init__(self):
        self.title = ''
        self.page_id = None
        self.source = ''
        self.text = ''
        
    def __if_exists(self, page_id, index=""):
        '''
        A private method to check if the article already exists in the database
        with a goal to avoid duplication
        '''
        
        return client.search(index=index, 
                             body={"query": 
                                   {"match": 
                                    {"page_id": page_id}
                                   }})['hits']['total']['value']
        
    def insert(self, title, page_id, url, text, index):
        ''' Add a new document to the index'''
        
        self.title=title
        self.page_id=page_id
        self.source=url
        self.text=text
        self.body = {'title': self.title,
            'page_id': self.page_id,
            'source':self.source,
            'text': self.text}
        
        if self.__if_exists(page_id) == 0:
        
            try:
                client.index(index=index, body=self.body)
#                 print(f"Sucess! The article {self.title} was added to index {index}")
            except error:
                print("Something went wrong", error)
                
        else:
            print(f"Article {self.title} is already in the database")

In [None]:
def parse_article(article):
    ''' Parce wikipedia articles from the full article text'''
    
    text = article.text
    # get section titles for the existing sections
    section_titles = [sec.title for sec in article.sections]
    
    # initiate the sections dictionary with a summary (0th section) 
    sections = [{'section_num': 0},
                {'section_title': "Summary"},
                {'section_content': article.summary}]
    
    for i, title in enumerate(section_titles[::-1]):

        num = len(section_titles)-i
        if len(text.split(f"\n\n{title}")) == 2:
            section_dict = {"section_num": num,
                            "section_title": title,
                            "section_content": text.split(f"\n\n{title}")[-1]}
            sections.append(section_dict)
            text = text.split(f"\n\n{title}")[0]
        else:
            pass
            
        
    return sections

In [None]:
def search_insert_wiki(category, mapping):
    
    if type(category) is not list: category = [ category ]

    wiki_wiki = wikipediaapi.Wikipedia('en')
    
    for c in category:
        
        try:
                    
            '''Create and empty index with predefined data structure'''
            client.indices.create(index=slugify(c), body={"mappings":mapping})
            
            '''Access the list of wikipedia articles in category c'''
            cat = wiki_wiki.page(f"Category:{c}")
            
            ''' Parse and add articles in the category to database'''
            for key in cat.categorymembers.keys():
                page = wiki_wiki.page(key)

                if not "Category:" in page.title:

                    text = parse_article(page)
                    doc = Document()
                    doc.insert(page.title, page.pageid, page.fullurl, text, index=slugify(c))


        except error:
            '''Skip category if it alredy exists in indices'''
            print("Something went wrong", error)
            
search_insert_wiki(categories, mapping)

In [None]:
simple_wiki_doc('Coronaviridae')

client.indices.get_alias("_all")

In [None]:
client.cat.count('coronaviridae', params={"format": "json"})

In [4]:
body = {
    'size': 20,
    'query': {
        'match_all': 'Stephen Colbert'
    }}

In [5]:
res = client.search(index = '', body = body )

RequestError: RequestError(400, 'parsing_exception', '[match_all] query malformed, no start_object after query name')

In [None]:
for i, r in enumerate(res['hits']['hits'], start = 1):
    print(i, '--', r['_source']['title'])