In [1]:
import os
import json
import sys
from elasticsearch import Elasticsearch
import wikipediaapi
from slugify import slugify
from pprint import pprint

In [2]:
# Connect to Elasticsearch. Set the ELASTICSEARCH_URL environment variable to
# target a different cluster; the default is the local development node.
client = Elasticsearch(os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200"))

## Working with _indices API

In [3]:
# Check database for existing indices.
# The index is passed by keyword: newer client releases no longer accept it positionally.
client.indices.get_alias(index="_all")

{'natural-language-processing': {'aliases': {}},
 'marvel-comics': {'aliases': {}},
 'pandemics': {'aliases': {}},
 'presidents-of-the-united-states': {'aliases': {}},
 'coronaviridae': {'aliases': {}},
 'machine-learning': {'aliases': {}},
 '21st-century-american-comedians': {'aliases': {}},
 'american-comics-writers': {'aliases': {}},
 'marvel-comics-editors-in-chief': {'aliases': {}}}

In [4]:
# Check the mapping of an index that was created with an explicit (nested) mapping.
pprint(client.indices.get_mapping(index="coronaviridae"))

{'coronaviridae': {'mappings': {'properties': {'page_id': {'type': 'long'},
                                               'source': {'type': 'text'},
                                               'text': {'properties': {'section_content': {'type': 'text'},
                                                                       'section_num': {'type': 'integer'},
                                                                       'section_title': {'type': 'text'}},
                                                        'type': 'nested'},
                                               'title': {'type': 'text'}}}}}


In [5]:
# Check the mapping of an index that was created without an explicit mapping.
pprint(client.indices.get_mapping(index="american-comics-writers"))

{'american-comics-writers': {'mappings': {'properties': {'page_id': {'type': 'long'},
                                                         'source': {'fields': {'keyword': {'ignore_above': 256,
                                                                                           'type': 'keyword'}},
                                                                    'type': 'text'},
                                                         'text': {'fields': {'keyword': {'ignore_above': 256,
                                                                                         'type': 'keyword'}},
                                                                  'type': 'text'},
                                                         'title': {'fields': {'keyword': {'ignore_above': 256,
                                                                                          'type': 'keyword'}},
                                                                   'type': 'text

In [None]:
# Delete the specified index together with all of its documents.
# `index` can also be a list of index names.
# ignore_unavailable=True keeps the cell re-runnable once the index is already gone.
client.indices.delete(index="american-comics-writers", ignore_unavailable=True)

In [None]:
# List the remaining indices.
client.indices.get_alias(index="_all")

In [None]:
# Several indices can be removed in one call by passing a list of names.
# ignore_unavailable=True keeps the cell re-runnable after the indices are gone.
client.indices.delete(
    index=["natural-language-processing", "presidents-of-the-united-states"],
    ignore_unavailable=True,
)
client.indices.get_alias(index="_all")

In [None]:
# WARNING: "_all" targets every index, so this wipes the whole database.
client.indices.delete(index="_all")
client.indices.get_alias(index="_all")

## Create new indices and documents

Indices = collections of documents. Indices can be created before the documents are added to them.

### Collect Wikipedia articles
We pull all Wikipedia articles from selected categories to create our database of articles in Elasticsearch.

In [None]:
# wiki_wiki = wikipediaapi.Wikipedia('en')

In [None]:
# def print_categorymembers(categorymembers, level=0, max_level=2):
#         for c in categorymembers.values():
#             print("%s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
#             if c.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
#                 print_categorymembers(c.categorymembers, level=level + 1, max_level=max_level)


# cat = wiki_wiki.page(f"Category:{'Pandemics'}")
# print_categorymembers(cat.categorymembers)

### Let's create a few collections (indices) with default mapping

In this case the mapping does not have to be specified. Elasticsearch will automatically assign data types to the document fields and will choose the number of shards in the settings.

In [None]:
# Wikipedia categories whose articles go into indices with the default mapping.
categories = [
    "Presidents of the United States",
    "Marvel Comics",
    "American comics writers",
    "Marvel Comics editors-in-chief",
]

In [None]:
# find wiki articles in certain category
# if category exists, create index with mapping (also need a more flexible function)
# Take the wiki articles from category and build document (create class)
# Insert document into the index


class Document:
    """A Wikipedia article staged for indexing into Elasticsearch.

    A fresh instance starts empty; ``insert`` fills the attributes and sends
    the document to the module-level ``client``.
    """

    def __init__(self):
        self.title = ''
        self.page_id = None
        self.source = ''
        self.text = ''

    def if_exists(self, page_id, index=""):
        """Return how many indexed documents match ``page_id``.

        Args:
            page_id: Wikipedia page id to look for.
            index: Index name handed to ``client.search`` unchanged. With the
                default empty string no index is named (presumably the search
                then spans every index; confirm for the client version in use).

        Returns:
            The hit count reported under ``hits.total.value`` in the response.
        """
        query = {"query": {"match": {"page_id": page_id}}}
        response = client.search(index=index, body=query)
        return response['hits']['total']['value']

    def insert(self, title, page_id, url, text, index):
        """Store the article in ``index`` unless its ``page_id`` is already indexed.

        Args:
            title: Article title.
            page_id: Wikipedia page id, used for the duplicate check.
            url: Source URL of the article.
            text: Article text (any value the target mapping accepts).
            index: Name of the index that receives the document.
        """
        self.title = title
        self.page_id = page_id
        self.source = url
        self.text = text
        self.body = {'title': self.title,
                     'page_id': self.page_id,
                     'source': self.source,
                     'text': self.text}

        # The duplicate check is a method, so it must be reached through `self`.
        if self.if_exists(self.page_id) == 0:
            try:
                client.index(index=index, body=self.body)
                print(f"Sucess! The article {self.title} was added to index {index}")
            except Exception as error:
                # Best effort: report the failure and carry on with the next article.
                print("Something went wrong", error)
        else:
            print("This article is already in the database")


In [None]:
def simple_wiki_doc(category, max_members=1):
    """Index Wikipedia articles from one or more categories using default mappings.

    Each category's documents go into an index named ``slugify(category)``.

    Args:
        category: A category name, or a list/tuple of category names.
        max_members: How many members of each category to examine. The default
            of 1 looks only at the first member; pass ``None`` to walk every
            member of the category.
    """
    category_names = category if isinstance(category, (list, tuple)) else [category]

    wiki_wiki = wikipediaapi.Wikipedia('en')

    for c in category_names:

        cat = wiki_wiki.page(f"Category:{c}")

        for seen, key in enumerate(cat.categorymembers.keys()):
            if max_members is not None and seen >= max_members:
                break

            page = wiki_wiki.page(key)

            # Sub-categories are listed as members too; only real articles are indexed.
            if "Category:" not in page.title:
                doc = Document()
                doc.insert(page.title, page.pageid, page.fullurl, page.text, index=slugify(c))


simple_wiki_doc(categories)

# Create a new index for nested data


In [None]:
# Explicit mapping: the article body is stored as a list of nested section objects.
mapping = {
    "properties": {
        "text": {
            "type": "nested",
            "properties": {
                "section_num": {"type": "integer"},
                "section_title": {"type": "text"},
                "section_content": {"type": "text"},
            },
        },
        "title": {"type": "text"},
        "source": {"type": "text"},
        "page_id": {"type": "long"},
    }
}

## Customizing indexing  - mappings

The Qary chatbot uses a BERT or ALBERT model to generate answers from context.
For it to perform better, we want to give it only shorter, more relevant context. Storing articles in sections may improve performance.

To store Wikipedia articles in sections, we define the text field as a nested datatype:

In [None]:
# Wikipedia categories whose articles are stored section by section.
categories2 = [
    "Machine learning",
    "Natural language processing",
    "Coronaviridae",
    "21st-century American comedians",
    "Pandemics",
]

In [None]:
# Mapping for the sectioned indices: `text` is a nested list of section objects,
# while the remaining fields are plain scalars.
mapping = {
    "properties": {
        "text": {
            "type": "nested",
            "properties": {
                "section_num": {"type": "integer"},
                "section_title": {"type": "text"},
                "section_content": {"type": "text"},
            },
        },
        "title": {"type": "text"},
        "source": {"type": "text"},
        "page_id": {"type": "long"},
    }
}


### Create some new indices with nested data structure

In [None]:
def parse_article(article):
    """Split a Wikipedia article into a list of section dictionaries.

    The article text is cut at each top-level section title, working from the
    last title to the first so that every cut only removes the tail of the
    remaining text. A title is used only when ``"\\n\\n<title>"`` occurs exactly
    once in the remaining text; otherwise that section is skipped and its text
    stays inside the preceding section.

    Args:
        article: Object exposing ``text``, ``summary`` and ``sections`` (each
            section having a ``title``).

    Returns:
        A list of dicts with the keys ``section_num``, ``section_title`` and
        ``section_content``. The first entry is the summary (number 0); the
        remaining entries follow in document order.
    """
    remaining = article.text
    titles = [sec.title for sec in article.sections]

    found = []
    for num in range(len(titles), 0, -1):
        title = titles[num - 1]
        parts = remaining.split(f"\n\n{title}")
        if len(parts) != 2:
            # Title missing or ambiguous: leave the text untouched.
            continue
        remaining, content = parts
        found.append({"section_num": num,
                      "section_title": title,
                      "section_content": content})

    # The summary is a single section object, matching the shape of the others.
    summary = {"section_num": 0,
               "section_title": "Summary",
               "section_content": article.summary}

    # Sections were collected back to front; return them in reading order.
    return [summary] + found[::-1]

In [None]:
def search_insert_wiki(category, mapping):
    """Create one index per category and fill it with sectioned Wikipedia articles.

    For every category an index named ``slugify(category)`` is created with the
    given mapping. Each article in the category is then split into sections by
    ``parse_article`` and inserted through ``Document.insert``.

    Args:
        category: A category name, or a list/tuple of category names.
        mapping: Value sent as ``mappings`` in the index-creation request body.
    """
    category_names = category if isinstance(category, (list, tuple)) else [category]

    wiki_wiki = wikipediaapi.Wikipedia('en')

    for c in category_names:
        index_name = slugify(c)

        # Create an empty index with the predefined data structure.
        try:
            client.indices.create(index=index_name, body={"mappings": mapping})
            print(f"Index {c} has been created")
        except Exception as error:
            # Best effort: report the failure (for example when the index
            # already exists) and still try to insert the articles.
            print("something went wrong", error)

        # Access the list of Wikipedia articles in category c.
        cat = wiki_wiki.page(f"Category:{c}")

        for key in cat.categorymembers.keys():
            page = wiki_wiki.page(key)

            # Sub-categories are listed as members too; only real articles are indexed.
            if "Category:" not in page.title:
                # Build the list of section dicts and add the document to the index.
                text = parse_article(page)
                doc = Document()
                doc.insert(page.title, page.pageid, page.fullurl, text, index=index_name)


search_insert_wiki(categories2, mapping)