# Parsing and denormalization of Wikipedia articles for fragmented indexing in Elasticsearch

In [1]:
# Import dependencies
from elasticsearch import Elasticsearch
import wikipediaapi
from pprint import pprint
import pandas as pd
import numpy as np
import yake
import re
from bs4 import BeautifulSoup
import json

In [2]:
wiki_wiki = wikipediaapi.Wikipedia('en')
client = Elasticsearch("http://localhost:9200")

In [3]:
client.cat.count("wikipedia", params={"format": "json"})

[{'epoch': '1588549463', 'timestamp': '23:44:23', 'count': '28077'}]

In [None]:
# Delete all indices if need to reindex
# client.indices.delete("_all")

# Data preprocessing

### Load all article titles to download from articles_to_download.txt

I already have a list of wikipedia articles (articles.txt) that I will process and index in Elasticsearch. It is a list of dictionaries with wikipedia page ID as keys and the aricle titles as values.

The wikipedia articles to be downloaded belong to the following categories:
```
categories = ['Presidents of the United States', 
              'Marvel Comics', 
              'American comics writers',
              'Marvel Comics editors-in-chief',
              'Machine learning',
              'Natural language processing',
              'Coronaviridae',
              '21st-century American comedians',
              'Pandemics']
```

In [4]:
# Read the artclie key-values pairs
with open('articles_to_download.txt') as json_file:
    articles = json.load(json_file)

### Build a list of dictionaries with section text

- Break down the articles into list of dictionaries, where each dictionary is a section. 
- Build dataframe with columns "level", "text" and "section_title"
- Create new fields "article_title", "main_section", "tags" and "subsection"
- Reconstruct all section titles for nested documents. Wikipedia-API allows to access all sections, but it does not allow easily get all levels of titles for nested articles(e.g., section--> subsection-->sub-subsection)
- Clean dataset: remove redundant columns ("level" and "subsection") and any blank sections with an empty "text" field, (some higher level sections with multiple subsections tend to be empty)

In [5]:
def deconstruct_article(page):
    section_list = []
            
    section_list = [{'level': 0,
                     'section_title': 'Summary',
                     'text': page.summary}]

    def get_sections(sections, level=0):
        for s in sections:
            section_dict = {'level':level,
                            'section_title': s.title, 
                            'text': s.text}
            section_list.append(section_dict)
            get_sections(s.sections, level + 1)

    get_sections(page.sections)
    return section_list

In [6]:
def build_documents(page, section_list):
    
    # Transform list of dictionaries to dataframe
    df = pd.DataFrame(section_list)
    
    # Create column "main_section"
    df['main_section'] = np.nan
    df.loc[df['level']==0, 'main_section'] = df['section_title']
    df['main_section'].fillna(method='ffill', inplace=True)
    
    # Create column "subsection"
    df['subsection'] = np.nan
    df.loc[df['text']=='', 'subsection'] = df['section_title']
    df['subsection'].fillna(method='ffill', inplace=True)
    
    # Add wikipedia article title, source url and page id
    df1 = df.replace(np.nan, '', regex=True)
    df1['article_title']=page.title
    df1['source_url']=page.fullurl
    df1['page_id']=page.pageid
    
    # Create a list of section tags
    df1['tags']=df1.apply(lambda row: [row['article_title'],
                                       row['main_section'], 
                                       row['subsection'], 
                                       row['section_title']],
                          axis=1)
    df1['tags']=df1['tags'].apply(lambda cell: [s for s in cell if s!=""])
    df1['tags']=df1['tags'].apply(lambda cell: list(dict.fromkeys(cell)))
    
    # Drop rows with NaN values (empty sections)
    df2 = df1.replace('', np.nan, regex=True)
    df2 = df2.drop(['level', 'subsection'], axis=1).dropna()
    
    # Transform a list of tags to a comma separated string
    df2['tags']=df2.apply(lambda row: ','.join(row['tags']),axis=1)
    
    # Add tags as the first line of the text field
    df2['text']=df2.apply(lambda row: row['tags']+'\n'+row['text'],axis=1)
    
    # Add number of section withing the article
    df2['section_number']=df2.index
    
    return df2

Build a list of dataframes. Each dataframe on the list has all the sections of an individual article.

In [7]:
article_df_list = []

for key, value in articles.items():
    try:
        page = wiki_wiki.page(value)
        sections = deconstruct_article(page)
        document_df = build_documents(page, sections)
        article_df_list.append(document_df)
    except Exception as error:
        print(f"Something went wrong when loading {page.pageid} article",error)
        pass

Something went wrong when loading -1 article 'fullurl'
Something went wrong when loading -1 article 'fullurl'
Something went wrong when loading -1 article 'fullurl'


In [None]:
# concatenate all dataframes with individual articles into a single dataframe
final_df = pd.concat(article_df_list)
final_df.shape

In [None]:
# save 
final_df.to_pickle("data/articles.pkl")

In [None]:
unpickled_df = pd.read_pickle("./articles.pkl")

In [None]:
unpickled_df.head()

### Load articles from pickle file

In [None]:
preprocessed_articles = unpickled_df.to_dict('records')

In [None]:
preprocessed_articles[0]

# Adding other fields to the documents

### Add keywords with YAKE

In [None]:
unpickled_df.head()

In [None]:
kw_extractor = yake.KeywordExtractor()
def extract_keywords(text):
    keywords = kw_extractor.extract_keywords(text)
    kw = ','.join([kw[0] for kw in keywords])
    return kw  

In [None]:
unpickled_df['keywords']=''
unpickled_df['keywords'] = unpickled_df.apply(lambda x: extract_keywords(x['text']) if len(x['text'])>1000 else "", axis=1)
unpickled_df.head()

In [None]:
unpickled_df.to_pickle("./articles_with_keywords.pkl")

### Add all wikipedia categories (optional)

In [None]:
# categores
def print_categories(page):
    category_list = []
    categories = page.categories
    for title in sorted(categories.keys()):
        if 'articles' not in str(categories[title]) and 'pages' not in str(categories[title]):
            category_list.append(str(categories[title]).split(':')[1].split('(i')[0].strip())
    return category_list

categories = print_categories(page)

### Populate ElasticSearch database

In [None]:
articles_with_kw = unpickled_df.to_dict('records')
articles_with_kw[0]

In [None]:
for item in articles_with_kw:
    try:
        client.index(index='wikipedia', body=item)
    except Exception as error:
        pageid = item['page_id']
        print(f"Something went wrong with {pageid}")
    

## Update all documents in elasticsearch

Add new field to every document

In [None]:
client.indices.get_mapping('wikipedia')

In [None]:
client.indices.put_mapping(index="wikipedia", body= {"properties": {"keywords": {"type": "text"}}})

In [None]:
query_by_id = {"size": 100, "query": {"term": {"page_id": 20966}}}

# get a response using the Search API
response = client.search(index="wikipedia", body=query_by_id)
documents = response['hits']['hits']
documents[0]

In [None]:
kw_extractor = yake.KeywordExtractor()

In [None]:
# iterate over the list of documents

source_to_update = {"_source" : {"keywords": kw}}
for num, doc_id in enumerate(documents):
    
    if len(doc_id["_source"]["text"]) > 1000:
        keywords = kw_extractor.extract_keywords(doc_id["_source"]["text"])
        kw = ','.join([kw[0] for kw in keywords])
        title = doc_id["_source"]["section_title"]
        l = len(doc_id["_source"]["text"])
        print(f"Result {num}: {title} of length {l}")
        print(type(kw))
    
    # catch API errors
        try:
            # call the Update method
            response = client.update(index="wikipedia", doc_type="_doc", id=doc_id["_id"], body=source_to_update)
            print("success!")
        except Exception as error:
            print("something went wrong", error)

Find keywords with YAKE
-----------------------------------

In [None]:
def add_keywords(article_sections):
    for s in article_sections:
        keywords = kw_extractor.extract_keywords(s['text'])
        kw = ','.join([kw[0] for kw in keywords])
        s['keywords']=kw
    return article_sections

sections_with_kw = add_keywords(article_sections)

In [None]:
len(sections_with_kw)
pprint(sections_with_kw)

In [None]:
for s in article_sections:
    keywords = kw_extractor.extract_keywords(s['text'])
    kw = ','.join([kw[0] for kw in keywords])
    s['keywords']=kw

In [None]:
pprint(article_sections[0].keys())

## Simple search query

In [None]:
question = "When Turing test was proposed?"

body = {    
    "query": {
        "bool" : {
          "must_not" : [
            {"term" : { "section_title" : "lists" }},
            {"term" : { "section_title" : "links" }},
            {"term" : { "section_title" : "other" }},
            {"term" : { "section_title" : "see also" }},
            {"term" : { "section_title" : "bibliography" }},
            {"term" : { "section_title" : "references" }},
            {"term" : { "section_title" : "official" }},
            {"match" : { "tags" : "see also" }},
              
          ],
          "should": [
              {"multi_match" : {"query":question, 
                                "fields": [ "keywords^3", "text" ] }},
#               {"range" : {
#                 "section_number" : {
#                 "gte" : 0,
#                 "lte" : 3,
#                 "boost" : 5
#             }}}
          ]
        }
      },
    "highlight": {
        "fields": {
            "text": {"number_of_fragments": 3, 'order': "score", "fragment_size": 512}
            }
        }
    
}
docs = client.search(body, index="")

In [None]:
print(f"Question: {question}")
print("")
print("Search results:")
print("----------------------")

for i, doc in enumerate(docs["hits"]["hits"]):
    title = doc['_source']['article_title']
    section_title = doc['_source']['section_title']
    score = doc['_score']
    snippet = doc['highlight']['text'][0]
    snippet_soup = BeautifulSoup(snippet)
    print(f'Result {i}: {title} | {section_title} | Relevance score {score}')
    print(snippet_soup.get_text())
    print("")

*string_query* returns documents based on a provided query string, using a parser with a strict syntax.

In [None]:
question = "Barack Obama AND Michelle Obama"

body = {
    "query": {
        "query_string" : {
            "query" : question,
            "default_field" : "text"
        }
    },
    "highlight": {
        "fields": {
            "text": {"number_of_fragments": 3, 'order': "score", "fragment_size": 512}
        }
    }
}
docs = client.search(body, index="")      

In [None]:
print(f"Question: {question}")
print("")
print("Search results:")
print("----------------------")

for i, doc in enumerate(docs["hits"]["hits"]):
    title = doc['_source']['article_title']
    section_title = doc['_source']['section_title']
    score = doc['_score']
    snippet = doc['highlight']['text'][0]
    snippet_soup = BeautifulSoup(snippet)
    print(f'Result {i}: {title} | {section_title} | Relevance score {score}')
    print(snippet_soup.get_text())
    print("")

Returns documents based on a provided query string, using a parser with a limited but fault-tolerant syntax.

In [None]:
question = "Who is 'Stan Lee'?"

body = {
  "query": {
    "simple_query_string" : {
        "query": question,
        "fields": ["keywords^5", "text"],
#         "default_operator": "and",
        "auto_generate_synonyms_phrase_query" : False
    }
  },
    "highlight": {
        "fields": {
            "text": {"number_of_fragments": 3, 'order': "score", "fragment_size": 512}
        }
    }
}
docs = client.search(body, index="") 

In [None]:
print(f"Question: {question}")
print("")
print("Search results:")
print("----------------------")

for i, doc in enumerate(docs["hits"]["hits"]):
    title = doc['_source']['article_title']
    section_title = doc['_source']['section_title']
    score = doc['_score']
    snippet = doc['highlight']['text'][0]
    snippet_soup = BeautifulSoup(snippet)
    print(f'Result {i}: {title} | {section_title} | Relevance score {score}')
    print(snippet_soup.get_text())
    print("")