# Parsing and denormalization of Wikipedia articles for fragmented indexing in Elasticsearch

 TODO:
 
 - Add categories (DONE)
 - Add links (MAYBE NOT)
 - Add keywords (DONE)

In [1]:
# Import dependencies
from elasticsearch import Elasticsearch
import wikipediaapi
from slugify import slugify
from pprint import pprint
import pandas as pd
import numpy as np
import yake
import re

### Load general information about an article (title, url, categories, etc)

In [2]:
# Fetch the English Wikipedia article used as the running example.
wiki_wiki = wikipediaapi.Wikipedia('en')
page = wiki_wiki.page('Barack Obama')

# Article-level metadata reused by the cells below.
url = page.fullurl
title = page.title
page_id = page.pageid
summary = page.summary

In [3]:
# categories
def print_categories(page):
    """Return the names of the content categories ``page`` belongs to.

    Categories whose title mentions "articles" or "pages" (Wikipedia
    maintenance categories) are left out. The result is ordered by the
    category title.

    Parameters
    ----------
    page : object exposing a ``categories`` mapping keyed by category title
        (presumably ``"Category:<name>"`` for wikipediaapi pages -- verify
        against the library version in use).

    Returns
    -------
    list of str
        Category names with the namespace prefix removed.
    """
    category_list = []
    for title in sorted(page.categories.keys()):
        if 'articles' in title or 'pages' in title:
            continue
        # Strip only the leading namespace ("Category:"). Splitting the string
        # representation on every ':' or on '(i' used to truncate names such
        # as "Star Trek: The Next Generation" or "Foo (incumbent)".
        category_list.append(title.split(':', 1)[-1].strip())
    return category_list

categories = print_categories(page)

In [4]:
# links
def print_links(page):
    """Print each page linked from ``page`` as ``<title>: <page>``, sorted by title."""
    linked_pages = page.links
    for name in sorted(linked_pages):
        print(f"{name}: {linked_pages[name]}")

print_links(page)

14th Dalai Lama: 14th Dalai Lama (id: ??, ns: 0)
1828 United States presidential election: 1828 United States presidential election (id: ??, ns: 0)
1832 Democratic National Convention: 1832 Democratic National Convention (id: ??, ns: 0)
1835 Democratic National Convention: 1835 Democratic National Convention (id: ??, ns: 0)
1840 Democratic National Convention: 1840 Democratic National Convention (id: ??, ns: 0)
1844 Democratic National Convention: 1844 Democratic National Convention (id: ??, ns: 0)
1848 Democratic National Convention: 1848 Democratic National Convention (id: ??, ns: 0)
1852 Democratic National Convention: 1852 Democratic National Convention (id: ??, ns: 0)
1856 Democratic National Convention: 1856 Democratic National Convention (id: ??, ns: 0)
1860 Democratic National Conventions: 1860 Democratic National Conventions (id: ??, ns: 0)
1864 Democratic National Convention: 1864 Democratic National Convention (id: ??, ns: 0)
1868 Democratic National Convention: 1868 Democra

You (Time Person of the Year): You (Time Person of the Year) (id: ??, ns: 0)
You didn't build that: You didn't build that (id: ??, ns: 0)
YouTube: YouTube (id: ??, ns: 0)
Young Democrats of America: Young Democrats of America (id: ??, ns: 0)
Yuri Andropov: Yuri Andropov (id: ??, ns: 0)
Zachary Taylor: Zachary Taylor (id: ??, ns: 0)
Zionism: Zionism (id: ??, ns: 0)
Élie Ducommun: Élie Ducommun (id: ??, ns: 0)
Óscar Arias: Óscar Arias (id: ??, ns: 0)


In [5]:
section_list = [{'level': 0,
                 'section_title': 'Summary',
                'text': page.summary}]

### Build a list of dictionaries with section text

In [6]:
def print_sections(sections, level=0, collected=None):
    """Walk a section tree depth-first, printing and recording every section.

    For each section a line ``<level> <title>`` is printed and a dictionary
    with the keys ``level``, ``section_title`` and ``text`` is appended to the
    accumulator, parents before their children.

    Parameters
    ----------
    sections : iterable of objects with ``title``, ``text`` and ``sections``
    level : int
        Nesting depth assigned to the items of ``sections``.
    collected : list, optional
        List receiving the section dictionaries. When omitted, the
        notebook-level ``section_list`` is used, as before.

    Returns
    -------
    None
        Results are delivered through the accumulator only.
    """
    if collected is None:
        # Backward-compatible default: the notebook-level accumulator.
        collected = section_list
    for s in sections:
        print(level, s.title)
        collected.append({'level': level,
                          'section_title': s.title,
                          'text': s.text})
        # Pass the accumulator down so the whole tree lands in one list.
        print_sections(s.sections, level + 1, collected)
            
print_sections(page.sections)

0 Early life and career
1 Education
1 Family and personal life
2 Religious views
1 Law career
2 Community organizer and Harvard Law School
2 Chicago Law School and civil rights attorney
1 Legislative career
2 Illinois State Senator (1997–2004)
2 2004 U.S. Senate campaign
2 U.S. Senator from Illinois (2005–08)
3 Legislation
3 Committees
0 Presidential campaigns
1 2008
1 2012
0 Presidency (2009–2017)
1 First 100 days
1 Domestic policy
2 LGBT rights
2 White House advisory and oversight groups
2 Economic policy
2 Environmental policy
2 Health care reform
2 Energy policy
2 Gun control
2 2010 midterm elections
2 Cybersecurity and Internet policy
2 Government mass surveillance
1 Foreign policy
2 War in Iraq
2 War in Afghanistan
2 Israel
2 Libya
2 Syrian Civil War
2 Death of Osama bin Laden
2 Iran nuclear talks
2 Relations with Cuba
2 Africa
2 Hiroshima speech
2 Russia
1 Cultural and political image
0 Post-presidency (2017–present)
0 Legacy
1 Presidential library
0 Bibliography
1 Audiobooks
0 

In [7]:
df = pd.DataFrame(section_list)

In [8]:
df['main_section'] = np.nan

In [9]:
# Create column "main_section": the title of the closest preceding level-0 section.
# Assigning the forward-filled result back (instead of calling
# fillna(method='ffill', inplace=True) on a column selection) avoids mutating a
# possibly temporary object and the deprecated `method=` argument.
df['main_section'] = df['section_title'].where(df['level'] == 0).ffill()
# Create column "subsection": the title of the closest preceding section whose own
# text is empty; rows before the first such section stay NaN.
df['subsection'] = df['section_title'].where(df['text'] == '').ffill()

In [10]:
df1 = df.replace(np.nan, '', regex=True)
df1['article_title']=title
df1['source_url']=url
df1['page_id']=page_id
df1

Unnamed: 0,level,section_title,text,main_section,subsection,article_title,source_url,page_id
0,0,Summary,Barack Hussein Obama II ( (listen); born Augus...,Summary,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
1,0,Early life and career,"Obama was born on August 4, 1961, at Kapiolani...",Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
2,1,Education,Obama started in St. Francis Pre-Education fro...,Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
3,1,Family and personal life,"In a 2006 interview, Obama highlighted the div...",Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
4,2,Religious views,Obama is a Protestant Christian whose religiou...,Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
5,1,Law career,,Early life and career,Law career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
6,2,Community organizer and Harvard Law School,"Two years after graduating from Columbia, Obam...",Early life and career,Law career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
7,2,Chicago Law School and civil rights attorney,"In 1991, Obama accepted a two-year position as...",Early life and career,Law career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
8,1,Legislative career,,Early life and career,Legislative career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366
9,2,Illinois State Senator (1997–2004),Obama was elected to the Illinois Senate in 19...,Early life and career,Legislative career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366


In [11]:
# Create list of tags
def _section_tags(row):
    """Return the row's tag list: article title, main section, subsection and
    section title, with empty strings removed and duplicates dropped while
    keeping the first occurrence."""
    candidates = [title, row['main_section'], row['subsection'], row['section_title']]
    return list(dict.fromkeys(tag for tag in candidates if tag != ""))


df1['tags'] = df1.apply(_section_tags, axis=1)

In [12]:
df1.head()

Unnamed: 0,level,section_title,text,main_section,subsection,article_title,source_url,page_id,tags
0,0,Summary,Barack Hussein Obama II ( (listen); born Augus...,Summary,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Summary]"
1,0,Early life and career,"Obama was born on August 4, 1961, at Kapiolani...",Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career]"
2,1,Education,Obama started in St. Francis Pre-Education fro...,Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career, Education]"
3,1,Family and personal life,"In a 2006 interview, Obama highlighted the div...",Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career, Family a..."
4,2,Religious views,Obama is a Protestant Christian whose religiou...,Early life and career,,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career, Religiou..."


In [13]:
# Turn empty strings back into NaN so that dropna() below can discard them
df1 = df1.replace('', np.nan, regex=True)
# Remove the helper columns, then drop every row that still has a NaN in any
# remaining column (e.g. sections whose text is empty)
df1 = df1.drop(['level', 'subsection'], axis=1).dropna()

In [14]:
df1.head()

Unnamed: 0,section_title,text,main_section,article_title,source_url,page_id,tags
0,Summary,Barack Hussein Obama II ( (listen); born Augus...,Summary,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Summary]"
1,Early life and career,"Obama was born on August 4, 1961, at Kapiolani...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career]"
2,Education,Obama started in St. Francis Pre-Education fro...,Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career, Education]"
3,Family and personal life,"In a 2006 interview, Obama highlighted the div...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career, Family a..."
4,Religious views,Obama is a Protestant Christian whose religiou...,Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"[Barack Obama, Early life and career, Religiou..."


In [15]:
# Collapse each tag list into one comma-separated string
df1['tags'] = df1['tags'].apply(','.join)

In [16]:
df1

Unnamed: 0,section_title,text,main_section,article_title,source_url,page_id,tags
0,Summary,Barack Hussein Obama II ( (listen); born Augus...,Summary,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Summary"
1,Early life and career,"Obama was born on August 4, 1961, at Kapiolani...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career"
2,Education,Obama started in St. Francis Pre-Education fro...,Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Education"
3,Family and personal life,"In a 2006 interview, Obama highlighted the div...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Family and ..."
4,Religious views,Obama is a Protestant Christian whose religiou...,Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Religious v..."
6,Community organizer and Harvard Law School,"Two years after graduating from Columbia, Obam...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Law career,..."
7,Chicago Law School and civil rights attorney,"In 1991, Obama accepted a two-year position as...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Law career,..."
9,Illinois State Senator (1997–2004),Obama was elected to the Illinois Senate in 19...,Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Legislative..."
10,2004 U.S. Senate campaign,"In May 2002, Obama commissioned a poll to asse...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Legislative..."
11,U.S. Senator from Illinois (2005–08),"Obama was sworn in as a senator on January 3, ...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Legislative..."


In [17]:
# Prefix every section text with its tag line
df1['text'] = df1['tags'] + '\n' + df1['text']

In [18]:
# Expose the row index as an explicit column
df1['section_number'] = df1.index
df1.head()

Unnamed: 0,section_title,text,main_section,article_title,source_url,page_id,tags,section_number
0,Summary,"Barack Obama,Summary\nBarack Hussein Obama II ...",Summary,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Summary",0
1,Early life and career,"Barack Obama,Early life and career\nObama was ...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career",1
2,Education,"Barack Obama,Early life and career,Education\n...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Education",2
3,Family and personal life,"Barack Obama,Early life and career,Family and ...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Family and ...",3
4,Religious views,"Barack Obama,Early life and career,Religious v...",Early life and career,Barack Obama,https://en.wikipedia.org/wiki/Barack_Obama,534366,"Barack Obama,Early life and career,Religious v...",4


In [19]:
# One dictionary per section row
article_sections = df1.to_dict(orient='records')
pprint(article_sections)

[{'article_title': 'Barack Obama',
  'main_section': 'Summary',
  'page_id': 534366,
  'section_number': 0,
  'section_title': 'Summary',
  'source_url': 'https://en.wikipedia.org/wiki/Barack_Obama',
  'tags': 'Barack Obama,Summary',
  'text': 'Barack Obama,Summary\n'
          'Barack Hussein Obama II ( (listen); born August 4, 1961) is an '
          'American politician and attorney who served as the 44th president '
          'of the United States from 2009 to 2017. A member of the Democratic '
          'Party, Barack Obama was the first African-American president of the '
          'United States. He previously served as a U.S. senator from Illinois '
          'from 2005 to 2008 and an Illinois state senator from 1997 to 2004.\n'
          'Obama was born in Honolulu, Hawaii. After graduating from Columbia '
          'University in 1983, he worked as a community organizer in Chicago. '
          'In 1988, he enrolled in Harvard Law School, where he was the first '
          'bl

          'something wrong." Obama acknowledged his youthful image in an '
          'October 2007 campaign speech, saying: "I wouldn\'t be here if, time '
          'and again, the torch had not been passed to a new generation."Obama '
          'is frequently referred to as an exceptional orator. During his '
          'pre-inauguration transition period and continuing into his '
          'presidency, Obama delivered a series of weekly Internet video '
          'addresses. In his speeches as president, Obama did not make more '
          'overt references to race relations than his predecessors, but '
          'according to one study, he implemented stronger policy action on '
          'behalf of African-Americans than any president since the Nixon '
          'era.\n'
          '\n'
          'According to the Gallup Organization, Obama began his presidency '
          'with a 68% approval rating before gradually declining for the rest '
          'of the year, and eventually bo

In [20]:
len(article_sections)

49

Find keywords with YAKE
-----------------------------------

In [21]:
# Extract keywords from the full article text with YAKE's default settings
kw_extractor = yake.KeywordExtractor()
keywords = kw_extractor.extract_keywords(page.text)

# Each result is a (keyword, score) tuple, as the printed output shows
for kw in keywords:
    print(kw)

('obama', 0.00012440667320019703)
('barack obama', 0.00022069590405960775)
('united states', 0.0002824607447249831)
('president barack obama', 0.0003724295196556915)
('barack hussein obama', 0.0004934914707157536)
('barack obama presidential', 0.000564371559583132)
('president obama', 0.0008834002272245451)
('obama administration', 0.0009201354689969861)
('united states house', 0.0010302525720064836)
('obama announced', 0.001209228520372226)
('obama signed', 0.001235705295424485)
('united states congress', 0.0015241759885816844)
('chicago law school', 0.00154392061663592)
('affordable care act', 0.0016564722972312444)
('president', 0.0017586269402226399)
('act', 0.001937055378598906)
('president obama announced', 0.0019505429522317943)
('president obama signed', 0.0019953673268444504)
('states', 0.002242018622976581)
('united', 0.002521605230718859)


In [28]:
# Attach a comma-separated keyword string to every section dictionary
for section in article_sections:
    extracted = kw_extractor.extract_keywords(section['text'])
    # Keep only the keyword (first tuple item) and discard the score
    section['keywords'] = ','.join(item[0] for item in extracted)

In [30]:
pprint(article_sections)

[{'article_title': 'Barack Obama',
  'keywords': 'barack hussein obama,united states,affordable care act,harvard '
              'law school,chicago law school,obama,illinois state '
              'senator,barack obama,summary,barack hussein,harvard law '
              'review,act,law school,united states elections,care act,harvard '
              'law,states,consumer protection act,barack obama,united,taxpayer '
              'relief acts',
  'main_section': 'Summary',
  'page_id': 534366,
  'section_number': 0,
  'section_title': 'Summary',
  'source_url': 'https://en.wikipedia.org/wiki/Barack_Obama',
  'tags': 'Barack Obama,Summary',
  'text': 'Barack Obama,Summary\n'
          'Barack Hussein Obama II ( (listen); born August 4, 1961) is an '
          'American politician and attorney who served as the 44th president '
          'of the United States from 2009 to 2017. A member of the Democratic '
          'Party, Barack Obama was the first African-American president of the '
    

  'main_section': 'Presidency (2009–2017)',
  'page_id': 534366,
  'section_number': 33,
  'section_title': 'Israel',
  'source_url': 'https://en.wikipedia.org/wiki/Barack_Obama',
  'tags': 'Barack Obama,Presidency (2009–2017),Israel',
  'text': 'Barack Obama,Presidency (2009–2017),Israel\n'
          'In 2011, the United States vetoed a Security Council resolution '
          'condemning Israeli settlements, with the United States being the '
          'only nation to do so. Obama supports the two-state solution to the '
          'Arab–Israeli conflict based on the 1967 borders with land swaps.In '
          'June 2011, Obama said the bond between the United States and Israel '
          'is "unbreakable." During the initial years of the Obama '
          'administration, the U.S. increased military cooperation with '
          'Israel, including increased military aid, re-establishment of the '
          'U.S.-Israeli Joint Political Military Group and the Defense Policy '
         

Create a connection instance:

In [None]:
client = Elasticsearch("http://localhost:9200")

In [None]:
# Check currently available indices
indices = client.indices.get_alias("_all")
index_lst = list(indices.keys())

In [None]:
index_lst

In [None]:
# Count documents in multiple indices
client.cat.count(['wikipedia'], params={"format": "json"})

In [None]:
client.indices.delete("wikipedia")
client.indices.get_alias("_all")

In [None]:
# Count documents in all indices
client.cat.count("_all", params={"format": "json"})

In [None]:
# Check an index mapping - default
pprint(client.indices.get_mapping("coronaviridae"))

In [None]:
class Document:
    """A Wikipedia article prepared for indexing in Elasticsearch.

    The instance keeps the values of the most recent ``insert`` call. All
    Elasticsearch traffic goes through the notebook-level ``client``.
    """

    def __init__(self):
        self.title = ''
        self.page_id = None
        self.source = ''
        self.text = ''
        self.references = ''
        self.body = {}

    def __if_exists(self, page_id, index=""):
        '''
        A private method to check if the article already exists in the database
        with a goal to avoid duplication.

        Returns the number of hits reported for a ``match`` query on
        ``page_id`` in ``index``.
        NOTE(review): ``insert`` calls this without ``index``, so the search
        runs with ``index=""`` -- presumably across all indices; confirm that
        this is the intended scope.
        '''
        response = client.search(index=index,
                                 body={"query":
                                       {"match":
                                        {"page_id": page_id}
                                       }})
        return response['hits']['total']['value']

    def insert(self, title, page_id, url, text, references, index):
        '''Add a new document to the index unless its page_id is already stored.

        Parameters
        ----------
        title, page_id, url : article title, page id and source URL
        text, references : section data stored under "text" and "references"
        index : name of the target index

        Indexing errors are reported with ``print`` rather than raised, so a
        single failing article does not stop a batch run.
        '''
        self.title = title
        self.page_id = page_id
        self.source = url
        self.text = text
        self.references = references
        self.body = {'title': self.title,
                     'page_id': self.page_id,
                     'source': self.source,
                     'text': self.text,
                     'references': self.references}

        if self.__if_exists(page_id) == 0:
            try:
                client.index(index=index, body=self.body)
            except Exception as error:
                # The former `except error:` named an undefined variable, which
                # turned every indexing failure into a NameError.
                print("Something went wrong", error)
        else:
            print(f"Article {self.title} is already in the database")

In [None]:
def _nested_sections():
    """Return a fresh nested-field spec describing a list of article sections."""
    return {
        "type": "nested",
        "properties": {
            "section_num": {"type": "integer"},
            "section_title": {"type": "text"},
            "section_content": {"type": "text"},
        },
    }


# Index mapping: "text" and "references" hold nested section lists with the
# same layout; the remaining fields describe the article as a whole.
mapping = {
    "properties": {
        "text": _nested_sections(),
        "references": _nested_sections(),
        "title": {"type": "text"},
        "source": {"type": "text"},
        "page_id": {"type": "long"},
    }
}

### Create new indices with nested data structure

Some Wikipedia articles in these categories are large and may have multiple levels of subsections. We chose to parse the sections from the full article text instead of walking the sections and subsections exposed by the Wikipedia API, so that all nested dictionaries within a single index have a uniform depth.

In [None]:
wiki_wiki = wikipediaapi.Wikipedia('en')
page = wiki_wiki.page('Artificial intelligence')
page.summary

In [None]:
def parse_article(article):
    '''Parse a Wikipedia article into top-level sections using its full text.

    The full text is cut from the end: for each top-level section title
    (last one first) the text following the title's heading becomes the
    section content and the remaining text is truncated before that heading.
    Subsections therefore stay inside the content of their top-level section.

    Parameters
    ----------
    article : object with ``text``, ``summary`` and ``sections`` (each section
        exposing ``title``), e.g. a wikipediaapi page.

    Returns
    -------
    list of dict
        Dictionaries with the keys ``section_num``, ``section_title`` and
        ``section_content``. The summary comes first as section 0, followed by
        the located sections in reverse article order. Sections whose heading
        cannot be found in the text (or whose title is empty) are omitted.
    '''
    text = article.text
    # get section titles for the existing sections
    section_titles = [sec.title for sec in article.sections]

    # initiate the sections list with a summary (0th section)
    sections = [{'section_num': 0,
                 'section_title': "Summary",
                 'section_content': article.summary}]

    total = len(section_titles)
    for offset, title in enumerate(reversed(section_titles)):
        if not title:
            # An empty title would match any paragraph break.
            continue

        # Match the title only as a complete heading line: preceded by a blank
        # line and followed by a newline or the end of the text. A paragraph or
        # subsection heading that merely starts with the same words no longer
        # causes the section to be dropped.
        heading = re.compile(re.escape(f"\n\n{title}") + r"(?=\n|\Z)")
        matches = list(heading.finditer(text))
        if not matches:
            continue

        # Later sections are already cut off, so the last heading match is the
        # closest candidate for this section's own heading.
        last = matches[-1]
        sections.append({"section_num": total - offset,
                         "section_title": title,
                         "section_content": text[last.end():]})
        text = text[:last.start()]

    return sections

In [None]:
def get_references(mylist):
    """Split parsed sections into article content and reference-like sections.

    A section counts as reference material when its title, compared
    case-insensitively and ignoring surrounding whitespace, is exactly one of
    "See also", "References", "External links", "Bibliography" or "Notes".

    Parameters
    ----------
    mylist : iterable of dict
        Section dictionaries carrying a ``section_title`` key.

    Returns
    -------
    tuple
        ``(content_list, reference_list)``, each keeping the input order.
    """
    # Exact titles instead of a substring test against one long string: the
    # latter also matched fragments such as "ink", "links" or the empty title.
    reference_titles = {'see also', 'references', 'external links',
                        'bibliography', 'notes'}

    reference_list = []
    content_list = []

    for d in mylist:
        if d['section_title'].strip().lower() in reference_titles:
            reference_list.append(d)
        else:
            content_list.append(d)

    return (content_list, reference_list)

In [None]:
parsed_article = parse_article(page)

In [None]:
content, references = get_references(parsed_article)

In [None]:
pprint(content)

In [None]:
def search_insert_wiki(category, mapping):
    """Create one index per Wikipedia category and fill it with its articles.

    For every category an index named ``slugify(category)`` is created with
    ``mapping``; each non-category member page is parsed, split into content
    and references, and inserted through ``Document.insert``.

    Parameters
    ----------
    category : str or iterable of str
        A single category name or several of them.
    mapping : dict
        Index mapping passed as ``{"mappings": mapping}`` on index creation.

    Errors are printed, never raised: a category whose index cannot be created
    (for instance because it already exists) is skipped, and an article that
    fails to parse or insert is skipped without aborting its category.
    """
    # Accept a single name as well as any iterable of names.
    category_names = [category] if isinstance(category, str) else list(category)

    wiki_wiki = wikipediaapi.Wikipedia('en')

    for c in category_names:
        index_name = slugify(c)

        # Create an empty index with the predefined data structure.
        try:
            client.indices.create(index=index_name, body={"mappings": mapping})
        except Exception as error:
            # Skip the category if its index cannot be created (e.g. it already exists).
            print(f"The following exception occured while trying to create index '{index_name}': ", error)
            continue

        # Access the list of Wikipedia articles in category c.
        try:
            member_titles = list(wiki_wiki.page(f"Category:{c}").categorymembers.keys())
        except Exception as error:
            print(f"Could not list the articles of category '{c}': ", error)
            continue

        # Parse and add the articles of the category to the database.
        for key in member_titles:
            try:
                page = wiki_wiki.page(key)

                if "Category:" in page.title:
                    # Sub-categories are not articles.
                    continue

                content, references = get_references(parse_article(page))
                doc = Document()
                doc.insert(page.title, page.pageid, page.fullurl, content, references, index=index_name)
            except Exception as error:
                # One broken article must not abort the rest of the category.
                print(f"Skipping article '{key}' of category '{c}': ", error)
            
            
search_insert_wiki('Machine learning', mapping)

In [None]:
categories = ['Presidents of the United States', 
              'Marvel Comics', 
              'American comics writers',
              'Marvel Comics editors-in-chief',
              'Machine learning',
              'Natural language processing',
              'Coronaviridae',
              '21st-century American comedians',
              'Pandemics',
              'Artificial intelligence'
             ]

In [None]:
client.indices.get_alias("_all")