In [33]:
import re                                            # Regular python module for regex functions.
import json                                          # Output into Json file.
import io
from wikipedia import WikipediaPage                  # Provides some attributes and function to read data from wikipedia. i.e. Titles, Summary, Context, Images.
from wikipedia import DisambiguationError, PageError # Error thrown in case of Disambiguity.

In [3]:
def normalize_passage(text_str):
    """Strip layout and wiki-markup residue from a raw article string.

    The following are removed, in this order:

    * every run of four consecutive spaces,
    * every newline character (nothing is inserted in its place, so text on
      either side of a line break is joined directly),
    * the LaTeX token ``\\displaystyle`` and then any remaining backslash,
    * section-heading markers: ``Edit`` followed by a space and two or more
      ``=``, and any other run of two or more ``=``.

    A lone ``=`` (for example inside a formula) is left untouched.

    Parameters
    ----------
    text_str : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text_str = re.sub(r" {4}", '', text_str)            ## Remove runs of four spaces.
    text_str = re.sub(r"\n", '', text_str)              ## Remove newline chars.
    text_str = re.sub(r"\\displaystyle", '', text_str)  ## Must run before the bare-backslash rule below.
    text_str = re.sub(r"\\", '', text_str)              ## Remove any remaining backslashes.

    ## Heading markers are matched as whole runs of '='. Stripping '==' before
    ## '===' (as was done previously) left a stray '=' on each side of every
    ## level-3 heading, e.g. '=== History ===' became '= History ='.
    text_str = re.sub(r"Edit ={2,}", '', text_str)
    text_str = re.sub(r"={2,}", '', text_str)

    return text_str

In [55]:
def read_from_wiki(titles):
    """Fetch and clean the Wikipedia article text for each given title.

    Parameters
    ----------
    titles : list of str
        Page titles to look up.

    Returns
    -------
    list of dict
        One record per title that was fetched successfully, with the keys
        'Title', 'Passage' (page content run through normalize_passage),
        'Question' (an empty list) and 'Error' (None). Titles that raise
        DisambiguationError or PageError are reported with print() and are
        NOT included in the returned list.
    """
    records = []

    for title in titles:
        ## Template record for this title; only kept if the lookup succeeds.
        record = {'Title': title, 'Passage': '', "Question": [], "Error": None}

        try:
            page = WikipediaPage(title=title)
        except DisambiguationError:
            ## The title matches more than one page.
            failure = "There is Disambigity in the title : " + title + ". Please provide more precise title."
        except PageError:
            ## No page exists under this title.
            failure = "Page id " + title + " does not match any pages. Try another id!"
        else:
            ## Success: store the cleaned main content and keep the record.
            record['Passage'] = normalize_passage(page.content)
            records.append(record)
            continue

        ## Failure path: report the problem and move on to the next title.
        ## NOTE(review): the record carrying 'Error' is never appended, so the
        ## message only reaches stdout -- confirm whether that is intended.
        record['Error'] = failure
        print(title)
        print(failure)

    return records

In [56]:
def to_json(list_of_titles, out_path='./../data/wiki_text.json'):
    """Fetch the passages for the given titles and write them to a JSON file.

    Parameters
    ----------
    list_of_titles : list of str
        Titles handed unchanged to read_from_wiki.
    out_path : str, optional
        Destination file. Defaults to './../data/wiki_text.json', the path
        that was previously hard-coded.
    """
    ## Fetch and serialise BEFORE opening the file. Opening with 'w' truncates
    ## the file immediately, so an exception raised while fetching would
    ## otherwise wipe any previously written output.
    records = read_from_wiki(list_of_titles)
    payload = json.dumps(records, indent=4, separators=(',', ': '))

    with io.open(out_path, 'w', encoding='utf8') as outfile:
        outfile.write(payload)

In [60]:
# Wikipedia page titles to download, one per line. Each string is passed
# unchanged to WikipediaPage(title=...) by read_from_wiki.
# NOTE(review): some entries (e.g. 'Nicola Tesla', 'Fresno California') do not
# look like canonical page titles -- presumably they resolve through
# redirects; verify that none of them is skipped with an error.
topics = [
    'Super Bowl 50',
    'Warsaw',
    'Normans',
    'Nicola Tesla',
    'Computational complexity theory',
    'Teacher',
    'Martin Luther',
    'Southern California',
    'Sky (United Kingdom)',
    'Victoria (Australia)',
    'Huguenot',
    'Steam engine',
    'Oxygen',
    '1973 oil crisis',
    'Apollo program',
    'European Union Law',
    'Amazon rainforest',
    'Ctenophora',
    'Fresno California',
    'Packet switching',
    'Black Death',
    'Geology',
    'Newcastle upon Tyne',
    'Victoria and Albert Museum',
    'American Broadcasting Company',
    'Genghis Khan',
    'Pharmacy',
    'Immune system',
    'Civil disobedience',
    'Construction',
    'Private school',
    'Harvard University',
    'Jacksonville, Florida',
    'Economic inequality',
    'Doctor Who',
    'University of Chicago',
    'Yuan dynasty',
    'Kenya',
    'Intergovernmental Panel on Climate Change',
    'Chloroplast',
    'Prime number',
    'Rhine',
    'Scottish Parliament',
    'Islamism',
    'Imperialism',
    'United Methodist Church',
    'French and Indian War',
    'Force',
]

In [59]:
# Look up every title in `topics` on Wikipedia and write the cleaned passages
# to the JSON file that to_json targets ('./../data/wiki_text.json').
to_json(topics)