Reference: https://github.com/daveshap/PlainTextWikipedia

I adapted the parser above so that it extracts only the articles that mention 'Nigeria' (rather than every article in the XML file) and saves each extracted article to its own CSV file.

In [None]:
import os
from threading import Thread
import csv
import re
from html2text import html2text as htt
import wikitextparser as wtp


def dewiki(text):
    """Convert the wiki markup of one article into flat plain text.

    The markup is rendered with ``wikitextparser``, passed through
    ``html2text``, and finally every run of whitespace (including real
    newlines) is collapsed into a single space.

    Args:
        text: Wiki markup of a single article.

    Returns:
        The flattened text. It is not stripped; callers strip it themselves.
    """
    text = wtp.parse(text).plain_text()  # wiki to plaintext
    text = htt(text)  # remove any HTML
    # This targets the literal two-character sequence backslash + 'n', not a
    # real newline; real newlines are handled by the whitespace collapse below.
    text = text.replace('\\n', ' ')
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'\s+', ' ', text)  # replace excess whitespace
    return text


def analyze_chunk(text):
    """Turn the raw XML of one page into an article record, or reject it.

    A page is rejected (``None`` is returned) when it is a redirect, when it
    contains the marker ``(disambiguation)``, or when it mentions neither
    'Nigeria' nor 'nigeria'. Any exception raised while parsing is printed
    and also results in ``None``.

    Args:
        text: The lines of one page joined into a single string.

    Returns:
        A dict with the keys ``'title'``, ``'text'`` and ``'id'`` (all
        stripped strings), or ``None`` if the page is skipped or unparsable.
    """
    try:
        # Guard clauses: redirects and disambiguation pages are not articles.
        if '<redirect title="' in text:
            return None
        if '(disambiguation)' in text:
            return None
        # Keep only pages that mention the keyword in either capitalisation.
        if 'Nigeria' not in text and 'nigeria' not in text:
            return None

        raw_title = text.split('<title>')[1].split('</title>')[0]
        title = htt(raw_title)

        # The first <id> element found in the chunk is used as the identifier.
        page_id = text.split('<id>')[1].split('</id>')[0]

        # Isolate what sits between the opening <text ...> tag and </text.
        up_to_close = text.split('</text')[0]
        from_open = up_to_close.split('<text')[1]
        markup = from_open.split('>', maxsplit=1)[1]
        body = dewiki(markup)

        return {
            'title': title.strip(),
            'text': body.strip(),
            'id': page_id.strip(),
        }
    except Exception as err:
        print(err)
        return None


def save_article(article, savedir):
    """Parse one raw page and, if it is kept, write it to its own CSV file.

    The page is handed to ``analyze_chunk``; when that returns a record, a
    file named after the article title (with a ``.csv`` extension) is written
    into ``savedir``. The file holds a header row followed by a single data
    row with the article's id, title and text.

    Args:
        article: Raw XML text of a single page.
        savedir: Directory the CSV file is written to.

    Returns:
        None. Nothing is written when the page is rejected by
        ``analyze_chunk``.
    """
    doc = analyze_chunk(article)
    if not doc:
        return

    print('SAVING:', doc['title'])

    # Titles may contain path separators (e.g. "AC/DC"). Used verbatim they
    # would point into a non-existent subdirectory and make open() fail, so
    # they are replaced before building the file name.
    safe_name = re.sub(r'[\\/]', '_', doc['title'])
    complete_name = os.path.join(savedir, safe_name + ".csv")

    # newline='' is required by the csv module so that the writer controls
    # line endings itself (otherwise extra blank rows appear on Windows).
    with open(complete_name, 'w', encoding='utf-8', newline='') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerow(["id", "title", "text"])
        csvwriter.writerow([doc['id'], doc['title'], doc['text']])

def process_file_text(filename, savedir, max_threads=8):
    """Stream an XML dump and hand every page to ``save_article``.

    The file is read line by line. A line containing ``<page>`` starts a new
    page, a line containing ``</page>`` ends it; the lines in between are
    joined and passed to ``save_article`` on a worker thread. The line that
    carries the ``<page>`` / ``</page>`` tag itself is not part of the page
    text.

    Args:
        filename: Path of the XML dump, read as UTF-8.
        savedir: Directory passed through to ``save_article``.
        max_threads: Upper bound on the number of worker threads alive at
            the same time. Values below 1 make the pages run one at a time.

    Returns:
        None. All worker threads have finished when the function returns.
    """
    from collections import deque  # local import keeps the block self-contained

    in_flight = deque()  # started workers, oldest first
    page_lines = []
    with open(filename, 'r', encoding='utf-8') as infile:
        for line in infile:
            if '<page>' in line:
                page_lines = []
            elif '</page>' in line:  # end of article
                worker = Thread(
                    target=save_article,
                    args=(''.join(page_lines), savedir),
                )
                worker.start()
                in_flight.append(worker)
                # Back-pressure: starting one thread per page without ever
                # waiting lets the number of live threads grow without bound
                # on a large dump, so wait for the oldest one when at the cap.
                if len(in_flight) >= max_threads:
                    in_flight.popleft().join()
            else:
                page_lines.append(line)

    # Wait for the remaining workers so callers can rely on the output being
    # complete once this function returns.
    for worker in in_flight:
        worker.join()

In [None]:
# Import only the entry point that is used here; a wildcard import would pull
# every name of the module into this namespace.
from dewiki_functions import process_file_text

# Input: the Wikipedia XML dump to scan.
wiki_xml_file = r"/home/admin1/wikidump/enwiki-20211120-pages-articles-multistream.xml" #my file location
# Output: directory that receives one CSV file per extracted article.
csv_save_dir = r"/home/admin1/NigerianArticles"

if __name__ == '__main__':
    process_file_text(wiki_xml_file, csv_save_dir)