# Description
This notebook is a collection of code snippets that demonstrate how to access individual Wikipedia
articles without decompressing the full dump. This is done by making use of the multistream file
format of the downloaded data dumps.

# Imports

In [1]:
import bz2
import sys
import bisect
import Levenshtein
import os.path as path
from functools import lru_cache
from tqdm.notebook import tqdm_notebook

# Constants

In [2]:
DATA_DIR = '../data'

# Dump Filename Validation

In [3]:
# Validate that all compressed dump files are present in the data directory
def validate_dump(data_dir, date='2021-10-20') -> bool:
    """Check that the multistream dump for `date` exists under `data_dir`.

    Looks for the directory `<data_dir>/<date>` and, inside it, the two files
    `enwiki-<YYYYMMDD>-pages-articles-multistream.xml.bz2` and
    `enwiki-<YYYYMMDD>-pages-articles-multistream-index.txt.bz2`.

    Args:
        data_dir: Directory that holds one sub-directory per dump date.
        date: Dump date in `YYYY-MM-DD` form; the dashes are removed to build
            the file names.

    Returns:
        True if the dump directory and both files exist, otherwise False.
        The first missing path is reported with `print`.
    """
    # Check for existence of the dated dump directory
    if not path.exists(dump_dir := path.join(data_dir, date)):
        # Report the path that was actually checked, not just its parent
        print(f'{dump_dir} does not exist!')
        return False

    # Check for the existence of the multistream data and index files.
    # A tuple (not a set) keeps the check order deterministic between runs.
    fname_base = f'enwiki-{date.replace("-", "")}-pages-articles-multistream'
    for fname_suffix in ('.xml', '-index.txt'):
        if not path.exists(fpath := path.join(dump_dir, f'{fname_base}{fname_suffix}.bz2')):
            print(f'{fpath} does not exist!')
            return False

    # Files exist
    return True

# Validate that the default-date dump files exist under DATA_DIR; the cell displays the boolean result
validate_dump(DATA_DIR)

True

# Experimentation

In [4]:
# Load the multistream index; each line is parsed as `offset:article_id:title`
title2ids = {}   # title -> set of article ids carrying that title
id2title = {}    # article id -> title
id2chunk = {}    # article id -> byte offset of the chunk that holds the article
chunk2len = {}   # chunk byte offset -> chunk length in bytes (None while unknown, i.e. for the last chunk)
fpath = path.join(DATA_DIR, '2021-10-20', 'enwiki-20211020-pages-articles-multistream-index.txt.bz2')
with bz2.open(fpath, 'rt', encoding='utf-8') as index_file:
    last_offset = None
    # Iterate the file lazily; readlines() would hold every index line in memory at once
    for line in index_file:
        # Split on the first 2 colons only, the title itself may contain colons
        offset_field, id_field, title = line.strip().split(':', 2)
        offset, art_id = int(offset_field), int(id_field)

        # A change of offset marks the start of a new chunk. The chunk that just
        # ended (starting at last_offset) spans the distance between the two starts,
        # so the length is stored under last_offset, not under the new offset.
        if offset != last_offset:
            if last_offset is not None:
                chunk2len[last_offset] = offset - last_offset
            last_offset = offset
        # Length of the current chunk is unknown until the next chunk starts
        chunk2len.setdefault(offset, None)

        # Populate title2ids dictionary
        title2ids.setdefault(title, set()).add(art_id)

        # Populate other mappings
        id2title[art_id] = title
        id2chunk[art_id] = offset

In [7]:
# Return the top k closest article titles to query
@lru_cache(maxsize=20)
def search(query:str, k:int=20) -> list[str]:
    """Return the `k` titles in `title2ids` closest to `query`.

    Closeness is the Levenshtein distance between the lower-cased query and
    the lower-cased title. Every title is scanned once while a sorted list of
    the best `k` candidates is maintained.

    Results are memoised per `(query, k)` by `lru_cache`, so changes made to
    `title2ids` after a call are not reflected in repeated calls, and the
    returned list object is shared between cache hits.

    Args:
        query: Text to match against article titles (case-insensitive).
        k: Number of titles to return; must satisfy 0 <= k <= len(title2ids).

    Returns:
        Titles ordered by increasing distance. Among equal distances a title
        seen later is placed ahead of those already kept.

    Raises:
        ValueError: If `k` is negative or exceeds the number of titles.
    """
    # Validate k is within [0, article count]
    if k > len(title2ids): raise ValueError(f'k > article_count, {k} > {len(title2ids)}')
    if k < 0: raise ValueError(f'k < 0, {k} < 0')
    # Nothing requested: avoid indexing into an empty candidate list below
    if k == 0: return []

    # Lower-case the query once instead of once per title
    query_lower = query.lower()

    # Search for the top k closest article titles; placeholders sort last
    res = [(sys.maxsize, None)] * k
    for title in tqdm_notebook(title2ids, desc=f'Searching for {query}'):
        # Compute distance to query
        dist = Levenshtein.distance(query_lower, title.lower())

        # Insert into result list if in top k
        if dist < res[-1][0]:
            ins_idx = bisect.bisect_left(res, dist, key=lambda x: x[0])
            res.insert(ins_idx, (dist, title))
            # Drop the now (k+1)-th entry in place to keep exactly k candidates
            res.pop()

    # Return top k most similar titles
    return [title for _, title in res]

In [8]:
search('Poland')

Searching for Poland:   0%|          | 0/21577561 [00:00<?, ?it/s]

['PolanD',
 'POLAND',
 'Poland',
 'Polad',
 'Toland',
 'Polana',
 'Polani',
 'Polany',
 'Poland.',
 'Polnd',
 'Oland',
 'Holand',
 'Moland',
 'Doland',
 'Voland',
 'Noland',
 'RoLand',
 'Woland',
 'Boland',
 'Polans']

In [10]:
search('Tower of Pizza')

Searching for Tower of Pizza:   0%|          | 0/21577561 [00:00<?, ?it/s]

['Tower of pisa',
 'Tower of Pisa',
 'Tower of Pimps',
 'Tower of Ra',
 'Tower of Lilaia',
 'Tower of Lire',
 'Tower of Bera',
 'Tower of Light',
 'Tower of Ramla',
 'Tower of Mirian',
 'Tower of Muda',
 'Tower of Spite',
 'Tower of Nona',
 'Tower Of Power',
 'Tower of Power',
 'Tower of Siloam',
 'Tower of Dubai',
 'Tower Of Babel',
 'Power of Shazam',
 'Tower of druaga']

In [25]:
wiki_fpath = path.join(DATA_DIR, '2021-10-20', 'enwiki-20211020-pages-articles-multistream.xml.bz2')
def decompress_chunk(article_id, block_size=1 << 20):
    """Decompress and return the chunk of the dump that contains `article_id`.

    Seeks to the chunk's byte offset (from `id2chunk`) in `wiki_fpath` and
    feeds data to a `BZ2Decompressor` until it reports the end of the bz2
    stream. The length recorded in `chunk2len` is only used as the size of the
    first read, so a missing, `None` or too-short length still yields the
    complete chunk instead of truncated output or a read of the rest of the file.

    Args:
        article_id: Id of an article present in `id2chunk`.
        block_size: Bytes per read when the chunk length is unknown or when the
            first read did not reach the end of the stream.

    Returns:
        The decompressed chunk decoded as UTF-8 text.

    Raises:
        KeyError: If `article_id` is not in `id2chunk`.
        EOFError: If the file ends before the end of the bz2 stream.
    """
    offset = id2chunk[article_id]
    decomp = bz2.BZ2Decompressor()
    pieces = []
    with open(wiki_fpath, 'rb') as wiki_file:
        wiki_file.seek(offset)
        # First read uses the indexed chunk length when one is known
        read_size = chunk2len.get(offset) or block_size
        while not decomp.eof:
            data = wiki_file.read(read_size)
            if not data:
                raise EOFError(f'{wiki_fpath} ended before the bz2 stream at offset {offset} was complete')
            # Bytes past the end of the stream are ignored by the decompressor
            pieces.append(decomp.decompress(data))
            read_size = block_size
    return b''.join(pieces).decode('utf-8')

In [28]:
# Print the whole decompressed chunk that holds the best match for a search;
# the chunk also contains the other pages stored alongside that article
article_id = next(iter(title2ids[search('Poland', k=1)[0]]))
print(decompress_chunk(article_id))

  <page>
    <title>Psychology</title>
    <ns>0</ns>
    <id>22921</id>
    <revision>
      <id>1050766379</id>
      <parentid>1050655704</parentid>
      <timestamp>2021-10-19T19:40:14Z</timestamp>
      <contributor>
        <username>Iss246</username>
        <id>1357713</id>
      </contributor>
      <comment>Edit</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text bytes="202981" xml:space="preserve">{{Short description|Study of mental functions and behaviours}}
{{distinguish|Phycology|Physiology|Psychiatry}}
{{about||the album|Psychology (album)|the short story|Psychology (short story)}}
{{redirect|Psychological|the Pet Shop Boys song|Psychological (song)}}
{{pp-semi-indef|small=yes}}
{{pp-move-indef}}
{{Use dmy dates|date=May 2021}}
{{Psychology sidebar|all}}
&lt;!--Overview of discipline:--&gt;'''Psychology''' is the [[science]] of [[mind]] and [[behavior]]. Psychology includes the study of [[consciousness|conscious]] and [[Unconscious mind