In [1]:
import os

import pandas as pd

## read data

### parsed texts from wiki raw html pages

In [3]:
# Directory holding the parsed wiki extract. The location is machine-specific,
# so it can be overridden through the WIKI_PARSED_DIR environment variable;
# when the variable is unset the original local path is used.
wiki_dp = os.environ.get(
    'WIKI_PARSED_DIR',
    '/media/rtn/Windows 10/work/univier/wiki_extract/wiki_parsed',
)
# filepaths.csv is the index of the extract; later cells read its
# `filename` and `path` columns.
fp = os.path.join(wiki_dp, 'filepaths.csv')
filemap = pd.read_csv(fp)
print(filemap.shape)

(223619, 3)


### domain data to narrow search
* 13105 of the 15190 domain articles (about 86%) are found in our wiki extract.
* Check which articles are missing.

In [174]:
# Titles of the domain articles: first column of the header-less TSV,
# lower-cased, stripped, and with the listed punctuation characters each
# replaced by an underscore.
domain_articles = (
    pd.read_csv('data/selected_docs.tsv', sep='\t', header=None)
    .iloc[:, 0]
    .str.lower()
    .str.strip()
    .str.replace(r"[:,!#'\(\);\.\/\-–]", '_', regex=True)
)
domain_articles.shape

(15190,)

In [175]:
# Number of distinct normalised domain titles that also occur as a filename
# in the wiki extract.
len(set(domain_articles) & set(filemap['filename']))

13105

In [176]:
# Distinct domain titles that have no counterpart among the extract's
# filenames. The set difference is sorted before building the frame because
# iteration order of a set of strings is not stable between interpreter
# sessions (string hashes are randomised per process), which would otherwise
# give the frame a different row order on every kernel restart.
# key=str keeps the sort from failing should a non-string value be present.
not_matched = pd.DataFrame({
    'title': sorted(set(domain_articles).difference(filemap['filename']), key=str)
})
not_matched.shape

(2082, 1)

In [177]:
# Flag titles made up solely of lowercase ASCII letters, digits and
# underscores (the whole title must match the pattern).
not_matched['is_alphanum'] = not_matched['title'].str.fullmatch(r'[a-z0-9_]+')
# How many unmatched titles fall on each side of that flag.
not_matched['is_alphanum'].value_counts()

True     1802
False     280
Name: is_alphanum, dtype: int64

In [178]:
# Peek at ten unmatched titles that contain characters outside [a-z0-9_].
# The titles are sorted first and random_state is fixed, so the same ten
# titles are shown on every run regardless of the row order of not_matched.
not_matched.loc[~not_matched['is_alphanum'], 'title'].sort_values().sample(10, random_state=0).tolist()

['university_nuhelotʼįne_thaiyotsʼį_nistameyimâkanak_blue_quills',
 'thalía',
 'manú_national_park',
 'musée_d_orsay',
 'toys_"r"_us',
 'coupé',
 'düsseldorf_government_region',
 'karl_wilhelm_von_nägeli',
 'eats__shoots_&_leaves',
 'mø']

In [179]:
# Peek at ten unmatched titles that consist only of [a-z0-9_] characters.
# The titles are sorted first and random_state is fixed, so the same ten
# titles are shown on every run regardless of the row order of not_matched.
not_matched.loc[not_matched['is_alphanum'], 'title'].sort_values().sample(10, random_state=0).tolist()

['flora__given_name_',
 'new_year_s_evil',
 'cinnamon_toast_crunch',
 '1990_fifa_world_cup_qualification___uefa_group_4',
 'list_of_emmerdale_characters__2017_',
 'haldighati_pass',
 'emir_uyar',
 'velioglu_s_chub',
 'last_characters_in_waterloo_road__7__8__9__10_',
 'list_of_u_s__cities_named_after_their_state']

In [180]:
# Spot check: which extract filenames begin with "cinnamon"?
filemap.loc[filemap['filename'].str.startswith('cinnamon')]

Unnamed: 0,filename,path,html_path
44429,cinnamon,/media/rtn/Windows 10/work/univier/wiki_extrac...,/media/rtn/Windows 10/work/univier/wiki_extrac...


In [133]:
# Spot check: does any extract filename contain an apostrophe?
filemap.loc[filemap['filename'].str.contains("'", regex=False)]

Unnamed: 0,filename,path,html_path


### OK, let's proceed with what we have

In [182]:
# Size of the normalised domain title list before narrowing the extract.
domain_articles.shape

(15190,)

In [191]:
# Distinct domain titles that are present among the filemap filenames.
len(set(domain_articles) & set(filemap['filename']))

13105

In [190]:
# Keep only the extract rows whose filename is one of the domain titles.
# .copy() detaches the result from the full frame it was sliced from.
filemap = filemap.loc[filemap['filename'].isin(domain_articles)].copy()
print(filemap.shape)

(13105, 3)


## functions

In [193]:
def get_article_path(title):
    """Return the `path` entry of the filemap row for an article title.

    The title is lower-cased and stripped of surrounding whitespace, then
    compared for equality with the `filename` column of the module-level
    `filemap` frame. If several rows match, the first one is returned.

    Parameters
    ----------
    title : str
        Article title to look up.

    Returns
    -------
    The value stored in the `path` column of the first matching row.

    Raises
    ------
    IndexError
        If no row of `filemap` has that filename.
    """
    key = title.lower().strip()
    matches = filemap.loc[filemap['filename'] == key, 'path']
    if matches.empty:
        # Same exception type a bare `.iloc[0]` on an empty selection raises,
        # but with a message that says which title was not found.
        raise IndexError(f'no article with filename "{key}" in filemap')
    return matches.iloc[0]

def get_article_text(fp, encoding='utf-8'):
    """Read a text file and return its whole content as one string.

    Parameters
    ----------
    fp : str or path-like
        Path of the file to read.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. It is passed explicitly so
        the result does not depend on the platform's default encoding.

    Returns
    -------
    str
        The full content of the file.
    """
    with open(fp, encoding=encoding) as fin:
        return fin.read()

In [205]:
# Try the two helpers on one randomly drawn filename from filemap.
title = filemap['filename'].sample().iloc[0]
print(f'title: "{title}"')
get_article_text(get_article_path(title))

title: "containment"


'The policy of containment was a diplomatic strategy of the United States to prevent the spread of the communism during the Cold War. Economic support should bind countries to the United States. This was a counteraction against the new Soviet sphere of influence. In March 1947 the US president Harry Truman (1884-1972) announced support for countries which were threatened by military force. This speech is called "Truman Doctrine" nowadays. In detail, he promised support for Greece and Turkey against the Soviet Union. The most important part of the policy of containment was the "European Recovery Program" (1948), also known as "Marshall Plan", which should boost the European economy after the destruction in the Second World War using—most important—money, successful American economic models, and less European trade barriers.'

In [207]:
# Look up one specific article by title and show its text.
get_article_text(get_article_path('bikont'))

' A bikont ("two flagellae") is any eukaryotic organism in the group Bikonta. Many single-celled members of the group, and the presumed ancestor, have two flagella. Another shared trait of bikonts is the fusion of two genes into a single unit. The genes for thymidylate synthase (TS) and dihydrofolate reductase (DHFR) code for a single protein with two functions. The genes are separately translated in unikonts.'