In [1]:
import os

import pandas as pd

## read data

In [2]:
def process_title(title: pd.Series):
    """Normalise a Series of article titles into the underscore-separated key format.

    Each title is lower-cased and stripped of surrounding whitespace; afterwards every
    character that is neither a word character nor whitespace is replaced by ``_``
    (the pattern also lists ``_`` and ``-`` explicitly). Inner whitespace is left as is.

    Args:
        title: Series of title strings.

    Returns:
        A new Series with the normalised titles.
    """
    normalised = title.str.lower()
    normalised = normalised.str.strip()
    return normalised.str.replace(r'([^\w\s])|_|-', '_', regex=True)

# def process_title2(title: pd.Series):
#     title = title.str.lower().str.strip()
#     title = title.str.replace(r"[:,!#'\(\);\.\/\-–]", '_', regex=True)
#     return title

### parsed texts from wiki raw html pages

In [3]:
# Directory holding the parsed wiki texts (machine-specific absolute path).
wiki_dp = '/media/rtn/Windows 10/work/univier/wiki_extract/wiki_parsed'
# filepaths.csv is the index of that directory; later cells rely on its
# 'filename' and 'path' columns.
fp = os.path.join(wiki_dp, 'filepaths.csv')
filemap = pd.read_csv(fp)
print(filemap.shape)

(223619, 3)


### domain data to narrow search
* 13112 / 15190 (86%) of the domain articles are found in our wiki extract.
* Check which articles are missing.

In [4]:
# Titles are the first column of the header-less TSV; normalise them with
# process_title so they can be compared with filemap['filename'].
domain_articles = (
    pd.read_csv('data/selected_docs.tsv', sep='\t', header=None)
    .iloc[:, 0]
    .pipe(process_title)
)
domain_articles.shape

(15190,)

In [5]:
# Number of distinct domain titles that have a file in the wiki extract.
len(set(domain_articles) & set(filemap['filename']))

13112

In [6]:
# Domain titles without a file in the wiki extract.
# The difference is a set of strings, whose iteration order changes between
# interpreter runs (hash randomisation); sorting gives a stable row order.
not_matched = pd.DataFrame(
    {'title': sorted(set(domain_articles).difference(filemap['filename']))}
)
not_matched.shape

(2075, 1)

In [7]:
# Flag titles consisting only of ASCII lowercase letters, digits and underscores,
# then count how many unmatched titles fall on each side.
not_matched = not_matched.assign(
    is_alphanum=not_matched['title'].str.fullmatch(r'[a-z0-9_]+')
)
not_matched['is_alphanum'].value_counts()

True     1842
False     233
Name: is_alphanum, dtype: int64

In [8]:
# Peek at unmatched titles that contain characters outside [a-z0-9_].
# Sorting first and fixing the seed makes the draw identical on every run,
# independent of the row order of not_matched.
(
    not_matched.loc[~not_matched['is_alphanum'], 'title']
    .sort_values()
    .sample(10, random_state=42)
    .tolist()
)

['list_of_compositions_by_frédéric_chopin_by_genre',
 'yaoundé',
 'järva_county',
 'bible_de_genève',
 'ryōtei',
 '2014_campeonato_brasileiro_série_a___results_of_first_round',
 'université_de_versailles_saint_quentin_en_yvelines',
 'bible_d_olivétan',
 'napoléon_iii',
 'communes_of_the_isère_department']

In [9]:
# Peek at unmatched titles made up only of [a-z0-9_].
# Sorting first and fixing the seed makes the draw identical on every run,
# independent of the row order of not_matched.
(
    not_matched.loc[not_matched['is_alphanum'], 'title']
    .sort_values()
    .sample(10, random_state=42)
    .tolist()
)

['what_s_up_',
 'swamp_cooler',
 'ratnasambhava',
 'shenandoah_valley_academy',
 'helmuth_schmidt',
 'monjayaki',
 'national_parks_of_austria',
 'sahara__2017_movie_',
 'susan_featherly',
 'i_am_not_a_witch']

In [10]:
# Which extract files have a name beginning with "cinnamon"?
filemap.loc[filemap['filename'].str.startswith('cinnamon')]

Unnamed: 0,filename,path,html_path
44429,cinnamon,/media/rtn/Windows 10/work/univier/wiki_extrac...,/media/rtn/Windows 10/work/univier/wiki_extrac...


In [11]:
# Do any extract filenames still contain an apostrophe?
filemap.loc[filemap['filename'].str.contains("'")]

Unnamed: 0,filename,path,html_path


### OK, let's proceed with what we have

In [12]:
# Re-display the number of domain titles, for comparison with the match count in the next cell.
domain_articles.shape

(15190,)

In [13]:
# Number of distinct domain titles that have a file in the wiki extract.
len(set(domain_articles) & set(filemap['filename']))

13112

In [14]:
# Restrict the file index to the domain articles; .copy() detaches the subset from filemap.
filemap_f = filemap.loc[filemap['filename'].isin(domain_articles)].copy()
# Shapes before and after filtering, one per line.
print(filemap.shape, filemap_f.shape, sep='\n')

(223619, 3)
(13112, 3)


### read target

In [15]:
# Header-less TSV of (query, title) pairs; titles are normalised with
# process_title so they can be compared with filemap['filename'].
target = (
    pd.read_csv('data/queries.tsv', sep='\t', header=None)
    .set_axis(['query', 'title'], axis=1)
    .assign(title=lambda frame: process_title(frame['title']))
)

target.shape

(200, 2)

In [16]:
# Preview the first rows of the query/title pairs.
target.head()

Unnamed: 0,query,title
0,animals that have shells and live in water,shell__zoology_
1,how many different types of scorpions are there,scorpion
2,describe the structure of a scientific name fo...,binomial_nomenclature
3,what are the 3 types of plastids in plant cells,plastid
4,who named the cell and how did he come up with...,cell_theory


In [17]:
# Distinct target titles that have a file anywhere in the wiki extract.
len(set(target['title']) & set(filemap['filename']))

137

In [18]:
# Distinct target titles that have a file in the domain-filtered index.
len(set(target['title']) & set(filemap_f['filename']))

137

## functions

In [21]:
def get_article_path(title):
    """Return the text-file path recorded in ``filemap`` for an article title.

    The title is lower-cased and stripped of surrounding whitespace before being
    compared with ``filemap['filename']``; no other normalisation (such as the
    punctuation replacement done by ``process_title``) is applied here. If several
    rows share the filename, the first row's path is returned.

    Args:
        title: Article title string.

    Returns:
        The value of the ``path`` column for the matching row.

    Raises:
        IndexError: If no row of ``filemap`` has that filename.
    """
    key = title.lower().strip()
    paths = filemap.loc[filemap['filename'] == key, 'path']
    if paths.empty:
        # Same exception type a bare ``.iloc[0]`` on an empty selection raises,
        # but with a message that names the title that was looked up.
        raise IndexError(f'no article named "{key}" in filemap')
    return paths.iloc[0]

def get_article_text(fp):
    """Read and return the entire contents of the text file at ``fp``.

    The file is decoded as UTF-8 explicitly instead of with the platform's default
    encoding, because the article texts contain non-ASCII characters.
    NOTE(review): assumes the parsed extract was written as UTF-8 — confirm.

    Args:
        fp: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    with open(fp, encoding='utf-8') as fin:
        return fin.read()

In [22]:
# Smoke-test the two helpers on one article from the index; the fixed seed makes
# the cell show the same article after every kernel restart.
title = filemap.sample(random_state=42)['filename'].iloc[0]
print(f'title: "{title}"')
get_article_text(get_article_path(title))

title: "telfair_county__georgia"


'Telfair County is a county in the U.S. state of Georgia. As of the 2010 census, 16,500 people lived there. The county seat is McRae–Helena.'

In [23]:
# Spot-check the helpers on a fixed, known title.
get_article_text(get_article_path('bikont'))

' A bikont ("two flagellae") is any eukaryotic organism in the group Bikonta. Many single-celled members of the group, and the presumed ancestor, have two flagella. Another shared trait of bikonts is the fusion of two genes into a single unit. The genes for thymidylate synthase (TS) and dihydrofolate reductase (DHFR) code for a single protein with two functions. The genes are separately translated in unikonts.'