# Filtering the Wiki dataset

In [2]:
import numpy as np
import pandas as pd
import csv

In [4]:
# Load the tab-separated Wiktionary title dump.
# `pd.DataFrame.from_csv` was deprecated and later removed from pandas; `read_csv` with
# `index_col=0` reproduces its default of using the first column as the index.
# (`from_csv` also tried to parse the index as dates by default, which is not wanted for titles.)
df = pd.read_csv('data/enwiktionary-latest-all-titles-in-ns0', sep='\t', index_col=0)

In [5]:
# Move the current index back into a regular column, then report the row count
# and preview the first rows.
df = df.reset_index(drop=False)
print(df.shape[0])
df.head(5)

5442799


Unnamed: 0,page_title
0,!
1,!!
2,!!!
3,!'O!Kung
4,!'O!uŋ


Keep only compound titles

In [6]:
# A title is kept when it contains at least one underscore;
# missing titles are treated as non-matching (na=False).
df = df.loc[df["page_title"].str.contains('_', na=False)]
print(df.shape[0])
df.head(5)

279105


Unnamed: 0,page_title
12,!O_Kung
30,!nôhm-tâ_súbu-sùbu
34,!nōo_qùli
39,!qhàa_gǀqhùã_a̰a
40,!qhàa_gǀqhùã_qáe


In [7]:
# Lower-case every title so that the subsequent [a-z_] filter is case-insensitive.
# `assign` builds a new frame rather than writing into `df`, which at this point is a
# boolean-mask selection of the loaded frame; attribute assignment on such a selection
# triggers pandas' SettingWithCopyWarning.
df = df.assign(page_title=df.page_title.str.lower())

Keep only titles that consist solely of regular Latin letters (a–z) and underscores

In [8]:
# Retain only the titles that match the pattern below from start to end.
df_filtered = df.loc[df["page_title"].str.contains("^[a-z_]+$", regex=True, na=False)]
print(df_filtered["page_title"].size)
df_filtered.head(5)

196241


Unnamed: 0,page_title
10472,aba_form
10491,abc_islands
10492,abc_news
10493,abc_art
10494,abc_book


In [9]:
# Remove entries with more than 2 parts in the word, i.e. keep titles with at most one underscore.
# `df_filtered` only holds titles made of [a-z_] here, so counting underscores is the same as
# counting what is left after stripping the letters. Counting directly avoids
# `str.replace('[a-z]', '')`, whose pattern is only a regex when `regex=True` is in effect;
# where the default is literal matching, nothing is stripped and every row is dropped.
df_filtered = df_filtered[df_filtered.page_title.str.count('_') < 2]

In [10]:
# Report how many titles survived the filters and pull the title column out as a Series.
print(df_filtered.shape[0])
compound_names = df_filtered["page_title"]
compound_names.head(5)

156722


10472       aba_form
10491    abc_islands
10492       abc_news
10493        abc_art
10494       abc_book
Name: page_title, dtype: object

Save the extracted compound names

In [20]:
# Order the names ascending (the default of sort_values), then write them
# to a gzip-compressed pickle.
compound_names_sorted = compound_names.sort_values()
compound_names_sorted.to_pickle("data/filtered_compound_words.pickle.gz", compression='gzip')

Verify that we can indeed retrieve compound words:

In [21]:
# Select every entry that is exactly equal to the probe word.
compound_names_sorted.loc[compound_names_sorted.eq("hard_drive")]

2108829    hard_drive
Name: page_title, dtype: object

In [22]:
def is_df_sorted(df):
    """Return True when the values of `df` are in non-decreasing order.

    Parameters
    ----------
    df : array-like
        Values to check; they are wrapped in a ``pd.Index`` for the test.

    Returns
    -------
    bool
        True if each value is greater than or equal to the previous one
        (an empty input counts as sorted), False otherwise.
    """
    # `Index.is_monotonic` was an alias of `is_monotonic_increasing` and has been
    # removed from pandas; the explicit name works on old and new versions.
    return bool(pd.Index(df).is_monotonic_increasing)

In [23]:
def search_on_filtered(df, word):
    """Binary-search `word` in a sorted Series of strings.

    Parameters
    ----------
    df : pandas.Series
        Values to search. They must already be sorted in ascending order;
        the result is meaningless otherwise, and this is not checked here.
    word : str
        The exact string to look for.

    Returns
    -------
    bool
        True if `word` is present in `df`, False otherwise (including when
        `df` is empty or `word` sorts after every element).
    """
    # Depending on the pandas version, searchsorted hands back either a scalar or a
    # one-element array for a scalar needle; np.ravel normalises both forms.
    pos = int(np.ravel(df.searchsorted(word))[0])
    # An insertion point past the last element means `word` is larger than every
    # stored value; indexing there would raise IndexError.
    if pos >= len(df):
        return False
    return bool(df.iloc[pos] == word)

In [24]:
# Time the lookup on the sorted, filtered compound names.
%timeit search_on_filtered(compound_names_sorted, "technical_support")
# Time the same lookup on the larger, unfiltered-by-alphabet title column.
# NOTE(review): `df.page_title` is never explicitly sorted in the visible notebook, so this
# call is only useful as a timing comparison; its return value should not be relied on.
%timeit search_on_filtered(df.page_title, "technical_support")

10000 loops, best of 3: 134 µs per loop
10000 loops, best of 3: 144 µs per loop


In [16]:
# Negative check: a word containing a non-ASCII letter cannot be in the filtered set.
# The lookup is a binary search, so it has to run on the sorted Series
# (`compound_names_sorted`), not on the unsorted `compound_names`.
print(search_on_filtered(compound_names_sorted, "technical_ésupport"))

False


In [25]:

# Confirm the ordering that the binary-search lookup relies on.
print(is_df_sorted(df=compound_names_sorted))

True
