In [None]:
This notebook downloads Wikipedias in different languages and extracts raw texts. Next, it creates frequency distributions for each language and seed lists for the languages other than German. Seeds are words that occur in one of the other languages but not in German, listed by decreasing frequency.

### 0. Install and import packages to process Wiki and data 

In [None]:
!pip3 install wikiextractor nltk pandas spacy wikipedia wikipedia-api tqdm widgetsnbextension ipywidgets

In [None]:
# jupyter lab hacks to support tqdm
# Install and enable the widgetsnbextension notebook extension for the current user (--user).
!jupyter nbextension install --user --py widgetsnbextension
!jupyter nbextension enable --user --py widgetsnbextension
# Install the JupyterLab widget manager extension.
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [4]:
import os
import re
from collections import Counter

import caffeine
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm_notebook as tqdm

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/katyaartemova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1. Download & extract texts from wikipedia dumps
This takes quite some time, especially for larger dumps.

In [None]:
# download wiki dumps
# !wget https://dumps.wikimedia.org/barwiki/latest/barwiki-latest-pages-articles-multistream.xml.bz2
# !wget https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles-multistream.xml.bz2
# !wget https://dumps.wikimedia.org/lbwiki/latest/lbwiki-latest-pages-articles-multistream.xml.bz2
# !wget https://dumps.wikimedia.org/alswiki/latest/alswiki-latest-pages-articles-multistream.xml.bz2

In [1]:
# extract raw texts from wiki dumps
# Each *.xml.bz2 dump is read from the current directory; WikiExtractor is run
# with --json and writes its output under wiki/<lang>_text.
!mkdir wiki
!python3 -m wikiextractor.WikiExtractor barwiki-latest-pages-articles-multistream.xml.bz2 --json -o wiki/bar_text
!python3 -m wikiextractor.WikiExtractor dewiki-latest-pages-articles-multistream.xml.bz2 --json -o wiki/de_text
!python3 -m wikiextractor.WikiExtractor alswiki-latest-pages-articles-multistream.xml.bz2 --json -o wiki/als_text
!python3 -m wikiextractor.WikiExtractor lbwiki-latest-pages-articles-multistream.xml.bz2 --json -o wiki/lb_text

# Delete the compressed dumps once they have been processed.
# NOTE(review): these lines run even if an extraction above failed — confirm
# that the wiki/ output is complete before executing them.
!rm barwiki-latest-pages-articles-multistream.xml.bz2 
!rm dewiki-latest-pages-articles-multistream.xml.bz2
!rm alswiki-latest-pages-articles-multistream.xml.bz2
!rm lbwiki-latest-pages-articles-multistream.xml.bz2

INFO: Preprocessing 'barwiki-latest-pages-articles-multistream.xml.bz2' to collect template definitions: this may take some time.
INFO: Loaded 7517 templates in 3.9s
INFO: Starting page extraction from barwiki-latest-pages-articles-multistream.xml.bz2.
INFO: Using 7 extract processes.
INFO: Finished 7-process extraction of 43416 articles in 10.6s (4094.4 art/s)
INFO: Preprocessing 'dewiki-latest-pages-articles-multistream.xml.bz2' to collect template definitions: this may take some time.
INFO: Preprocessed 100000 pages
INFO: Preprocessed 200000 pages
INFO: Preprocessed 300000 pages
INFO: Preprocessed 400000 pages
INFO: Preprocessed 500000 pages
INFO: Preprocessed 600000 pages
INFO: Preprocessed 700000 pages
INFO: Preprocessed 800000 pages
INFO: Preprocessed 900000 pages
INFO: Preprocessed 1000000 pages
INFO: Preprocessed 1100000 pages
INFO: Preprocessed 1200000 pages
INFO: Preprocessed 1300000 pages
INFO: Preprocessed 1400000 pages
INFO: Preprocessed 1500000 pages
INFO: Preprocessed 16

In [2]:
# Wikipedia language codes processed by this notebook; each code is used to
# build the wiki/<lang>_text, raw/<lang>_wiki.txt and fd/<lang>_freq_dist.csv paths.
langs  = ['bar','de','als','lb']

In [5]:
# extract raw texts to plain-text files (one sentence per line)
# exist_ok=True keeps this cell re-runnable: no "File exists" error when the
# directory is already there, and no dependency on a particular shell.
os.makedirs('raw', exist_ok=True)

import glob
import json


def process_wiki(wiki_dump_file_out, wiki_dump_folder_in):
    """Write every sentence of an extracted wiki dump to a single text file.

    Parameters
    ----------
    wiki_dump_file_out : str
        Path of the UTF-8 text file to create; it receives one sentence per line.
    wiki_dump_folder_in : str
        Glob pattern matching the extracted files. Each matched file is read
        line by line and every line is parsed as a JSON object whose ``'text'``
        entry holds the article text.

    Returns
    -------
    None

    Notes
    -----
    Lines that are not valid JSON are skipped. A JSON object without a
    ``'text'`` key still raises ``KeyError``.
    """
    # Sorted so the order of the output file is reproducible across runs.
    fnames = sorted(glob.glob(wiki_dump_folder_in))
    with open(wiki_dump_file_out, 'w', encoding='utf-8') as out_f:
        for filename in tqdm(fnames, total=len(fnames)):
            # Normalise Windows path separators.
            filename = filename.replace("\\", "/")

            # Explicit UTF-8 (the platform default may differ) and a context
            # manager so the input handle is closed after each file.
            with open(filename, 'r', encoding='utf-8') as in_f:
                # Stream article by article instead of buffering a whole file.
                for line in in_f:
                    try:
                        article = json.loads(line)
                    except ValueError:
                        # json.JSONDecodeError is a ValueError: skip malformed
                        # lines without hiding unrelated errors.
                        continue

                    # NOTE(review): sent_tokenize is called with its default
                    # language model — confirm this is acceptable for these texts.
                    for sentence in sent_tokenize(article['text']):
                        out_f.write(sentence + '\n')
    return


# Build raw/<lang>_wiki.txt from the extracted dump of every configured language.
for lang in langs:
    process_wiki(
        wiki_dump_file_out=f'raw/{lang}_wiki.txt',
        wiki_dump_folder_in=f'wiki/{lang}_text/**/*',
    )

mkdir: raw: File exists


  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/7370 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

### 2. Frequency distribution 
Results:
* frequency dist for each language 
* frequency dist for the 'bar', 'als' and 'lb' languages minus words that occur in German. These could serve as seeds later on.

In [None]:
# compute freq dists
# exist_ok=True keeps this cell re-runnable when fd/ already exists.
os.makedirs('fd', exist_ok=True)

def compute_freq_dist(lang):
    """Count lower-cased word tokens of one language and save them as CSV.

    Reads ``raw/{lang}_wiki.txt`` line by line, lower-cases each line, extracts
    all ``\\w+`` matches and counts them. The counts are written to
    ``fd/{lang}_freq_dist.csv`` with the columns ``token`` and ``count``
    (plus the DataFrame index), ordered from most to least frequent.

    Parameters
    ----------
    lang : str
        Language code used to build the input and output file names.

    Returns
    -------
    None
    """
    wiki_dump_file_out = f'raw/{lang}_wiki.txt'
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    # Compiled once because it is applied to every line of the file.
    token_pattern = re.compile(r'\w+')

    freq_dist = Counter()
    with open(wiki_dump_file_out, encoding='utf-8') as in_file:
        for sent in tqdm(in_file):
            freq_dist.update(token_pattern.findall(sent.strip().lower()))

    df = pd.DataFrame.from_records(freq_dist.most_common(), columns=['token', 'count'])
    df.to_csv(f'fd/{lang}_freq_dist.csv')

# Write fd/<lang>_freq_dist.csv for every configured language.
for language_code in langs:
    compute_freq_dist(language_code)

In [None]:
# compute seeds
# exist_ok=True keeps this cell re-runnable when seed/ already exists.
os.makedirs('seed', exist_ok=True)

def compute_seeds(lang, de_df):
    """Save the tokens of ``lang`` that do not appear in the German list.

    Loads the ``token`` and ``count`` columns of ``fd/{lang}_freq_dist.csv``,
    drops every row whose token is contained in ``de_df.token`` and writes the
    remaining rows to ``seed/{lang}_seed.csv``.

    Parameters
    ----------
    lang : str
        Language code used to build the input and output file names.
    de_df : pandas.DataFrame
        Frame with a ``token`` column holding the German tokens.
    """
    lang_df = pd.read_csv(f'fd/{lang}_freq_dist.csv', usecols=['token', 'count'])
    shared_with_german = lang_df.token.isin(de_df.token)
    seed_df = lang_df.loc[~shared_with_german]
    seed_df.to_csv(f'seed/{lang}_seed.csv')

# The German frequency list is written to fd/ by compute_freq_dist, so it has
# to be read from fd/ as well (a bare 'de_freq_dist.csv' is not produced anywhere).
de_df = pd.read_csv('fd/de_freq_dist.csv', usecols=['token', 'count'])

# German is the reference list, so no seed file is produced for it.
for lang in langs:
    if lang != 'de':
        compute_seeds(lang, de_df)