This notebook implements an experimental idea: take parallel Wikipedia pages (i.e. a page in Bavarian and the corresponding page in German), apply pre-trained NER models to the German texts, and check whether the same named entities (NEs) are mentioned in the Bavarian texts.

### 0. Install and import packages to process Wikipedia and for NER

In [10]:
# !pip3 install wikipedia-api caffeine

In [3]:
import caffeine 
import wikipediaapi
import glob
import json
import pandas as pd
from tqdm import tqdm_notebook as tqdm 

from nltk.tokenize import sent_tokenize

In [None]:
### 1. Extract page titles from Wikipedia dumps and store them 

In [12]:
def get_titles(title_file_out, wiki_dump_folder_in):
    """Write the title of every article found in the dump files to a text file.

    Every file matched by the glob pattern is read line by line; each line is
    expected to be one JSON object with a ``title`` key. Lines that are not
    valid JSON are skipped.

    Args:
        title_file_out: Path of the UTF-8 text file to (over)write, one title
            per line.
        wiki_dump_folder_in: Glob pattern selecting the dump files to read.

    Raises:
        KeyError: If a parsed JSON object has no ``title`` key.
    """
    fnames = glob.glob(wiki_dump_folder_in)
    with open(title_file_out, 'w', encoding='utf-8') as out_f:
        for filename in tqdm(fnames, total=len(fnames)):
            filename = filename.replace("\\", "/")
            # Read with an explicit encoding (matching the output file) and let
            # the context manager close the handle; stream lines instead of
            # buffering the whole file.
            with open(filename, 'r', encoding='utf-8') as in_f:
                for line in in_f:
                    try:
                        article = json.loads(line)
                    except ValueError:
                        # Not a JSON line (json.JSONDecodeError is a ValueError): skip it.
                        continue
                    out_f.write(article['title'] + '\n')
    return

# Output file for the Bavarian page titles
title_file_out='bar_wiki_titles.txt'
# Glob pattern for the dump files; get_titles calls glob.glob without
# recursive=True, so '**' behaves like '*' (exactly one directory level)
wiki_dump_folder_in='bar_text/**/*'
get_titles(title_file_out, wiki_dump_folder_in)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in  tqdm(fnames, total=len(fnames)):


  0%|          | 0/27 [00:00<?, ?it/s]

In [41]:
import pandas as pd

# Read the titles back with the same encoding get_titles used to write them
# (UTF-8); the context manager closes the file once the list is built.
with open(title_file_out, encoding='utf-8') as titles_f:
    bar_titles = [title.strip() for title in titles_f]

# One row per Bavarian title; de_title is filled in by the language-link lookup
df = pd.DataFrame(bar_titles, columns=['bar_title'])
df['de_title'] = ''
df.head()

Unnamed: 0,bar_title,de_title
0,Indus Kuitua,
1,Evoluzion vo da Menschheid,
2,Konsul,
3,Iwagreifands in da Gschicht,
4,Audi Cup,


In [None]:
### 2. Use language links in Wikipedia to get access to the same pages in German, collect the German titles and store them

In [43]:
# Look up the German counterpart of every Bavarian page via its language links.
wiki = wikipediaapi.Wikipedia('bar')
for idx, bar_title in tqdm(df['bar_title'].items(), total = len(df)):
    page = wiki.page(bar_title)
    try:
        page_de = page.langlinks['de']
        de_title = page_de.title
    except Exception:
        # Best effort: no German link (KeyError) or a failed lookup leaves the
        # title empty. `Exception` (not a bare except) keeps Ctrl-C working.
        continue
    # Write through the frame itself: the rows yielded by iterrows() may be
    # copies, so assigning to them is not guaranteed to update df.
    df.at[idx, 'de_title'] = de_title
   

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, row in tqdm(df.iterrows(), total = len(df)):


  0%|          | 0/43415 [00:00<?, ?it/s]

In [44]:
# Inspect the title pairs after the language-link lookup
df

Unnamed: 0,bar_title,de_title
0,Indus Kuitua,Indus-Kultur
1,Evoluzion vo da Menschheid,Stammesgeschichte des Menschen
2,Konsul,Konsul
3,Iwagreifands in da Gschicht,
4,Audi Cup,Audi Cup
...,...,...
43410,Middlmeea,
43411,Tz,
43412,Dreißgjaariga Kriag,
43413,Hans Magnus Enzensberger,


In [48]:
# Keep only the rows whose German title is non-empty
df = df.loc[df['de_title'].ne('')]

In [51]:
# Number of distinct German and Bavarian titles (NaN, if any, counted like unique() does)
df['de_title'].nunique(dropna=False), df['bar_title'].nunique(dropna=False)

(9887, 13811)

In [None]:
### 4. Extract texts from wikipedia dumps 

In [63]:
def get_text(titles, wiki_dump_folder_in):
    """Collect the text of every dump article whose title is in ``titles``.

    Every file matched by the glob pattern is read line by line; each line is
    expected to be one JSON object with ``title`` and ``text`` keys. Lines that
    are not valid JSON are skipped.

    Args:
        titles: Iterable of article titles to keep.
        wiki_dump_folder_in: Glob pattern selecting the dump files to read.

    Returns:
        A list of ``[title, text]`` pairs, in the order the articles are read.

    Raises:
        KeyError: If a parsed JSON object lacks ``title`` (or ``text`` for a
            wanted article).
    """
    # Set membership is O(1) per article; a list scan was O(len(titles)).
    wanted = set(titles)
    fnames = glob.glob(wiki_dump_folder_in)
    records = []
    # This function only reads; it must not open (and thereby truncate) the
    # titles file.
    for filename in tqdm(fnames, total=len(fnames)):
        filename = filename.replace("\\", "/")
        with open(filename, 'r', encoding='utf-8') as in_f:
            for line in in_f:
                try:
                    article = json.loads(line)
                except ValueError:
                    # Not a JSON line (json.JSONDecodeError is a ValueError): skip it.
                    continue
                title = article['title']
                if title in wanted:
                    records.append([title, article['text']])
    return records

In [77]:
# Collect the Bavarian article texts for the titles currently in df
lang = 'bar'
# NOTE(review): this pattern has an extra 'text/' level compared with the
# 'bar_text/**/*' pattern passed to get_titles — confirm which layout is right
wiki_dump_folder_in=f'{lang}_text/text/**/*'
x = get_text(df[f'{lang}_title'].tolist(), wiki_dump_folder_in) 
df_text = pd.DataFrame.from_records(x, columns = [f'{lang}_title', f'{lang}_text'])
# merge() defaults to an inner join, so rows without a matching text are dropped
df = df.merge(df_text, on = f'{lang}_title')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in  tqdm(fnames, total=len(fnames)):


  0%|          | 0/27 [00:00<?, ?it/s]

In [80]:
# Collect the German article texts for the German titles currently in df
lang = 'de'
wiki_dump_folder_in=f'{lang}_text/**/*'
x = get_text(df[f'{lang}_title'].tolist(), wiki_dump_folder_in) 
df_text = pd.DataFrame.from_records(x, columns = [f'{lang}_title', f'{lang}_text'])
# merge() defaults to an inner join, so rows without a matching text are dropped
df = df.merge(df_text, on = f'{lang}_title')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in  tqdm(fnames, total=len(fnames)):


  0%|          | 0/7370 [00:00<?, ?it/s]

In [95]:
# Keep only pairs where both the Bavarian and the German text are non-empty, then save
df = df.loc[df['bar_text'].ne('') & df['de_text'].ne('')]
df.to_csv('bar_de_texts.csv')

In [5]:
# resulting data frame
# Reload the saved article pairs; index_col = 0 reuses the first CSV column
# (the index written by to_csv) as the index
df = pd.read_csv('bar_de_texts.csv', index_col = 0)
df.head()

Unnamed: 0,bar_title,de_title,bar_text,de_text
0,Indus Kuitua,Indus-Kultur,"De Indus Kuitua, dt.: Indus-Kultur, aa Harappa...","Die bronzezeitliche Indus-Kultur, auch ""Indus-..."
1,Evoluzion vo da Menschheid,Stammesgeschichte des Menschen,"In da Evoluzion vo da Menschheid, dt. Evolutio...",Als Stammesgeschichte des Menschen wird das du...
3,Konsul,Konsul,"Da Konsul (Pl.: Konsuln) is a Amtspeason, wejc...","Der Konsul (Plural: Konsuln, abgeleitet vom la..."
4,Audi Cup,Audi Cup,Da Audi Cup is a regelmäßigs zwoadaagegs Fuaßb...,Der Audi Cup ist ein seit 2009 alle zwei Jahre...
7,Biologie und Genetik vo de Menschn,Hominisation,"In da Biologie und Genetik vo de Menschn, dt. ...","Als Hominisation (auch Anthropogenese, selten ..."


In [None]:
### 5. Apply pre-trained NER models to texts in Bavarian and German

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Multilingual NER model (tokenizer and weights loaded from the same checkpoint)
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

# `grouped_entities=True` is deprecated; aggregation_strategy="simple" is the
# strategy it maps to, so the grouping of entities is unchanged.
# NOTE(review): the results contain word-piece fragments such as '##nsul';
# aggregation_strategy="first" may avoid that — confirm before switching.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")



In [7]:
lang = 'de'

# Split each German article into sentences with the German Punkt model
# (the default model is English and mis-splits at German abbreviations).
df[f'{lang}_sent'] = df[f'{lang}_text'].apply(
    lambda text: sent_tokenize(text, language='german')
)

# Run NER sentence by sentence: one list of entity strings per sentence.
# Results are collected in a list and assigned as a column afterwards, because
# writing to the rows yielded by iterrows() is not guaranteed to update df.
ner_results = []
for sentences in tqdm(df[f'{lang}_sent'], total = len(df)):
    ner_results.append([[ne['word'] for ne in nlp(sent)] for sent in sentences])
df[f'{lang}_ner_results'] = pd.Series(ner_results, index=df.index, dtype=object)

df.to_csv(f'bar_de_texts_ne_{lang}.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, row in tqdm(df.iterrows(), total = len(df)):


  0%|          | 0/8833 [00:00<?, ?it/s]

In [8]:
lang = 'bar'

# Split each Bavarian article into sentences.
# NOTE(review): sent_tokenize uses its default (English) model here — confirm
# whether another Punkt model would split Bavarian text better.
df[f'{lang}_sent'] = df[f'{lang}_text'].apply(sent_tokenize)

# Run NER sentence by sentence: one list of entity strings per sentence.
# Results are collected in a list and assigned as a column afterwards, because
# writing to the rows yielded by iterrows() is not guaranteed to update df.
ner_results = []
for sentences in tqdm(df[f'{lang}_sent'], total = len(df)):
    ner_results.append([[ne['word'] for ne in nlp(sent)] for sent in sentences])
df[f'{lang}_ner_results'] = pd.Series(ner_results, index=df.index, dtype=object)

df.to_csv(f'bar_de_texts_ne_{lang}.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, row in tqdm(df.iterrows(), total = len(df)):


  0%|          | 0/8833 [00:00<?, ?it/s]

In [9]:
# Results
# One row per article pair: the *_sent columns hold the sentence lists and the
# *_ner_results columns hold one list of entity strings per sentence
df

Unnamed: 0,bar_title,de_title,bar_text,de_text,de_sent,de_ner_results,bar_sent,bar_ner_results
0,Indus Kuitua,Indus-Kultur,"De Indus Kuitua, dt.: Indus-Kultur, aa Harappa...","Die bronzezeitliche Indus-Kultur, auch ""Indus-...","[Die bronzezeitliche Indus-Kultur, auch ""Indus...","[[], [], [Indus, ind], [Pakistan, Indiens, Afg...","[De Indus Kuitua, dt., : Indus-Kultur, aa Hara...","[[Indus Kuitua], [Indus, Harappa Kuitua, ind],..."
1,Evoluzion vo da Menschheid,Stammesgeschichte des Menschen,"In da Evoluzion vo da Menschheid, dt. Evolutio...",Als Stammesgeschichte des Menschen wird das du...,[Als Stammesgeschichte des Menschen wird das d...,"[[Homo, sapiens], [Sc], [Hominini], [T. H. Hux...","[In da Evoluzion vo da Menschheid, dt., Evolut...","[[Evoluzion vo da Menschheid], [Hominoidae], [..."
3,Konsul,Konsul,"Da Konsul (Pl.: Konsuln) is a Amtspeason, wejc...","Der Konsul (Plural: Konsuln, abgeleitet vom la...","[Der Konsul (Plural: Konsuln, abgeleitet vom l...","[[late, ##inischen, römischen], [Konsulat], [K...","[Da Konsul (Pl., : Konsuln) is a Amtspeason, w...","[[Ko, ##nsul], [Ko, ##nsul, ##n], [Remischn Re..."
4,Audi Cup,Audi Cup,Da Audi Cup is a regelmäßigs zwoadaagegs Fuaßb...,Der Audi Cup ist ein seit 2009 alle zwei Jahre...,[Der Audi Cup ist ein seit 2009 alle zwei Jahr...,"[[Audi Cup, Audi AG, Allianz Arena, München], ...",[Da Audi Cup is a regelmäßigs zwoadaagegs Fuaß...,"[[Audi Cup, Allianz Arena, Minga, Bayern Minga..."
7,Biologie und Genetik vo de Menschn,Hominisation,"In da Biologie und Genetik vo de Menschn, dt. ...","Als Hominisation (auch Anthropogenese, selten ...","[Als Hominisation (auch Anthropogenese, selten...","[[Homo sapiens], [Hominisation], [], [], [Evol...","[In da Biologie und Genetik vo de Menschn, dt....","[[Biologie und Genetik vo de Menschn], [Biolog..."
...,...,...,...,...,...,...,...,...
13796,"Searcy County, Arkansas",Searcy County,Searcy County is a Bezirk im Bundesstoot Arkan...,Das Searcy County ist ein County im US-Bundess...,[Das Searcy County ist ein County im US-Bundes...,"[[Searcy County, Arkansas], [Marshall], [Dry C...",[Searcy County is a Bezirk im Bundesstoot Arka...,"[[Searcy County, Arkansas, USA], [Bezirk], [Ma..."
13798,"Sebastian County, Arkansas",Sebastian County,Sebastian County is a Beziak im Bundesstoot Ar...,Das Sebastian County ist ein County im US-Bund...,[Das Sebastian County ist ein County im US-Bun...,"[[Sebastian County, Arkansas], [Fort Smith], [...",[Sebastian County is a Beziak im Bundesstoot A...,"[[Sebastian County, Arkansas, USA], [], [Fort ..."
13800,"Sevier County, Arkansas",Sevier County (Arkansas),Sevier County is a Beziak im Bundesstoot Arkan...,Das Sevier County ist ein County im US-Bundess...,[Das Sevier County ist ein County im US-Bundes...,"[[Sevier County, Arkansas], [De Queen], [Dry C...",[Sevier County is a Beziak im Bundesstoot Arka...,"[[Sevier County, Arkansas, USA], [], [De Queen..."
13802,"Sharp County, Arkansas",Sharp County,Sharp County is a Beziak im Bundesstoot Arkans...,Das Sharp County ist ein County im US-Bundesst...,[Das Sharp County ist ein County im US-Bundess...,"[[Sharp County, Arkansas], [Ash Flat], [Dry Co...",[Sharp County is a Beziak im Bundesstoot Arkan...,"[[Sharp County, Arkansas, USA], [], [Ash Flat]..."


In [None]:
# 6. TBA process results 