In [1]:
import pandas as pd
from pathlib import Path

In [2]:
def capitalize(df):
    """Return a copy of ``df`` with every text column title-cased.

    Each column is passed through ``Series.str.title()``. Columns that do not
    hold text (numeric columns, all-NaN columns, ...) are left unchanged
    instead of aborting the whole call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame (which may be a filtered slice of
    # another frame) is never written to.
    result = df.copy()
    for column in result.columns:
        try:
            result[column] = result[column].str.title()
        except AttributeError:
            # The ``.str`` accessor raises AttributeError for non-text data;
            # such columns have nothing to title-case.
            continue
    return result

## Extract Articles from Books

In [3]:
def generate_database_from_books(
    source_dir="source",
    output_path="output/data.csv",
    encoding="utf-8",
):
    """Collect (article, following word) pairs from every ``*.txt`` book.

    Each line of each text file is split on whitespace. Whenever a token is
    exactly one of the lowercase articles ``der/die/das/den/dem``, the token
    that follows it on the same line is recorded as its noun after the
    punctuation characters in ``unwanted_chars`` have been removed.

    The pairs are title-cased, de-duplicated, given a fresh 0..n-1 index named
    ``index`` and written to ``output_path`` as CSV.

    Parameters
    ----------
    source_dir : str or Path, default "source"
        Directory searched (non-recursively) for ``*.txt`` files.
    output_path : str or Path, default "output/data.csv"
        CSV file to write. Its parent directory is created if missing.
    encoding : str, default "utf-8"
        Encoding used to read the text files.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``article`` and ``noun``.
    """
    # NOTE(review): matching is case-sensitive and exact, so a sentence-initial
    # "Der"/"Die"/"Das" or an article with attached punctuation is not picked
    # up - confirm whether that is intended.
    articles = {"der", "die", "das", "den", "dem"}

    unwanted_chars = ['"', ',', '.', ':', "»", '«', "_", ")", "(", "?"]
    strip_table = str.maketrans("", "", "".join(unwanted_chars))

    records = []
    # Sorted so the row order does not depend on directory listing order.
    for source_file in sorted(Path(source_dir).glob("*.txt")):
        with open(source_file, "r", encoding=encoding) as text_file:
            for line in text_file:
                words = line.split()
                # The last token of a line has no successor, so skip it.
                for position, word in enumerate(words[:-1]):
                    if word not in articles:
                        continue
                    # Use the current position: ``words.index(word)`` would
                    # always point at the first occurrence of a repeated
                    # article and record the wrong noun.
                    noun = words[position + 1].translate(strip_table)
                    if noun:  # token was punctuation only
                        records.append({"article": word, "noun": noun})

    df = pd.DataFrame(records, columns=["article", "noun"])

    # Normalise the case first so variants such as "hund"/"Hund" collapse
    # into one row when duplicates are dropped.
    df = capitalize(df)
    df = df.drop_duplicates().reset_index(drop=True)
    df.index.name = "index"  # keeps the "index" header in the CSV

    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file)
    return df

In [4]:
# Run the book extraction (this also rewrites output/data.csv) and show the column names.
generate_database_from_books().columns

Index(['article', 'noun'], dtype='object')

## Extract Articles from Frequency List

In [5]:
def generate_dataframe_from_frequency(
    source_file="source/frequent_nouns.txt",
    output_path="output/data.csv",
    encoding="utf-8",
):
    """Build an Article / Noun / Translation table from the frequency list.

    Every line is split at the en dash ("–"). The part before the dash is
    split at the first "." and the remainder is taken as the translation; the
    first two whitespace-separated words after the dash are taken as article
    and noun. Presumably lines look like ``1. time – die Zeit`` - TODO confirm
    against the source file.

    Lines that do not have this shape (blank lines, no dash, no ".", fewer
    than two words after the dash) are skipped. All values are title-cased and
    only rows whose article is ``Die``, ``Der`` or ``Das`` are kept. The result
    is written to ``output_path`` as CSV.

    Parameters
    ----------
    source_file : str or Path, default "source/frequent_nouns.txt"
        Text file to parse.
    output_path : str or Path, default "output/data.csv"
        CSV file to write. Its parent directory is created if missing.
    encoding : str, default "utf-8"
        Encoding used to read ``source_file``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``Article``, ``Noun`` and ``Translation``.
    """
    rows = []
    with open(source_file, "r", encoding=encoding) as text_file:
        for line in text_file:
            dash_split = line.split("–")
            if len(dash_split) < 2:
                continue  # no dash: nothing to pair up
            numbered = dash_split[0].split(".", 1)
            if len(numbered) < 2:
                continue  # no "." separating the prefix from the translation
            german_words = dash_split[1].split()[:2]
            if len(german_words) != 2:
                continue  # need both an article and a noun
            article, noun = german_words
            rows.append([numbered[1].strip(), article, noun])

    # Explicit column names keep the frame well-formed even with zero rows.
    df = pd.DataFrame(rows, columns=["Translation", "Article", "Noun"])

    # Title-case before filtering so lowercase articles ("die Zeit") are kept,
    # and so ``capitalize`` never receives a filtered slice.
    df = capitalize(df)

    articles = ["Die", "Der", "Das"]
    df = df[df["Article"].isin(articles)]
    df = df.reindex(columns=["Article", "Noun", "Translation"])

    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file)
    return df

In [6]:
# Run the frequency-list extraction (this also rewrites output/data.csv) and show the column names.
generate_dataframe_from_frequency().columns

Index(['Article', 'Noun', 'Translation'], dtype='object')

## Extract Articles from A2

In [7]:
def generate_dataframe_from_A2(
    source_file="source/GermanArticlesDataset - A2.csv",
    output_path="output/data.csv",
):
    """Load the A2 dataset, title-case it and write it to ``output_path``.

    The CSV is reduced and reordered to the columns ``Article``, ``Noun``,
    ``Translation`` and ``Traducción``. ``reindex`` fills any of these that is
    absent from the file with NaN.

    Parameters
    ----------
    source_file : str or Path, default "source/GermanArticlesDataset - A2.csv"
        CSV file to read.
    output_path : str or Path, default "output/data.csv"
        CSV file to write. Its parent directory is created if missing.

    Returns
    -------
    pandas.DataFrame
        Frame with the four columns listed above.
    """
    # ``read_csv`` already returns a DataFrame; no extra wrapping is needed.
    df = pd.read_csv(source_file)
    df = df.reindex(columns=["Article", "Noun", "Translation", "Traducción"])
    df = capitalize(df)

    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file)
    return df

In [10]:
# Build the A2 dataset (this also rewrites output/data.csv) and display the resulting frame.
generate_dataframe_from_A2()

Unnamed: 0,Article,Noun,Translation,Traducción
0,Die,Anrufbeantworter,Answering Machines,Contestadores
1,Die,Ansagen,Advertisements,Anuncios
2,Die,Anschlüsse,Connections,Conexiones
3,Die,Antworten,Answers,Respuestas
4,Die,Anzeigen,Advertisements,Anuncios
...,...,...,...,...
623,Die,Zeit,Weather,Tiempo
624,Die,Zeitschrift,Magazine,Revista
625,Die,Zeitung,Newspaper,Periódico
626,Die,Zigarette,Cigarette,Cigarillo
