In [83]:
import os
from pathlib import Path
import pandas as pd
import spacy
import re
import codecs

## Import text files

In [84]:
# Directory holding the raw text files: "<parent of cwd>/data/text".
# Path() is the kernel's current working directory, so this location depends
# on where the notebook is started from.
txt_data_path = os.path.join(Path().resolve().parent, "data", "text")

# Nested mapping filled below: text_data[year][party] -> full text of one file.
text_data = {}

for year in ["2013", "2017", "2021"]:
    text_data[year] = {}
    for party in ["spd", "fdp", "cdu", "afd", "gruene", "linke"]:
        # Files are named "<party>_<year>.txt", e.g. "spd_2013.txt".
        filename = party + "_" + year + ".txt"
        file_path = os.path.join(txt_data_path, filename)

        # Read the whole file as UTF-8. The context manager closes the handle
        # even if reading or decoding raises. The built-in open() in text mode
        # also normalises "\r\n" line endings to "\n" (codecs.open does not),
        # so later newline-based processing sees a single newline convention.
        with open(file_path, "r", encoding="utf-8") as in_file:
            text_data[year][party] = in_file.read()

## Clean Data
### General cleansing steps for all texts

In [85]:
# General clean-up applied to every loaded text, in this order:
#   1. remove bullet glyphs,
#   2. re-join words that were hyphenated across a line break,
#   3. remove tab characters,
#   4. turn each remaining line break that is followed by a character into a space.
# Step 2 must run before step 4, otherwise the "-\n" pairs no longer exist.
# Not applied: stripping of "--" and "(cid:1)" / "cid:1" markers (these were
# present only as commented-out code).
for year in ["2013", "2017", "2021"]:
    for party in ["spd", "fdp", "cdu", "afd", "gruene", "linke"]:
        text = text_data[year][party]

        text = text.replace("•", "")

        # "-\n" and "\t" are the real newline / tab characters. The previous
        # "-\\n" and "\\t" spelled a literal backslash followed by a letter,
        # which does not occur at line breaks or tabs, so nothing was removed.
        text = text.replace("-\n", "")
        text = text.replace("\t", "")

        # A newline followed by any non-newline character becomes a space plus
        # that character. Newlines followed by another newline (or at the very
        # end of the text) are left in place because "." does not match "\n".
        text = re.sub(r"\n(.)", r" \1", text)

        text_data[year][party] = text

### Cleaning the "Bündnis 90/Die Grünen" (the Greens) texts

In [86]:
# Strip the repeated document title from the Greens' text of each year.
# The title occurs in two spellings (with and without spaces around the "/");
# the compact spelling is first rewritten to the spaced one, then every
# occurrence of the spaced spelling is removed.
for year in ("2013", "2017", "2021"):
    title = "Bundestagswahlprogramm " + year
    compact_header = title + " BÜNDNIS 90/DIE GRÜNEN"
    spaced_header = title + " BÜNDNIS 90 / DIE GRÜNEN"

    greens_text = text_data[year]["gruene"]
    greens_text = greens_text.replace(compact_header, spaced_header)
    text_data[year]["gruene"] = greens_text.replace(spaced_header, "")

# The 2013 text additionally has this slogan removed wherever it occurs.
slogan_2013 = "Teilhaben. Einmischen. Zukunft schaffen."
text_data["2013"]["gruene"] = text_data["2013"]["gruene"].replace(slogan_2013, "")
            

## Store cleaned data

In [87]:
# Output directory for the cleaned texts: "<parent of cwd>/data/cleaned".
# Note that this re-points txt_data_path, which the loading cell set to the
# raw-text directory; the loading cell sets it again when re-run.
txt_data_path = os.path.join(Path().resolve().parent, "data", "cleaned")

# Create the output directory if it is missing, so that a fresh checkout
# does not fail with FileNotFoundError on the first write.
os.makedirs(txt_data_path, exist_ok=True)

for year in ["2013", "2017", "2021"]:
    for party in ["spd", "fdp", "cdu", "afd", "gruene", "linke"]:
        # Same "<party>_<year>.txt" naming as the input files.
        filename = party + "_" + year + ".txt"
        file_path = os.path.join(txt_data_path, filename)

        # Write the cleaned text as UTF-8, overwriting any existing file.
        # The context manager flushes and closes the handle even if the write
        # raises; plain "w" is sufficient because nothing is read back.
        with open(file_path, "w", encoding="utf-8") as out_file:
            out_file.write(text_data[year][party])