# Preprocessing 

In [35]:
import docx
import pandas as pd
import glob
import os
from pathlib import Path
import re
import pickle
import string
import numpy as np
# pip install germalemma

## Loading files into the df and extracting the names and year

In [48]:
dir_path = "../data/rel_IRL/"

def get_docx_files(dir_path):
    docx_files = []
    
    for root, _, files in os.walk(dir_path):
        for f in files:
            if f.endswith('.docx'):
                docx_files.append(os.path.join(root, f))
    
    return docx_files

docx_files = get_docx_files(dir_path)

data = []

for file in docx_files:
    doc = docx.Document(file)
    article = ""  # Store the content of the current article
    for para in doc.paragraphs:
        if para.text.strip().startswith("Date of publication"):
            if article:  # Append the previous article
                data.append([file, article.strip()])
                article = ""  # Reset the article content
        article += para.text.strip() + ' '  # Add paragraph to the current article
    if article:  # Append the last article in the file
        data.append([file, article.strip()])

df = pd.DataFrame(data, columns=['file_name', 'article'])

In [49]:
df

Unnamed: 0,file_name,article
0,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Chu...
1,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: The...
2,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Are...
3,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Toy...
4,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Chu...
...,...,...
639,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/01/22 Link: Title: The...
640,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/01/22 Link: Title: Our...
641,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/02/22 Link: Title: 202...
642,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/03/22 Link: Title: Lif...


In [15]:
# df['date']=df['f_name'].apply(lambda x: x.split('/')[3])
# df['date']=df['date'].str[:7]
# df['year']=df['date'].str[:4]

In [51]:
# Function to extract text after "Text of Article:"
def extract_article_text(text):
    article_marker = "Text of article:"
    article_index = text.find(article_marker)
    if article_index != -1:
        return text[article_index + len(article_marker):].strip()
    else:
        return text

# Apply the function to create the new column
df['txt_1'] = df['article'].apply(extract_article_text)

In [52]:
df[df['txt_1'].isna()]

Unnamed: 0,file_name,article,txt_1


In [53]:
df

Unnamed: 0,file_name,article,txt_1
0,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Chu...,A social media poll of over 700 people conduct...
1,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: The...,How should the Catholic Church respond when a ...
2,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Are...,My inclinations are to regard the hullaballoo ...
3,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Toy...,The Tokyo archdiocese has suspended all public...
4,../data/rel_IRL/Irish Catholic Definite.docx,Date of publication: 05/03/20 Link: Title: Chu...,Irish bishops say they are poised to take radi...
...,...,...,...
639,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/01/22 Link: Title: The...,"20 January 2022 The Irish Catholic, Ruadhán Jo..."
640,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/01/22 Link: Title: Our...,At Conference 2021 the Ambassadors shared in t...
641,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/02/22 Link: Title: 202...,"Like many in recent days, I have been consider..."
642,../data/rel_IRL/All Protestant Documents.docx,Date of publication: 01/03/22 Link: Title: Lif...,Over the last 18 months something extremely si...


## text cleaning

In [54]:
def clean(text):
    text = re.sub("http\S+|www\S+|fb.com\S+|programm.ard.de\S+", '', text)
    text = ''.join([char for char in text if char not in '–!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”·‘'])
    text = re.sub('\n',' ',text)
    text = re.sub('‹|›|„','',text)
    text = re.sub(r'\b\d+\b\s*$', '', text)
    text = re.sub(r'\b\w*\d+\w*\b', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'•\t', '', text)
    return text.strip()

In [55]:
df['cleaned_txt'] = df['txt_1'].apply(clean)

In [56]:
df['len_text'] = df['txt_1'].apply(lambda x: len(x.split()))

In [57]:
# sprawdzam jak długi jest tekst i czy czegoś nie ucięło, wyrywkowo
df['len_text'].describe()

count     644.000000
mean      661.686335
std       508.711678
min        78.000000
25%       313.750000
50%       534.500000
75%       850.250000
max      4302.000000
Name: len_text, dtype: float64

In [58]:
min_row = df.loc[df['len_text'].idxmin()]

# Print the row
print(min_row)

file_name                 ../data/rel_IRL/Muslim Newsletter.docx
article        Date of publication: 03/03/20 Link: Title: Dep...
txt_1          Here is the web link for the Department of Hea...
cleaned_txt    Here is the web link for the Department of Hea...
len_text                                                      78
Name: 407, dtype: object


## Klasyfikacja do denominacji

In [94]:
dict = {
        "Bistum" : 'Cath_Ch', "BR" : 'Cath_Ch', "DBK" : 'Cath_Ch', 'chrismon': 'Cath_Ch', 'katholisch.de': 'Cath_Ch',
        
        "ELKB" : 'Protest', "EKD" : 'Protest', "EVLKS" : 'Protest',

        "AGiD" : 'Anthrop', "Ant" : 'Anthrop',

        "IRD" : 'Muslim', "KRM" : 'Muslim', "ZDM" : 'Muslim', "DITIB" : 'Muslim', "VIKZ" : 'Muslim', "IRD" : 'Muslim', "ZMD" : 'Muslim', "Islamrat": 'Muslim', "IslamischeZeitung": "IRD"
    }

        

In [95]:
for key, value in dict.items():
    df.loc[df['f_name'].str.contains(key), 'Denomination'] = value


In [96]:
df[df['Denomination'].isna()]

Unnamed: 0,date,year,f_name,raw_txt,txt_1,len_text,cleaned_txt,Denomination


In [31]:
df

Unnamed: 0,f_name,raw_txt,txt_1,cleaned_txt,len_text
0,../data/rel_IRL/Irish Catholic Definite,Date of publication: 05/03/20 Link: Title: Chu...,A social media poll of over 700 people conduct...,A social media poll of over people conducted b...,275506
1,../data/rel_IRL/Muslim Newsletter,Date of publication: 28/02/20 Link: Title: HOW...,HOW TO KEEP SAFE IN LIGHT OF THE CORONAVIRUS I...,HOW TO KEEP SAFE IN LIGHT OF THE CORONAVIRUS I...,8349
2,../data/rel_IRL/Humanist Newsletter,Date of publication: 27/03/20 Link: Title: Onl...,"Marking the death of someone is important, and...",Marking the death of someone is important and ...,4969
3,../data/rel_IRL/All Protestant Documents,Date of publication: 01/04/20 Link: Title: Iso...,"What strange times indeed. Weeks ago, it would...",What strange times indeed Weeks ago it would h...,153216


In [59]:
df.to_csv('../data/clean/rel_irl_clean.csv')

In [100]:
# from germalemma import GermaLemma

# lemmatizer = GermaLemma()

# # passing the word and the POS tag ("N" for noun)
# lemma = lemmatizer.find_lemma('Feinstaubbelastungen', 'N')
# print(lemma)
# # -> lemma is "Feinstaubbelastung"