# Preprocessing 

In [1]:
import docx
import pandas as pd
import glob
import os
from pathlib import Path
import re
import pickle
import string
import numpy as np
# pip install germalemma

## Loading files into the df and extracting the names and year

In [3]:
dir_path = "../data/rel_CAN/"

def get_docx_files(dir_path):
    docx_files = []
    
    for root, _, files in os.walk(dir_path):
        for f in files:
            if f.endswith('.docx'):
                docx_files.append(os.path.join(root, f))
    
    return docx_files

docx_files = get_docx_files(dir_path)

data = []

for file in docx_files:
    doc = docx.Document(file)
    article = ""  # Store the content of the current article
    for para in doc.paragraphs:
        if para.text.strip().startswith("Date of publication"):
            if article:  # Append the previous article
                data.append([file, article.strip()])
                article = ""  # Reset the article content
        article += para.text.strip() + ' '  # Add paragraph to the current article
    if article:  # Append the last article in the file
        data.append([file, article.strip()])

df = pd.DataFrame(data, columns=['file_name', 'article'])

In [4]:
df

Unnamed: 0,file_name,article
0,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 13 march 2021 Link: https...
1,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 17 december 2021 Link: ht...
2,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 7 june 2021 Link: https:/...
3,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 2 february 2021 Link: htt...
4,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 13 march 2020 Link: https...
...,...,...
404,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 17 june 2020 Link: https:...
405,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 15 june 2020 Link: https:...
406,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 13 august 2020 Link: http...
407,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 9 july 2020 Link: https:/...


In [15]:
# df['date']=df['f_name'].apply(lambda x: x.split('/')[3])
# df['date']=df['date'].str[:7]
# df['year']=df['date'].str[:4]

In [5]:
# Function to extract text after "Text of Article:"
def extract_article_text(text):
    article_marker = "Text of article:"
    article_index = text.find(article_marker)
    if article_index != -1:
        return text[article_index + len(article_marker):].strip()
    else:
        return text

# Apply the function to create the new column
df['txt_1'] = df['article'].apply(extract_article_text)

In [6]:
df[df['txt_1'].isna()]

Unnamed: 0,file_name,article,txt_1


In [7]:
df

Unnamed: 0,file_name,article,txt_1
0,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 13 march 2021 Link: https...,Hasidic communities are again turning to the c...
1,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 17 december 2021 Link: ht...,"Synagogues, like all places of worship in Queb..."
2,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 7 june 2021 Link: https:/...,Toronto Public Health (TPH) has lifted an orde...
3,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 2 february 2021 Link: htt...,MONTREAL— A Quebec judge said she will decide ...
4,../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...,Date of publication: 13 march 2020 Link: https...,Everyone who was at the Beth Chabad community ...
...,...,...,...
404,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 17 june 2020 Link: https:...,"Recently, the U.S Centers for Disease Control ..."
405,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 15 june 2020 Link: https:...,"Every year, Muslims around the world celebrate..."
406,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 13 august 2020 Link: http...,The pandemic lockdown may have impacted our me...
407,../data/rel_CAN/CAN_ISLAM_MUSLIMLINK_2020-2023...,Date of publication: 9 july 2020 Link: https:/...,The global . By relying on online digital medi...


## text cleaning

In [8]:
def clean(text):
    text = re.sub("http\S+|www\S+|fb.com\S+|programm.ard.de\S+", '', text)
    text = ''.join([char for char in text if char not in '–!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”·‘'])
    text = re.sub('\n',' ',text)
    text = re.sub('‹|›|„','',text)
    text = re.sub(r'\b\d+\b\s*$', '', text)
    text = re.sub(r'\b\w*\d+\w*\b', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'•\t', '', text)
    return text.strip()

In [9]:
df['cleaned_txt'] = df['txt_1'].apply(clean)

In [10]:
df['len_text'] = df['txt_1'].apply(lambda x: len(x.split()))

In [11]:
# sprawdzam jak długi jest tekst i czy czegoś nie ucięło, wyrywkowo
df['len_text'].describe()

count     409.000000
mean      895.017115
std       599.715187
min       146.000000
25%       565.000000
50%       741.000000
75%      1046.000000
max      5107.000000
Name: len_text, dtype: float64

In [12]:
min_row = df.loc[df['len_text'].idxmin()]

# Print the row
print(min_row)

file_name      ../data/rel_CAN/CAN_JEW_CANJEWISHNEWS_2020-202...
article        Date of publication: June 2022 Link: https://t...
txt_1          Last Sunday, 115 teenagers in Ottawa celebrate...
cleaned_txt    Last Sunday teenagers in Ottawa celebrated the...
len_text                                                     146
Name: 5, dtype: object


## Klasyfikacja do denominacji

In [94]:
dict = {
        "Bistum" : 'Cath_Ch', "BR" : 'Cath_Ch', "DBK" : 'Cath_Ch', 'chrismon': 'Cath_Ch', 'katholisch.de': 'Cath_Ch',
        
        "ELKB" : 'Protest', "EKD" : 'Protest', "EVLKS" : 'Protest',

        "AGiD" : 'Anthrop', "Ant" : 'Anthrop',

        "IRD" : 'Muslim', "KRM" : 'Muslim', "ZDM" : 'Muslim', "DITIB" : 'Muslim', "VIKZ" : 'Muslim', "IRD" : 'Muslim', "ZMD" : 'Muslim', "Islamrat": 'Muslim', "IslamischeZeitung": "IRD"
    }

        

In [95]:
for key, value in dict.items():
    df.loc[df['f_name'].str.contains(key), 'Denomination'] = value


In [96]:
df[df['Denomination'].isna()]

Unnamed: 0,date,year,f_name,raw_txt,txt_1,len_text,cleaned_txt,Denomination


In [31]:
df

Unnamed: 0,f_name,raw_txt,txt_1,cleaned_txt,len_text
0,../data/rel_IRL/Irish Catholic Definite,Date of publication: 05/03/20 Link: Title: Chu...,A social media poll of over 700 people conduct...,A social media poll of over people conducted b...,275506
1,../data/rel_IRL/Muslim Newsletter,Date of publication: 28/02/20 Link: Title: HOW...,HOW TO KEEP SAFE IN LIGHT OF THE CORONAVIRUS I...,HOW TO KEEP SAFE IN LIGHT OF THE CORONAVIRUS I...,8349
2,../data/rel_IRL/Humanist Newsletter,Date of publication: 27/03/20 Link: Title: Onl...,"Marking the death of someone is important, and...",Marking the death of someone is important and ...,4969
3,../data/rel_IRL/All Protestant Documents,Date of publication: 01/04/20 Link: Title: Iso...,"What strange times indeed. Weeks ago, it would...",What strange times indeed Weeks ago it would h...,153216


In [13]:
df.to_csv('../data/clean/rel_can_clean.csv')

In [100]:
# from germalemma import GermaLemma

# lemmatizer = GermaLemma()

# # passing the word and the POS tag ("N" for noun)
# lemma = lemmatizer.find_lemma('Feinstaubbelastungen', 'N')
# print(lemma)
# # -> lemma is "Feinstaubbelastung"