# Preprocessing 

In [71]:
import pandas as pd
import glob
import os
from pathlib import Path
import re
import pickle
import string
import numpy as np
from striprtf.striprtf import rtf_to_text

## Loading files into the Data Frame

In [3]:
# Define the directory path where RTF files are located
dir_path = "../data/sec_IRL/"

def get_docx_files(dir_path):
    """
    Retrieves a list of RTF file paths within a specified directory and its subdirectories.

    Parameters:
    dir_path (str): The directory path to search for RTF files.

    Returns:
    list: A list of file paths to RTF files found within the directory and its subdirectories.
    """
    docx_files = []
    
    for root, _, files in os.walk(dir_path):
        for f in files:
            if f.endswith('.rtf'):
                docx_files.append(os.path.join(root, f))
    
    return docx_files

# Get a list of RTF files in the specified directory and its subdirectories
docx_files = get_docx_files(dir_path)

# Create an empty list to store data extracted from the RTF files
data = []

# Iterate over all RTF files in the directory
for filename in docx_files:
    if filename.endswith(".rtf"):
        with open(filename, 'rb') as file:
            rtf_content = file.read().decode('utf-8')  # Decode binary content to string
            
            # Assuming you have a function rtf_to_text that converts RTF to plain text
            rtf_text = rtf_to_text(rtf_content)
            
            # Append the filename (excluding the ".rtf" extension) and the extracted text to the data list
            data.append([filename[:-5], rtf_text])

# Create a pandas DataFrame from the extracted data
df = pd.DataFrame(data, columns=['f_name', 'raw_txt'])

In [6]:
len(df)

11

In [31]:
df['raw_txt_l']=df['raw_txt'].apply(lambda x: x.split('\n\n\n\n\n\n'))

In [34]:
df=df.explode('raw_txt_l')

In [35]:
df

Unnamed: 0,f_name,raw_txt,raw_txt_l
0,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,\n\n\nNews\nSaudi princes detained amid coup ...
0,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,"News\nWe should learn a sense [...]\n\n1,192 W..."
0,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,News\nDonald Trump is no buffoon or tangerine ...
0,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,News\nPERSONAL PRIVACY: SACRIFICED ON THE ALTA...
0,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,News\nA tale of two brothers United after 80 y...
...,...,...,...
10,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,"\n\n\nNews\n'God is not a killjoy, but if He ...",News; Teasers\nHow well do you remember 2022?\...
10,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,"\n\n\nNews\n'God is not a killjoy, but if He ...",News\nKey events that played a role in the phe...
10,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,"\n\n\nNews\n'God is not a killjoy, but if He ...",News\nGroup targeted by racists puts city base...
10,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,"\n\n\nNews\n'God is not a killjoy, but if He ...",News; Teasers\n'The Gospel remains as relevant...


## text cleaning

In [36]:
def clean(text):
    text = re.sub("http\S+|www\S+|fb.com\S+|Sunday Life\S+", '', text)
    text = ''.join([char for char in text if char not in '–!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”·‘'])
    text = re.sub('\n',' ',text)
    text = re.sub('‹|›|„','',text)
    text = re.sub(r'\b\d+\b\s*$', '', text)
    text = re.sub(r'\b\w*\d+\w*\b', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'•\t', '', text)
    return text.strip()

In [37]:
df['cleaned_txt'] = df['raw_txt_l'].apply(clean)

In [38]:
df['len_text'] = df['cleaned_txt'].apply(lambda x: len(x.split()))

In [39]:
# sprawdzam jak długi jest tekst i czy czegoś nie ucięło, wyrywkowo
df['len_text'].describe()

count     742.000000
mean      875.175202
std       608.424868
min       107.000000
25%       481.000000
50%       707.000000
75%      1081.250000
max      5508.000000
Name: len_text, dtype: float64

In [40]:
df=df[df['len_text']>300]

In [58]:
di={'Sunday Life':'','News':'','The Irish News':'','ENGLISCH':'','The Irish Times':'','IRTI':'','Dokument':'','The Sunday Independent':'','FSII':'','Independent Newspapers Ireland Ltd':'',
    'SUNLIF':'','Independent News & Media (Northern Ireland)':'','Belfast Telegraph':'','WBEL':'','Englisch':'','©':'','Independent papers Ireland Ltd':''}
df['cleaned_txt'] = df['cleaned_txt'].replace(di, regex=True)

In [59]:
df=df.reset_index(drop=True)

In [65]:
df['cleaned_txt'][0]

' Saudi princes detained amid coup allegations Daniel Wallis Wörter März   National    SAUDI Arabia has detained three senior Saudi princes including the younger brother of King Salman and the kings nephew for allegedly planning a coup sources with knowledge of the matter said Crown Prince Mohammed bin Salman King Salmans son and de facto ruler of the worlds top oil exporter and key US ally has moved to consolidate power since ousting Mohammed bin Nayef as heir to the throne in a palace coup Later that year Crown Prince Mohammed bin Salman known as MbS arrested several royals and other prominent Saudis holding them for months at Riyadhs Ritz Carlton hotel in an anticorruption campaign that caused shockwaves at home and abroad Sources told reporters that MbS accused the princes of conducting contacts with foreign powers including the Americans and others to carry out a coup detat the regional source said With these arrests MbS consolidated his full grip on power Its over with this purge

In [66]:
possible_words = ['pandemic', 'COVID', 'covid','COVID-19','covid-19']

df = df[df['cleaned_txt'].str.contains('|'.join(possible_words))]

In [69]:
df=df[['f_name','cleaned_txt']]

In [70]:
df.to_csv('../data/clean/sec_irl_clean.csv')

In [67]:
df

Unnamed: 0,f_name,raw_txt,raw_txt_l,cleaned_txt,len_text
1,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,"News\nWe should learn a sense [...]\n\n1,192 W...",We should learn a sense Wörter März Nationa...,1167
3,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,News\nPERSONAL PRIVACY: SACRIFICED ON THE ALTA...,PERSONAL PRIVACY SACRIFICED ON THE ALTAR OF P...,1679
4,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,News\nA tale of two brothers United after 80 y...,A tale of two brothers United after years and...,1116
5,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,News\nThe 'selfless' doctor who always put oth...,The selfless doctor who always put others fir...,929
8,../data/sec_IRL/Sunday Independent/Factiva-202...,\n\n\nNews\nSaudi princes detained amid coup ...,News\nIt's very sad there's unrest in the worl...,Its very sad theres unrest in the world becau...,1711
...,...,...,...,...,...
684,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,\n\n\nFeatures\n'During the Covid -19 crisis...,News\nClergy in NI may fail Prince Philip's se...,Clergy in NI may fail Prince Philips sermon t...,1150
687,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,"\n\n\nNews\n'God is not a killjoy, but if He ...",News\nKey events that played a role in the phe...,Key events that played a role in the phenomen...,839
688,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,"\n\n\nNews\n'God is not a killjoy, but if He ...",News\nGroup targeted by racists puts city base...,Group targeted by racists puts city base up f...,368
689,../data/sec_IRL/Belfast Telegraph/Factiva-2023...,"\n\n\nNews\n'God is not a killjoy, but if He ...",News; Teasers\n'The Gospel remains as relevant...,Teasers The Gospel remains as relevant as eve...,1184
