# Preprocessing 

In [22]:
import pandas as pd
import glob
import os
from pathlib import Path
import re
import pickle
import string
import numpy as np
from striprtf.striprtf import rtf_to_text
from tqdm import tqdm
from PyPDF2 import PdfReader

## Loading files into the Data Frame

In [32]:
# Folder that holds the MacLeans issues as PDF files
folder_path = "../data/sec_CAN/MacLeans/"

# Parallel lists with one entry per PDF: file name, page count, list of page texts
file_names = []
num_pages = []
texts = []

# Only files directly inside the folder whose name ends in '.pdf' are considered
pdf_files = [filename for filename in os.listdir(folder_path) if filename.endswith('.pdf')]

# tqdm shows a progress bar while the PDFs are parsed
for filename in tqdm(pdf_files, desc="Processing PDFs"):
    file_path = os.path.join(folder_path, filename)

    # Read every page while the handle is open; the context manager then closes
    # the file instead of leaking one open handle per PDF.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in pdf_reader.pages]

    # Append to all three lists together so they always stay aligned
    file_names.append(filename)
    num_pages.append(len(page_texts))
    texts.append(page_texts)

# One row per PDF; 'Texts' holds the list of page texts of that PDF
df = pd.DataFrame({'File Name': file_names, 'Number of Pages': num_pages, 'Texts': texts})

Processing PDFs: 100%|████████████████████████████| 5/5 [00:52<00:00, 10.42s/it]


In [39]:
# Unpack each PDF's list of page texts into one row per page
tmp = df.explode(column='Texts')

In [40]:
# Inspect the exploded frame (one row per PDF page)
tmp

Unnamed: 0,File Name,Number of Pages,Texts
0,MacLeans October 2020-1.pdf,124,HOW NOVA SCOTIA \nFAMILIES FINALLY WON \nTHEIR...
0,MacLeans October 2020-1.pdf,124,
0,MacLeans October 2020-1.pdf,124,4 The Editorial |6 Letters |8 Columns Pam Pal...
0,MacLeans October 2020-1.pdf,124,EDITOR-IN-CHIEF Alison Uncles\nDEPUTY EDITO...
0,MacLeans October 2020-1.pdf,124,Read up.\nCanada’s magazine\nSINCE 1905Because...
...,...,...,...
4,MacLeans April 2021-6.pdf,100,SPECIAL ISSUE · YEAR ONE\nfaced the biggest na...
4,MacLeans April 2021-6.pdf,100,Julie Nolke has been experiencing the pandemic...
4,MacLeans April 2021-6.pdf,100,SPECIAL ISSUE · YEAR ONE\nnot political ones.”...
4,MacLeans April 2021-6.pdf,100,


In [41]:
# Keep only pages whose text mentions one of the pandemic-related keywords
possible_words = ['pandemic', 'COVID', 'covid','COVID-19','covid-19','virus']
covid_pattern = '|'.join(possible_words)
mentions_covid = tmp['Texts'].str.contains(covid_pattern)
tmp = tmp[mentions_covid]

In [42]:
# Of the remaining pages, keep only those that also mention a religion-related keyword
possible_words = ['Church','church','Evangelicals','evangelicals','Islam','islam','Jewish','jewish']
religion_pattern = '|'.join(possible_words)
mentions_religion = tmp['Texts'].str.contains(religion_pattern)
tmp = tmp[mentions_religion]

In [48]:
# The page count is no longer needed; keep the file name and the page text
pdf_columns = ['File Name', 'Texts']
tmp = tmp[pdf_columns]

In [51]:
# Align the column names with the RTF frame so both can be concatenated later.
# Assigning the result (instead of inplace=True) avoids mutating a slice of
# another frame, which is what raised the SettingWithCopyWarning.
tmp = tmp.rename(columns={'File Name':'f_name','Texts':'raw_txt_l'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp.rename(columns={'File Name':'f_name','Texts':'raw_txt'},inplace=True)


In [79]:
# Root directory that get_docx_files() walks (including subdirectories) to find RTF files
dir_path = "../data/sec_CAN/"

def get_docx_files(dir_path):
    """
    Collect the paths of all RTF files located under a directory tree.

    Parameters:
    dir_path (str): Root directory; it and all of its subdirectories are searched.

    Returns:
    list: One path (containing directory joined with the file name) for every
          file whose name ends in '.rtf'.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(dir_path)
        for name in names
        if name.endswith('.rtf')
    ]

# Get a list of RTF files in the specified directory and its subdirectories
docx_files = get_docx_files(dir_path)

# Rows of [file path without extension, extracted plain text]
data = []

# get_docx_files() already returns only '.rtf' paths, so no further filtering is needed
for filename in docx_files:
    # Read the raw bytes and decode them as UTF-8 (raises UnicodeDecodeError on other encodings)
    with open(filename, 'rb') as file:
        rtf_content = file.read().decode('utf-8')

    # Convert the RTF markup to plain text
    rtf_text = rtf_to_text(rtf_content)

    # splitext drops exactly the '.rtf' extension; the previous [:-5] slice also
    # cut the last character of the name, so different files could share a name
    data.append([os.path.splitext(filename)[0], rtf_text])

# Create a pandas DataFrame from the extracted data: one row per RTF file
df = pd.DataFrame(data, columns=['f_name', 'raw_txt'])

In [80]:
# Number of RTF files loaded (one row per file)
len(df)

25

In [81]:
# Split every document at runs of six newlines
# (presumably the separator between articles in the export - TODO confirm)
segment_sep = '\n\n\n\n\n\n'
df['raw_txt_l'] = [document.split(segment_sep) for document in df['raw_txt']]

In [82]:
# One row per text segment instead of one row per file
df = df.explode(column='raw_txt_l')

In [83]:
# Stack the RTF segments and the filtered PDF pages into a single frame
frames = [df, tmp]
df = pd.concat(frames)

In [84]:
# Drop the unsplit full text; keep the file name and the segment text
raw_columns = ['f_name', 'raw_txt_l']
df = df[raw_columns]

## Text cleaning

In [85]:
def clean(text):
    """
    Normalise a raw text segment.

    Steps, in order: remove URL-like tokens (and tokens starting with
    'Sunday Life'), strip the listed punctuation characters, turn newlines into
    spaces, drop angle-quote characters, remove every token that contains a
    digit, collapse whitespace runs and remove bullet+tab markers.

    Note: hyphens are stripped before digit-bearing tokens are removed, so a
    term such as 'COVID-19' becomes 'COVID19' and is then deleted entirely.

    Parameters:
    text (str): Raw text.

    Returns:
    str: Cleaned text without leading or trailing whitespace.
    """
    punctuation = '–!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”·‘'

    # Raw string: '\S' in a normal string literal is an invalid escape sequence
    text = re.sub(r"http\S+|www\S+|fb.com\S+|Sunday Life\S+", '', text)
    # Delete every punctuation character in a single pass
    text = text.translate(str.maketrans('', '', punctuation))
    text = text.replace('\n', ' ')
    text = re.sub('‹|›|„', '', text)
    # Trailing number at the very end of the text
    text = re.sub(r'\b\d+\b\s*$', '', text)
    # Any token containing at least one digit
    text = re.sub(r'\b\w*\d+\w*\b', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    # Collapse the gaps left by the removals above
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'•\t', '', text)
    return text.strip()

In [86]:
# Run the cleaning function over every raw text segment
df['cleaned_txt'] = df['raw_txt_l'].map(clean)

In [87]:
# Length of each cleaned segment in whitespace-separated tokens
df['len_text'] = df['cleaned_txt'].map(lambda segment: len(segment.split()))

In [88]:
# Summary statistics of the per-segment word counts
df['len_text'].describe()

count     2343.000000
mean      1900.397354
std       2314.562214
min         99.000000
25%        970.000000
50%       1320.000000
75%       1937.500000
max      19707.000000
Name: len_text, dtype: float64

In [89]:
# Discard segments of 300 words or fewer
is_long_enough = df['len_text'].gt(300)
df = df[is_long_enough]

In [90]:
# Rows remaining after the length filter
len(df)

2316

In [91]:
# Boilerplate fragments (language labels, copyright and publisher notices) mapped to ''
# so that they are removed from the cleaned text.
# NOTE(review): with regex=True the keys are presumably treated as regular expressions,
# so the '.' in 'The Globe and Mail Inc.' would match any character, and 'News' is
# removed wherever it occurs, including inside longer words - confirm this is intended.
di={'Englisch':'','©':'','ENGLISCH':'','GMBN':'','The Globe and Mail Inc.':'', 'All Rights Reserved':'','News':'',' National Post':''}
df['cleaned_txt'] = df['cleaned_txt'].replace(di, regex=True)

In [92]:
# Replace the duplicated index left over from explode/concat with a fresh 0..n-1 index
df = df.reset_index(drop=True)

In [93]:
# Spot-check the first cleaned text
df.loc[0, 'cleaned_txt']

'SEOpinion HDJesus Christs rebel roots have been long forgotten by the Wests Christian right BYBy MICHAEL COREN Wörter Oktober SNThe Globe and Mail SCGLOB EDOntario LA CY  LPOPINION Athor of books including his newly published The Rebel Christ Earlier this month the Sunday Gospel reading in Anglican and Roman Catholic Churches the world over was one thats known even to atheists It is easier for a camel to go through the eye of a needle than for someone who is rich to enter the kingdom of God TDTheres context to this of course as there always is with any ancient writing but at its core is the essence of Jesus Here was the son of a firstcentury Jewish carpenter living in an occupied land as the friend of the marginalized rejected and poor criticizing as he so often did the powerful legalistic and materialistic Its a line that is part of his broader preaching on communality and human equality with demands that were entirely revolutionary In short he was the rebel Christ This might surpris

In [94]:
possible_words = ['pandemic', 'COVID', 'covid','COVID-19','covid-19','virus']

# Match the keywords against the RAW text, not the cleaned text: clean() strips
# '-' and then deletes every token containing a digit, so 'COVID-19' no longer
# exists in 'cleaned_txt' and texts mentioning only that form would be dropped.
# This cell must run before the 'raw_txt_l' column is discarded.
covid_pattern = '|'.join(possible_words)
mentions_covid = df['raw_txt_l'].str.contains(covid_pattern)
df = df[mentions_covid]

In [95]:
# Keep only the columns that are exported: source file name and cleaned text
export_columns = ['f_name', 'cleaned_txt']
df = df[export_columns]

In [97]:
# Persist the cleaned corpus for the downstream notebooks
df.to_csv(path_or_buf='../data/clean/sec_can_clean.csv')

In [96]:
# Final frame: source file name and cleaned text of every retained segment
df

Unnamed: 0,f_name,cleaned_txt
0,../data/sec_CAN/Globe and Mail Canada/Factiva-...,SEOpinion HDJesus Christs rebel roots have bee...
1,../data/sec_CAN/Globe and Mail Canada/Factiva-...,SEPursuits HDOriginal kin BYBy Ian Brown CRSta...
3,../data/sec_CAN/Globe and Mail Canada/Factiva-...,SESports HDBIRTH AND DEATH NOTICES Wörter Okto...
4,../data/sec_CAN/Globe and Mail Canada/Factiva-...,SELife Arts HDWHEN DAD AND I BOTH HAD CANCER B...
6,../data/sec_CAN/Globe and Mail Canada/Factiva-...,HDCouncillor Jyoti Gondek wins mayoral race in...
...,...,...
2311,MacLeans january 2021.pdf,there skyscrapers skyscrape there T rump we co...
2312,MacLeans january 2021.pdf,ence from restrictions—particularly when it ke...
2313,MacLeans April 2021-6.pdf,SPECIAL ISSUE YEAR ONE people were wrong think...
2314,MacLeans April 2021-6.pdf,SPECIAL ISSUE YEAR ONE public health expertise...
