# 1. Text Extraction from Corporate reports (PDFs)

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import fitz # PyMuPDF
import nltk
from nltk.tokenize import sent_tokenize
# Ensure NLTK 'punkt' resource is downloaded (needed before sent_tokenize is used below).
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' rather than
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/luca/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Define File Paths

In [2]:
# Directory that is scanned for the input PDF reports (passed to read_filelist).
path_pdf  = '../Data/Input/Reports'
# CSV file that receives the extracted sentences (passed to DataFrame.to_csv).
fname_out = '../Data/Output/CorpRepSentences.csv'

### Define Function: Count words (alphabetic tokens only)

In [3]:
def get_cnt(text):
    """Count the whitespace-separated tokens of `text` that are purely alphabetic.

    A token only counts when ``str.isalpha()`` is true for it, so numbers and
    tokens carrying punctuation (e.g. ``"2020"`` or ``"word,"``) are ignored.

    Parameters
    ----------
    text : str
        Text to inspect.

    Returns
    -------
    int
        Number of alphabetic tokens (0 for empty or whitespace-only text).
    """
    return sum(1 for token in text.split() if token.isalpha())

### Define Function: Get text from blocks

In [4]:
def get_Mytext(block_lst, min_word_cnt=10):
    """Join the usable text blocks of one page into a single string.

    Parameters
    ----------
    block_lst : iterable of sequences
        Block entries such as those returned by ``page.get_text('blocks')``.
        For every entry, index 4 is read as the block text and index 6 as the
        block type (0 = text).
    min_word_cnt : int, default 10
        Minimum number of alphabetic words (as counted by ``get_cnt``) a block
        must contain to be kept. A value of 0 or less disables the filter.

    Returns
    -------
    str
        The kept blocks joined with ``'\\n'``; an empty string when no block
        qualifies.
    """
    kept_blocks = []
    for block in block_lst:
        # block_type: 0 = text; anything else is skipped.
        if block[6] != 0:
            continue

        # PyMuPDF (fitz) workaround: 'fi ' --> 'fi', so a word split after
        # 'fi' is glued back together. Note that this also removes the space
        # after any word that genuinely ends in 'fi'.
        text = block[4].replace('fi ', 'fi')

        # Drop short blocks. The count is skipped entirely when the threshold
        # cannot reject anything.
        if min_word_cnt > 0 and get_cnt(text) < min_word_cnt:
            continue

        # Re-join words that were hyphenated across a line break.
        kept_blocks.append(text.replace('-\n', ''))

    return '\n'.join(kept_blocks)

### Define Function: Get sentences from PDF files

In [5]:
def get_sentence(fname, skip_page = (0,)):
    """Extract whitespace-normalised sentences from a PDF file.

    Parameters
    ----------
    fname : str
        Path of the PDF file, opened with ``fitz.open``.
    skip_page : container of int, default (0,)
        1-based page numbers to leave out. Pages are numbered from 1 here, so
        the default ``(0,)`` matches no page and nothing is skipped.

    Returns
    -------
    list of str
        Sentences in page order. Each sentence has every run of whitespace
        (including '\\n' and '\\t') collapsed to a single space and is stripped.
    """
    sent_lst = []

    # The context manager closes the document even when a page fails to
    # parse, so the file handle is never leaked.
    with fitz.open(fname) as doc:
        for page_no, page in enumerate(doc, start=1):

            # Skip page
            if page_no in skip_page:
                continue

            text = get_Mytext(page.get_text('blocks'))

            # Delete '\n', '\t' and strip
            sent_lst.extend(
                ' '.join(sentence.split()) for sentence in sent_tokenize(text)
            )

    return sent_lst

### Define Function: Generate document (DataFrame)

In [6]:
def gen_document(doc_id, fname, sent_lst):
    """Build the sentence table for a single document.

    Parameters
    ----------
    doc_id : scalar
        Document identifier, repeated on every row.
    fname : str
        File name of the document, repeated on every row.
    sent_lst : list of str
        Sentences of the document; one row is produced per sentence.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``doc_id``, ``fname`` and ``sentence``.
    """
    columns = dict(doc_id=doc_id, fname=fname, sentence=sent_lst)
    return pd.DataFrame(columns)

### Define Function: Read PDF file list

In [7]:
def read_filelist(path):
    """Extract the sentences of every PDF in `path` into one DataFrame.

    Directory entries are processed in sorted name order. Entries whose
    extension is not ``.pdf`` (compared case-insensitively) are skipped.

    Parameters
    ----------
    path : str
        Directory containing the PDF reports.

    Returns
    -------
    pandas.DataFrame
        The per-document frames from ``gen_document`` concatenated in
        processing order. Each document keeps its own row index. An empty
        DataFrame is returned when the directory holds no PDF file.
    """
    doc_frames = []

    # Iterate through the sorted file list so the doc_id assignment does not
    # depend on the order in which the OS lists the directory.
    for idx, fname in enumerate(sorted(os.listdir(path))):
        p_fname = os.path.join(path, fname)
        print('path + fname >>>', p_fname)

        # Check for the 'pdf' extension on the file name itself. This is
        # case-insensitive and unaffected by dots elsewhere in the path.
        if os.path.splitext(fname)[1].lower() != '.pdf':
            continue
        print('fname >>>',fname)

        # doc_id is the position in the full sorted listing, so skipped
        # non-PDF entries still consume a number. Ids are therefore not
        # guaranteed to be contiguous or to start at 0.
        doc_id = int(idx)

        print(f'doc_id = [{doc_id}], fname = [{fname}]')
        print('')

        sent_lst = get_sentence(p_fname)
        doc_frames.append(gen_document(doc_id, fname, sent_lst))

    # pd.concat raises on an empty list, so keep the "no PDFs" result explicit.
    if not doc_frames:
        return pd.DataFrame()

    # A single concat at the end avoids re-copying the accumulated frame on
    # every iteration.
    return pd.concat(doc_frames)

### Create DataFrame from PDF Files

In [8]:
%%time
# Build the combined sentence DataFrame from every PDF found under path_pdf.
df = read_filelist(path_pdf)
print('==== End of jobs ====')

path + fname >>> ../Data/Input/Reports/.DS_Store
path + fname >>> ../Data/Input/Reports/Asda_2020.pdf
fname >>> Asda_2020.pdf
doc_id = [1], fname = [Asda_2020.pdf]

path + fname >>> ../Data/Input/Reports/Asda_2021.pdf
fname >>> Asda_2021.pdf
doc_id = [2], fname = [Asda_2021.pdf]

path + fname >>> ../Data/Input/Reports/Asda_2022.pdf
fname >>> Asda_2022.pdf
doc_id = [3], fname = [Asda_2022.pdf]

path + fname >>> ../Data/Input/Reports/Asda_2023.pdf
fname >>> Asda_2023.pdf
doc_id = [4], fname = [Asda_2023.pdf]

path + fname >>> ../Data/Input/Reports/Ford_2014.pdf
fname >>> Ford_2014.pdf
doc_id = [5], fname = [Ford_2014.pdf]

path + fname >>> ../Data/Input/Reports/Ford_2015.pdf
fname >>> Ford_2015.pdf
doc_id = [6], fname = [Ford_2015.pdf]

MuPDF error: unsupported error: cannot create appearance stream for RichMedia annotations

MuPDF error: unsupported error: cannot create appearance stream for RichMedia annotations

MuPDF error: unsupported error: cannot create appearance stream for RichM

In [9]:
# Display the combined DataFrame (doc_id, fname, sentence) for a visual check.
df

Unnamed: 0,doc_id,fname,sentence
0,1,Asda_2020.pdf,Our action on sustainability supports the broa...
1,1,Asda_2020.pdf,"In particular, our efforts are contributing to..."
2,1,Asda_2020.pdf,"For example, our work to tackle food poverty i..."
3,1,Asda_2020.pdf,Our CCFB strategy covers every aspect of our b...
4,1,Asda_2020.pdf,It also covers International Procurement and L...
...,...,...,...
1464,84,Toyota_2023.pdf,Environmental Data [O] Remanufactured and Used...
1465,84,Toyota_2023.pdf,306-2 Management of significant waste-related ...
1466,84,Toyota_2023.pdf,407-1 Operations and suppliers in which the ri...
1467,84,Toyota_2023.pdf,416-1 Assessment of the health and safety impa...


### Export Data to CSV File

In [10]:
# Write the extracted sentences to fname_out; index=False leaves the row index out of the file.
df.to_csv(fname_out, index=False)

# End of file