# 2. Text Extraction from SDG reports (PDFs)

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import fitz # PyMuPDF
import nltk
from nltk.tokenize import sent_tokenize
# Ensure NLTK 'punkt' resource is downloaded
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/luca/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Define File Paths

In [2]:
# Folder containing the input SDG report PDFs (passed to the directory walker and the reader below)
path_pdf  = '../Data/Input/SDGs'
# CSV file that receives the extracted sentences
fname_out = '../Data/Output/SdgReportSentences.csv'

### Define Function: Count words (alphabetic characters only)

In [3]:
def get_cnt(text):
    """Return the number of purely alphabetic tokens in `text`.

    The text is split on whitespace; a token counts only when
    `str.isalpha()` is true for it, so tokens carrying digits,
    punctuation or hyphens (e.g. "2030", "end.") are ignored.
    """
    return sum(1 for token in text.split() if token.isalpha())

### Define Function: Get text from blocks

In [4]:
def get_Mytext(block_lst):
    """Join the usable text blocks of one page into a single string.

    Each element of `block_lst` is indexed positionally: index 6 is the
    block type (0 = text) and index 4 is the block's text. Non-text blocks
    and blocks with fewer than 10 alphabetic words (as counted by
    `get_cnt`) are dropped. The surviving blocks are joined with newlines.
    """
    min_words = 10  # blocks with fewer alphabetic words are discarded

    kept = []
    for entry in block_lst:
        is_text_block = entry[6] == 0  # block_type: 0 = text
        if not is_text_block:
            continue

        # PyMuPDF (fitz) workaround kept from the original: 'fi ' --> 'fi'
        content = entry[4].replace('fi ', 'fi')

        if get_cnt(content) >= min_words:
            # Re-join words that were hyphenated across a line break.
            kept.append(content.replace('-\n', ''))

    return '\n'.join(kept)

### Define Function: Get sentences from PDF files

In [5]:
def get_sentence(fname, skip_page = (0,)):
    """Extract the sentences of a PDF file as a flat list of strings.

    Parameters
    ----------
    fname : str
        Path of the PDF file, opened with `fitz.open`.
    skip_page : container of int
        1-based page numbers to skip. Pages are numbered from 1 here, so
        the default `(0,)` skips nothing.

    Returns
    -------
    list of str
        Sentences in page order, each with its internal whitespace
        (newlines, tabs, repeated spaces) collapsed to single spaces.
    """
    sent_lst = []

    doc = fitz.open(fname)
    try:
        for page_no, page in enumerate(doc, start=1):
            if page_no in skip_page:
                continue

            # 'blocks' returns one entry per layout block; get_Mytext filters
            # and joins them into the page's text.
            text = get_Mytext(page.get_text('blocks'))

            # Collapse '\n', '\t' and repeated spaces inside each sentence.
            sent_lst.extend(
                ' '.join(sentence.split()) for sentence in sent_tokenize(text)
            )
    finally:
        # Always release the document, even if extraction or tokenisation fails.
        doc.close()

    return sent_lst

### Define Function: Generate document (DataFrame)

In [6]:
def gen_document(doc_id, fname, sent_lst):
    """Build a per-document DataFrame with one row per sentence.

    The scalar `doc_id` and `fname` are repeated on every row; the
    `sentence` column holds the items of `sent_lst` in order.
    """
    columns = dict(doc_id=doc_id, fname=fname, sentence=sent_lst)
    return pd.DataFrame(columns)

In [7]:
def iterate_directories(top_directory):
    """Print every directory found under `top_directory`, itself included.

    Walks the tree with `os.walk` and prints one
    'Current directory: <path>' line per directory. Files are not
    reported and nothing is returned.
    """
    # The original also built a path for every file but never used it;
    # that dead inner loop is dropped, so the printed output is unchanged.
    for root, _dirs, _files in os.walk(top_directory):
        print(f'Current directory: {root}')

In [8]:
# List the directory tree under the input folder
iterate_directories(path_pdf)

Current directory: ../Data/Input/SDGs
Current directory: ../Data/Input/SDGs/03_Equity
Current directory: ../Data/Input/SDGs/04_Social Development
Current directory: ../Data/Input/SDGs/05_Resources
Current directory: ../Data/Input/SDGs/06_Environments
Current directory: ../Data/Input/SDGs/01_Life
Current directory: ../Data/Input/SDGs/02_Economic and Technological Development


### Define Function: Read PDF file list

In [None]:
def read_filelist(path):
    """Extract sentences from every PDF under `path` into one DataFrame.

    Parameters
    ----------
    path : str
        Top directory to scan. Subdirectories are scanned too.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-document frames from `gen_document`
        (columns 'doc_id', 'fname', 'sentence'), or an empty DataFrame
        when no PDF is found. `doc_id` is the file's position in the
        sorted listing of all files; `fname` is its path relative to `path`.

    Notes
    -----
    The original body iterated over an undefined name `file_list`; the
    sorted listing is now built here. `os.walk` yields nothing for a
    missing directory, so a wrong `path` also gives an empty DataFrame.
    """
    # Sorted relative paths of all files below `path`, so that doc_id
    # assignment is reproducible between runs.
    file_list = sorted(
        os.path.relpath(os.path.join(root, name), path)
        for root, _dirs, files in os.walk(path)
        for name in files
    )

    frames = []
    for idx, fname in enumerate(file_list):
        p_fname = os.path.join(path, fname)
        print('path + fname >>>', p_fname)

        # Keep only PDFs; compare the real extension, case-insensitively.
        if os.path.splitext(fname)[1].lower() != '.pdf':
            continue
        print('fname >>>',fname)

        doc_id = idx

        print(f'doc_id = [{doc_id}], fname = [{fname}]')
        print('')

        sent_lst = get_sentence(p_fname)
        frames.append(gen_document(doc_id, fname, sent_lst))

    # Concatenate once at the end instead of growing a DataFrame per file.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

### Create DataFrame from PDF Files

In [None]:
%%time
# Build the sentence table from the input folder; the %%time cell magic above reports the run time
df = read_filelist(path_pdf)
print('==== End of jobs ====')

In [None]:
# Display the resulting DataFrame
df

### Export Data to CSV File

In [None]:
# Write the sentences to the output CSV, omitting the DataFrame index column
df.to_csv(fname_out, index=False)

In [None]:
# End of file