# 01_Extract Text (from PDF files)

### Import Libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
import os
from nltk.tokenize import sent_tokenize

In [None]:
import fitz

### Define File Paths

In [None]:
# Directory that is scanned for the input PDF files
path_pdf  = 'data_pdf/reports'
# CSV file the extracted sentences are exported to
fname_out = 'data/sentences.csv'

### Define Function: Get count of words (alphabetic words only)

In [None]:
def get_cnt(text):
    """Return how many whitespace-separated tokens of `text` are purely alphabetic.

    A token is counted only when `str.isalpha()` holds for it, so tokens that
    contain digits or punctuation (e.g. "2020", "end.") are left out.
    """
    alpha_tokens = [token for token in text.split() if token.isalpha()]
    return len(alpha_tokens)

### Define Function: Get text from blocks

In [None]:
def get_text(block_lst, min_word_cnt=10):
    """Join the text of the sufficiently long text blocks in `block_lst`.

    Args:
        block_lst: Iterable of block tuples (as returned by PyMuPDF's
            'blocks' text extraction). Index 4 is read as the block text
            and index 6 as the block type (0 = text).
        min_word_cnt: Minimum number of alphabetic words (counted with
            `get_cnt`) a block needs in order to be kept. Defaults to 10,
            the threshold that used to be hard-coded.

    Returns:
        The kept block texts joined by newlines; an empty string when no
        block qualifies.
    """
    text_lst = []
    for block in block_lst:
        # block_type: 0 = text; every other block type is ignored
        if block[6] != 0:
            continue

        # PyMuPDF(fitz) bug fix: 'fi ' --> 'fi'
        text = block[4].replace('fi ', 'fi')

        # Drop blocks (not sentences) with fewer than min_word_cnt
        # alphabetic words; the count is taken before hyphens are removed.
        if get_cnt(text) < min_word_cnt:
            continue

        # Re-join words that were hyphenated across a line break
        text_lst.append(text.replace('-\n', ''))

    return '\n'.join(text_lst)

### Define Function: Get sentences from PDF files

In [None]:
def get_sentence(fname, skip_page=(0,)):
    """Extract the sentences of a PDF file as a flat list of strings.

    Args:
        fname: Path of the PDF file to open with PyMuPDF.
        skip_page: Container of 1-based page numbers to skip. The default
            `(0,)` never matches a page, so no page is skipped.

    Returns:
        List of sentences in page order. Inside each sentence every run of
        whitespace (including newlines and tabs) is collapsed to a single
        space.
    """
    sent_lst = []

    doc = fitz.open(fname)
    try:
        for page_no, page in enumerate(doc, start=1):

            # Skip page
            if page_no in skip_page:
                continue

            # Newer PyMuPDF releases name this method `get_text`; older ones
            # only provide the camelCase `getText`. Support both.
            extract = getattr(page, 'get_text', None) or page.getText
            text = get_text(extract('blocks'))

            for sentence in sent_tokenize(text):
                # Delete '\n', '\t' and strip
                sent_lst.append(' '.join(sentence.split()))
    finally:
        # Release the document even if extraction or tokenization fails
        doc.close()

    return sent_lst

### Define Function: Generate document (DataFrame)

In [None]:
def gen_document(doc_id, fname, sent_lst):
    """Build a DataFrame with one row per sentence of a single document.

    `doc_id` and `fname` are repeated on every row; `sent_lst` supplies the
    'sentence' column. The columns are 'doc_id', 'fname' and 'sentence'.
    """
    columns = {
        'doc_id': doc_id,
        'fname': fname,
        'sentence': sent_lst,
    }
    return pd.DataFrame(columns)

### Define Function: Read PDF file list

In [None]:
def read_filelist(path):
    """Extract the sentences of every PDF file found directly in `path`.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A DataFrame with the rows produced by `gen_document` for each PDF,
        concatenated in sorted file-name order. Each document keeps its own
        0-based row index, so index values repeat across documents. An empty
        DataFrame is returned when the directory holds no PDF file.

    Notes:
        `doc_id` is the position of the file in the sorted directory
        listing, counting non-PDF entries as well.
    """
    frames = []

    # Read file list (directory). os.listdir() returns entries in arbitrary
    # order, so sort them to make the doc_id assignment reproducible.
    for idx, fname in enumerate(sorted(os.listdir(path))):
        p_fname = os.path.join(path, fname)
        print('path + fname >>>', p_fname)

        # Only regular files with a (case-insensitive) '.pdf' extension
        is_pdf = os.path.splitext(fname)[1].lower() == '.pdf'
        if not (is_pdf and os.path.isfile(p_fname)):
            continue
        print('fname >>>',fname)

        doc_id = idx

        print(f'doc_id = [{doc_id}], fname = [{fname}]')
        print('')

        sent_lst = get_sentence(p_fname)
        frames.append(gen_document(doc_id, fname, sent_lst))

    # pd.concat() rejects an empty list, and concatenating once avoids
    # re-copying the accumulated frame for every file.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

### Create DataFrame from PDF Files

In [None]:
%%time
# Build the sentence table from every PDF under `path_pdf`; the %%time cell
# magic (which must stay on the first line of the cell) reports the run time.
df = read_filelist(path_pdf)
print('==== End of jobs ====')

In [None]:
# Display the extracted sentences (columns: doc_id, fname, sentence)
df

### Export Data to CSV File

In [None]:
# Create the output directory first: to_csv() does not create missing
# parent directories and would fail on a fresh checkout.
out_dir = os.path.dirname(fname_out)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write the sentences without the (repeating) row index
df.to_csv(fname_out, index=False)

---

In [None]:
# End of file