# Test data reading

In [None]:
import boto3
import re
from io import BytesIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from docx import Document
from pptx import Presentation

In [None]:
# Name of the s3 bucket that the files are read from
bucket_name = 'alpha-app-hr-policy'
# Key prefix (folder) within the bucket that is searched for files
fpath = 'gdd_capability/gdd_capability_pay/'

In [None]:

def get_file_list(bucket_name, prefix='docs/'):
    """
    List the keys of the objects stored under a prefix of an s3 bucket.

    Arguments:
     - bucket_name: name of the s3 bucket to look in.
     - prefix: key prefix (i.e. folder) within the bucket; it is passed
               to the s3 listing as the `Prefix` filter.

    Returns:
    A list with the key (file path) of every object returned by the
    listing. No filtering by file type is applied here.

    """

    bucket = boto3.resource('s3').Bucket(bucket_name)
    matching_objects = bucket.objects.filter(Prefix=prefix)

    return [summary.key for summary in matching_objects]

In [None]:
# List the key of every object stored under the chosen prefix
file_list = get_file_list(bucket_name, prefix=fpath)

In [None]:
# Display the keys that were found
file_list

In [None]:
# Keep only the keys that end in '.pdf'. The dot is escaped so that it
# matches a literal '.' rather than any character (an unescaped '.pdf$'
# would also accept a key ending in e.g. 'xpdf').
pdf_list = [file for file in file_list if re.search(r'\.pdf$', file)]
pdf_list

In [None]:
# Keep only the keys that end in '.doc' or '.docx'. The dot is escaped so
# that it matches a literal '.' rather than any character.
docx_list = [file for file in file_list if re.search(r'\.docx?$', file)]
docx_list

In [None]:
# Keep only the keys that end in '.ppt' or '.pptx'. The dot is escaped so
# that it matches a literal '.' rather than any character.
ppt_list = [file for file in file_list if re.search(r'\.pptx?$', file)]
ppt_list

## PDF reading

In [None]:
# Download the first PDF in the list from s3 into memory
obj = boto3.resource('s3').Object(bucket_name, pdf_list[0])
fs = obj.get()['Body'].read()

# Define some objects required by the pdf reader
laparams = LAParams()
rsrcmgr = PDFResourceManager()
retstr = BytesIO()  # buffer that the converter writes the extracted text to
device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# try/finally so that the output buffer and the converter are closed even
# if parsing the PDF fails part-way through
try:
    with BytesIO(fs) as f:

        # Define generator from which pages can be read
        page_gen = PDFPage.get_pages(f)

        # Extract the text of the second page only (index 1). This raises
        # an IndexError if the PDF has fewer than two pages.
        interpreter.process_page(list(page_gen)[1])
        data = retstr.getvalue()
        body = data.decode('utf-8')

        # NOTE: character sequences that the pdf reader introduces when it
        # can't interpret a punctuation mark are left as they are; no
        # replacement step is applied here.

        # Replace every run of whitespace with a single space
        body = re.sub(r'\s+', ' ', body.strip())
finally:
    retstr.close()
    device.close()

In [None]:
# Show the whitespace-normalised text extracted from the PDF page
print(body)

## Read Word doc

In [None]:
# Pick the third entry of docx_list and download its bytes from s3
fname = docx_list[2]
obj = boto3.resource('s3').Object(bucket_name, fname)
fs = obj.get()['Body'].read()

# Parse the downloaded bytes from an in-memory buffer
with BytesIO(fs) as f:
    document = Document(f)

# Print the key of the file that was loaded
print(fname)

In [None]:
# Preview the first ten paragraphs: each paragraph's text on one line,
# followed by the name of its style on the next
all_paras = document.paragraphs
for para in all_paras[:10]:
    print(para.text, para.style.name, sep='\n')

### Word doc with a table

In [None]:
# Key of a specific Word document to load; download its bytes from s3
fname = 'gdd_capability/gdd_capability_pay/Learning and Development FAQs.docx'
obj = boto3.resource('s3').Object(bucket_name, fname)
fs = obj.get()['Body'].read()

# Parse the downloaded bytes from an in-memory buffer
with BytesIO(fs) as f:
    document = Document(f)

# Print the key of the file that was loaded
print(fname)

In [None]:
# Preview the first ten paragraphs: each paragraph's text on one line,
# followed by the name of its style on the next
all_paras = document.paragraphs
for para in all_paras[:10]:
    print(para.text, para.style.name, sep='\n')

In [None]:
# Get the tables contained in the document
tables = document.tables

In [None]:
# Print each row of the first table as a single line: the non-empty cells
# are joined with ' - ', and rows without any text are skipped
for row in tables[0].rows:
    text = ' - '.join(cell.text for cell in row.cells if cell.text)
    if text:
        print(text)

## Read PowerPoint

In [None]:
# Key of the PowerPoint file to read. The commented-out line would instead
# take the first entry of ppt_list.
# fname = ppt_list[0]
fname = 'gdd_capability/gdd_capability_pay/pay-and-allowances-manual.pptx'

In [None]:
# Download the presentation from s3 and parse it from an in-memory buffer
obj = boto3.resource('s3').Object(bucket_name, fname)
fs = obj.get()['Body'].read()

with BytesIO(fs) as f:
    ppt = Presentation(f)

# text_runs is populated with a list of strings, one for each text run
# read from the slides. The first slide is skipped below, so its runs are
# not included; table cell text is not included either.
text_runs = []

for ii, slide in enumerate(ppt.slides):
    # Skip the first slide, since it's just a title slide
    if ii == 0:
        continue

    slide_text = []
    for shape in slide.shapes:
        if shape.has_text_frame:
            for para in shape.text_frame.paragraphs:
                run_texts = [run.text for run in para.runs]
                # Record the raw runs so that text_runs is actually filled
                text_runs.extend(run_texts)
                para_text = ' '.join(run_texts)
                # Collapse runs of whitespace into single spaces
                para_text = re.sub(r'\s+', ' ', para_text).strip()
                slide_text.append(para_text)

        # Iterate over any tables: one line per row, non-empty cells
        # joined with ' - '
        if shape.has_table:
            for row in shape.table.rows:
                table_text = ' - '.join(cell.text for cell in row.cells if cell.text)
                if table_text:
                    slide_text.append(table_text)

    slide_text = '\n'.join(slide_text)
    if slide_text == '':
        # Skip slides with no text
        continue
    elif re.search(r'^contents|\ncontents$', slide_text.lower()):
        # Skip contents page slides: text that starts with "contents" or
        # whose last line is "contents"
        continue
    elif len(slide_text.split(' ')) <= 8:
        # Skip what is likely to be a title slide
        continue
    else:
        print("----------------------")
        print(f"Slide number: {ii+1}")
        print(slide_text)

In [None]:
# Display the contents of text_runs
text_runs