# Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from PyPDF2 import PdfFileReader, PdfFileMerger
import PyPDF2
from bs4 import BeautifulSoup
import os
import shutil

***There are several approaches to parsing all the information out of the banks' websites. The following sections assess them to find the best one.***

# Finding the right document(s) on the websites

Since the source website ```https://clientebancario.bportugal.pt/precarios``` doesn't provide enough information on the banks' interest rates (Parte II of the document), another approach is used: getting the prices directly from the banks' homepages.

## Setting up the bank dictionary with manual input

In [3]:
# Registry of banks, keyed by a short bank name; starts out empty.
banks = {}

In [4]:
def input_urls(banks, name_bank, url):
    """Register a bank's homepage URL in the ``banks`` dictionary.

    Builds the initial dictionary that is meant to be maintained in the
    back office of the app via an input field, so the URL is normalised
    before it is stored.

    Args:
        banks: Dictionary to update in place, keyed by bank name.
        name_bank: Key under which the bank is stored.
        url: Homepage address, with or without a scheme.

    Returns:
        The same ``banks`` dictionary, with ``banks[name_bank]`` replaced by
        ``{'url': <normalised url>}``.
    """
    # Clean up first, so the scheme test below looks at the real start of the
    # address and no stray slash ends up between scheme and host.
    url = url.strip().strip("/")
    # Test for an actual scheme prefix; a plain substring test would also match
    # hosts that merely contain the letters "http".
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url
    banks[name_bank] = {'url': url}
    return banks

In [5]:
# Homepage of every bank to process; an entry may omit the scheme.
bankdict = dict(
    abanca='https://www.abanca.pt',
    bic='https://www.bancobic.ao',
    ctt='www.bancoctt.pt',
    bankinter='https://www.bankinter.pt',
    bai='https://www.bancobai.ao',
)
# Stand-in for the later user input: register each bank right away.
for bank_name in bankdict:
    input_urls(banks, bank_name, bankdict[bank_name])


In [6]:
# Display the registry (notebook cell output).
banks

{'abaca': {'url': 'https://www.abanca.pt'},
 'bic': {'url': 'https://www.bancobic.ao'},
 'ctt': {'url': 'https://www.bancoctt.pt'},
 'bankinter': {'url': 'https://www.bankinter.pt'},
 'bai': {'url': 'https://www.bancobai.ao'}}

## Finding the 'prices' page on the website

In [7]:
def find_price_pages(banks=banks, search='preçário', timeout=30):
    """Locate each bank's price-list page by scanning the links on its homepage.

    For every entry in ``banks`` the page at ``entry['url']`` is fetched and
    each ``<a href=...>`` is inspected. A link matches when its text or its
    ``title`` attribute equals ``search`` (case-insensitive, surrounding
    whitespace ignored). The matching target is stored under
    ``entry['pricelist_url']`` as an absolute URL.

    Args:
        banks: Dictionary of bank entries, updated in place. The default is
            the module-level ``banks`` object as it exists when this function
            is defined.
        search: Link text / title to look for.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The same ``banks`` dictionary.
    """
    # Imported locally so the function brings its own dependency along.
    from urllib.parse import urljoin

    search = search.lower()
    for v in banks.values():
        url = v.get('url')
        print(f'parsing url: {url}')
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable bank must not abort the scan of the others.
            print(f'could not reach page: {url} ({err})')
            continue
        if r.status_code != 200:
            print(f'could not reach page: {url}')
            continue
        soup = BeautifulSoup(r.content, 'html.parser')
        # going through every link on the page to see if it is the price list
        for link in soup.find_all('a', href=True):
            lower = str(link.string).lower().strip()
            title = str(link.get('title')).strip().lower()
            if lower != search and title != search:
                continue
            print(f'found search in link name: {lower}')
            print(f'found search in title: {title}')
            # Links may be absolute, root-relative or page-relative; urljoin
            # resolves all of them, where plain concatenation only handles
            # the root-relative form.
            target = urljoin(url, link.get('href'))
            print(f'adding link to banks: {target}')
            # When several links match, the last one on the page wins.
            v['pricelist_url'] = target
    return banks

In [8]:
# Add a 'pricelist_url' entry to every bank whose homepage links to the price list.
banks = find_price_pages()

parsing url: https://www.abanca.pt
found search in link name: preçário
found search in title: none
adding link to banks: https://www.abanca.pt/pt/precario/
parsing url: https://www.bancobic.ao
found search in link name: preçário
found search in title: none
adding link to banks: https://www.bancobic.ao/inicio/precario
parsing url: https://www.bancoctt.pt
found search in link name: preçário
found search in title: none
adding link to banks: https://www.bancoctt.pt/application/themes/pdfs/precario.pdf?language_id=1555597541833
parsing url: https://www.bankinter.pt
found search in link name: none
found search in title: preçário
adding link to banks: https://www.bankinter.pt/precario1
parsing url: https://www.bancobai.ao
found search in link name: preçário
found search in title: none
adding link to banks: https://www.bancobai.ao/pt/preçário


## Downloading all pdfs from the source

In [9]:
def get_pdf_urls(banks=banks, timeout=30):
    """Collect the PDF links of every bank's price-list page.

    Each entry gets a fresh ``'pdfs'`` list. If ``entry['pricelist_url']``
    already points to a PDF, it is the only item. Otherwise the page is
    fetched and every ``<a href=...>`` whose target contains ``.pdf`` is
    added as an absolute URL.

    Args:
        banks: Dictionary of bank entries, updated in place. The default is
            the module-level ``banks`` object as it exists when this function
            is defined.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The same ``banks`` dictionary.
    """
    # Imported locally so the function brings its own dependency along.
    from urllib.parse import urljoin

    for val in banks.values():
        url = val.get('pricelist_url')
        val['pdfs'] = []
        # No price-list page was found for this bank: leave the list empty
        # instead of failing on the membership test below.
        if not url:
            print(f"no price list page known for: {val.get('url')}")
            continue
        # some banks directly link to a pdf address
        if '.pdf' in url:
            print(f'url is already pdf for: {url}')
            val['pdfs'].append(url)
            continue
        # for other landing pages look for every pdf on the page
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            print(f'could not execute parsing for: {url} ({err})')
            continue
        if r.status_code != 200:
            print(f'could not execute parsing for: {url}')
            continue
        soup = BeautifulSoup(r.content, 'html.parser')
        print(f'looking for pdfs in: {url}')
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if '.pdf' not in href:
                continue
            # Resolve against the page the link was found on; absolute links
            # pass through unchanged.
            pdf = urljoin(url, href)
            print(f'found and added pdf: {pdf}')
            val['pdfs'].append(pdf)
    return banks

In [10]:
# Collect the PDF links for every bank into its 'pdfs' list.
banks = get_pdf_urls()

looking for pdfs in: https://www.abanca.pt/pt/precario/
found and added pdf: https://www.abanca.pt/files/documents/precario-folheto-comissoes-823a7663.pdf
found and added pdf: https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf
found and added pdf: https://www.abanca.pt/files/documents/precariodevaloresmobiliarios-23c714a2.pdf
found and added pdf: https://www.abanca.pt/files/documents/glossario-6479e89f.pdf
found and added pdf: https://www.abanca.pt/files/docs/leaflet-your-rights-payments-eu.pdf
looking for pdfs in: https://www.bancobic.ao/inicio/precario
found and added pdf: https://www.bancobic.ao/dotAsset/0f07d8a4-4ec3-4111-a453-c4ee14d5c66c.pdf
found and added pdf: https://www.bancobic.ao/dotAsset/7e1e6dd2-9d56-43ef-b398-ac3bac3cc70e.pdf
url is already pdf for: https://www.bancoctt.pt/application/themes/pdfs/precario.pdf?language_id=1555597541833
looking for pdfs in: https://www.bankinter.pt/precario1
found and added pdf: https://www.bankinter.pt/documents

In [11]:
# banks

In [16]:
def _decrypt_with_qpdf(pdf_path):
    """Rewrite ``pdf_path`` in place as a decrypted copy using the ``qpdf`` CLI.

    Returns:
        True when qpdf exited with 0 (clean) or 3 (warnings, output written),
        False when it failed or could not be started.
    """
    import subprocess

    # qpdf cannot read and write the same file, so work from a copy.
    scratch = f'{pdf_path}.encrypted'
    shutil.copy(pdf_path, scratch)
    try:
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed through untouched.
        result = subprocess.run(
            ['qpdf', '--password=', '--decrypt', scratch, pdf_path],
            check=False,
        )
    except OSError as err:
        print(f'qpdf could not be run: {err}')
        return False
    finally:
        os.remove(scratch)
    return result.returncode in (0, 3)


def download_pdfs(banks=banks, output_folder='../raw_data/', timeout=30):
    """Download every bank's PDFs and merge them into one file per bank.

    For each entry, the URLs in ``entry['pdfs']`` are saved to
    ``<output_folder><key>_temp/<key><n>.pdf``. Encrypted files are unlocked
    with an empty password, and the files are concatenated in download order
    into ``<output_folder><key>_pricelists.pdf``.

    Args:
        banks: Dictionary of bank entries. The default is the module-level
            ``banks`` object as it exists when this function is defined.
        output_folder: Folder receiving the temp folders and merged files.
        timeout: Seconds to wait for each HTTP request.
    """
    for key, val in banks.items():
        print(f'handling pdfs from {key}')
        pdfs = val.get('pdfs') or []
        temp_dir = os.path.join(output_folder, f'{key}_temp')
        print(f'opening temp folder {temp_dir}')
        # Start from an empty folder; makedirs also creates a missing output folder.
        shutil.rmtree(temp_dir, ignore_errors=True)
        os.makedirs(temp_dir, exist_ok=True)
        print(f'downloading all pdfs from list {pdfs}')
        # Remember the files in download order: sorting the names would put
        # "<key>10.pdf" before "<key>2.pdf".
        pdf_files = []
        for count, url in enumerate(pdfs):
            try:
                r = requests.get(url, timeout=timeout)
            except requests.RequestException as err:
                print(f'could not download pdf: {url} ({err})')
                continue
            if r.status_code != 200:
                print(f'could not download pdf: {url}')
                continue
            target = f'{temp_dir}/{key}{count}.pdf'
            with open(target, 'wb') as f:
                f.write(r.content)  # saving
            pdf_files.append(target)
        print('starting the pdf merger')
        print(f'found these downloaded files: {pdf_files}')

        # decrypting all files
        for file in pdf_files:
            with open(file, 'rb') as fh:
                reader = PdfFileReader(fh)
                if not reader.isEncrypted:
                    continue
                try:
                    # decrypt() returns 0 when the password is rejected.
                    unlocked = reader.decrypt('') != 0
                except Exception:
                    # e.g. an encryption scheme PyPDF2 does not implement
                    unlocked = False
            if unlocked:
                # Only a probe: nothing is written back to disk here.
                # NOTE(review): confirm the merger copes with such files.
                print(f'{file} - File Decrypted (with PyPDF2)')
            elif _decrypt_with_qpdf(file):
                print(f'{file} - File Decrypted (with qpdf)')
            else:
                print(f'{file} - could not be decrypted')

        print(f'merging all files for {key}')
        filename = os.path.join(output_folder, f'{key}_pricelists.pdf')
        merger = PdfFileMerger()
        try:
            # Passing paths lets the merger open the inputs itself and close
            # them again in close().
            for file in pdf_files:
                merger.append(file)
            merger.write(filename)
        finally:
            merger.close()
        print(f'saved concat pdf to {filename}')

In [17]:
# Download every collected PDF and write one merged file per bank (default folder '../raw_data/').
download_pdfs()

handling pdfs from abaca
opening temp folder ../raw_data/abaca_temp
downloading all pdfs from list ['https://www.abanca.pt/files/documents/precario-folheto-comissoes-823a7663.pdf', 'https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf', 'https://www.abanca.pt/files/documents/precariodevaloresmobiliarios-23c714a2.pdf', 'https://www.abanca.pt/files/documents/glossario-6479e89f.pdf', 'https://www.abanca.pt/files/docs/leaflet-your-rights-payments-eu.pdf']
could not download pdf: https://www.abanca.pt/files/documents/glossario-6479e89f.pdf
starting the init pdf merger
found these downloaded files: ['../raw_data/abaca_temp/abaca2.pdf', '../raw_data/abaca_temp/abaca1.pdf', '../raw_data/abaca_temp/abaca4.pdf', '../raw_data/abaca_temp/abaca0.pdf']
../raw_data/abaca_temp/abaca1.pdf - File Decrypted (with qpdf)




../raw_data/abaca_temp/abaca0.pdf - File Decrypted (with qpdf)
merging all files for abaca
saved concat pdf to ../raw_data/abaca_pricelists.pdf
handling pdfs from bic
opening temp folder ../raw_data/bic_temp
downloading all pdfs from list ['https://www.bancobic.ao/dotAsset/0f07d8a4-4ec3-4111-a453-c4ee14d5c66c.pdf', 'https://www.bancobic.ao/dotAsset/7e1e6dd2-9d56-43ef-b398-ac3bac3cc70e.pdf']
starting the init pdf merger
found these downloaded files: ['../raw_data/bic_temp/bic0.pdf', '../raw_data/bic_temp/bic1.pdf']
merging all files for bic
saved concat pdf to ../raw_data/bic_pricelists.pdf
handling pdfs from ctt
opening temp folder ../raw_data/ctt_temp
downloading all pdfs from list ['https://www.bancoctt.pt/application/themes/pdfs/precario.pdf?language_id=1555597541833']
starting the init pdf merger
found these downloaded files: ['../raw_data/ctt_temp/ctt0.pdf']
merging all files for ctt
saved concat pdf to ../raw_data/ctt_pricelists.pdf
handling pdfs from bankinter
opening temp folde







saved concat pdf to ../raw_data/bankinter_pricelists.pdf
handling pdfs from bai
opening temp folder ../raw_data/bai_temp
downloading all pdfs from list ['https://www.bancobai.ao/media/2870/precario_clientes-particulares_08_02_2021.pdf', 'https://www.bancobai.ao/media/2871/precario_outros-clientes_08_02_2021.pdf', 'https://www.bancobai.ao/media/2823/bai_resumo-das-alteracoes-do-precario-bna-8-02-2021-cleaned.pdf', 'https://www.bancobai.ao/media/2278/termos-e-condições.pdf']
starting the init pdf merger
found these downloaded files: ['../raw_data/bai_temp/bai2.pdf', '../raw_data/bai_temp/bai3.pdf', '../raw_data/bai_temp/bai0.pdf', '../raw_data/bai_temp/bai1.pdf']
merging all files for bai
saved concat pdf to ../raw_data/bai_pricelists.pdf


# Downloading Files manually

In [14]:
# Hand-maintained list of price-list PDFs: for each bank an 'id_pb' code, the
# homepage 'url' and the PDF links under 'prices'.
# presumably 'id_pb' is the Banco de Portugal bank ID — TODO confirm
# NOTE(review): for BAI and BANKINTER the 'prices' links are on a different
# host than 'url' — confirm this is intended.
bank_pdfs = {'BAI':{'id_pb':'0008',
                'url':'https://www.bancobaieuropa.pt/',
                'prices':['https://www.bancobai.ao/media/2870/precario_clientes-particulares_08_02_2021.pdf',
                         'https://www.bancobai.ao/media/2871/precario_outros-clientes_08_02_2021.pdf']
               },
       'ABANCA':{'id_pb':'0170',
                'url':'https://www.abanca.pt',
                'prices':['https://www.abanca.pt/files/documents/precario-folheto-comissoes-a6b9545f.pdf',
                         'https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf'
                         ]
                },
       'BIC':{'id_pb':'0079',
                'url':'https://www.bancobic.ao',
                'prices':['https://www.bancobic.ao/dotAsset/0f07d8a4-4ec3-4111-a453-c4ee14d5c66c.pdf',
                         'https://www.bancobic.ao/dotAsset/7e1e6dd2-9d56-43ef-b398-ac3bac3cc70e.pdf'
                         ]
             },
       'CTT':{'id_pb':'0193',
                'url':'https://www.bancoctt.pt',
                'prices':['https://www.bancoctt.pt/application/themes/pdfs/precario.pdf'
                         ]
             },
       'BANKINTER':{'id_pb':'0269',
                'url':'https://www.bankinter.pt',
                'prices':[# Part I
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_contas_deposito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_operacoes_credito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cartoes_credito_debito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cheques.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_transferencias.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cobrancas.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_prestacao_servicos.pdf',
                         # Part II
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_contas_deposito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_operacoes_credito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cartoes_credito_debito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cheques.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_transferencias.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cobrancas.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_prestacao_servicos.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_operacoes_estrangeiro.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_ftj_contas_deposito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_ftj_operacoes_credito.pdf',
                        ]
                   }
        }

In [15]:
## not further investigated

# Old scraping from Banco de Portugal

Every PDF file has the same name structure: ```'https://clientebancario.bportugal.pt/sites/default/files/precario/[BANK_ID]_/[BANK_ID]_PRE.pdf'```
We can use requests to load the PDF files and save them locally.
Each bank has a unique ID, which is stored in the ```banks``` dictionary.

In [None]:
# Fixed parts of the download URL; the code below builds
# <url_pre><id>_/<id><url_suff> from them.
url_pre = 'https://clientebancario.bportugal.pt/sites/default/files/precario/'
url_suff = '_PRE.pdf'

In [None]:
# Scrape the latest file of every bank and save it to ``raw_data``.
for k, v in banks.items():
    # An entry is either the bare bank ID or a dict carrying it under 'id_pb'.
    bank_id = v.get('id_pb') if isinstance(v, dict) else v
    if not bank_id:
        print(f'no bank id known for: {k}')
        continue
    r = requests.get(f'{url_pre}{bank_id}_/{bank_id}{url_suff}')
    # Do not store an error page under a .pdf name.
    if r.status_code != 200:
        print(f'could not download pdf for: {k}')
        continue
    with open(f'../raw_data/{k}.pdf', 'wb') as f:
        # The response object is ``r``; the previous name was undefined.
        f.write(r.content)
    

The files can be inspected by the PyPDF2 library.

In [None]:
# Example file used by the PDF inspection cells below.
path = '../raw_data/CTT.pdf'

In [None]:
def text_extractor(path, page_number=4):
    """Extract the text of one page of a PDF as a list of lines.

    An encrypted file is first unlocked with an empty password through
    PyPDF2. If that does not work, the file at ``path`` is rewritten in place
    as a decrypted copy by the ``qpdf`` command-line tool and read again.

    Args:
        path: Location of the PDF file.
        page_number: Zero-based index of the page to read.

    Returns:
        The extracted page text split on newlines.
    """
    import subprocess

    fp = open(path, 'rb')
    try:
        pdfFile = PdfFileReader(fp)
        if pdfFile.isEncrypted:
            try:
                # decrypt() unlocks this reader and returns 0 when the
                # password is rejected; the reader itself must be kept.
                unlocked = pdfFile.decrypt('') != 0
            except Exception:
                # e.g. an encryption scheme PyPDF2 does not implement
                unlocked = False
            if unlocked:
                print('File Decrypted (PyPDF2)')
            else:
                fp.close()
                # qpdf cannot read and write the same file, so work from a copy.
                scratch = f'{path}.encrypted'
                shutil.copy(path, scratch)
                try:
                    result = subprocess.run(
                        ['qpdf', '--password=', '--decrypt', scratch, path],
                        check=False,
                    )
                except OSError as err:
                    print(f'qpdf could not be run: {err}')
                else:
                    # 0 = clean, 3 = warnings but output written
                    if result.returncode in (0, 3):
                        print('File Decrypted (qpdf)')
                    else:
                        print(f'qpdf failed with exit code {result.returncode}')
                finally:
                    os.remove(scratch)
                # PDFs are binary: reopen in 'rb' and read the rewritten file.
                fp = open(path, 'rb')
                pdfFile = PdfFileReader(fp)
        else:
            print('File Not Encrypted')

        page = pdfFile.getPage(page_number)
        text = page.extractText()
        return text.split("\n")
    finally:
        fp.close()


In [None]:
# pd.read_html returns a list of DataFrames, one per table found on the page.
# NOTE(review): this cell looks unrelated to the price-list scraping — confirm it is still needed.
df = pd.read_html('https://en.wikipedia.org/wiki/Minnesota')

In [None]:
# Print the URI of every annotation in the PDF at ``path`` that has a URI action.
key = '/Annots'
uri = '/URI'
ank = '/A'

with open(path, 'rb') as PDFFile:
    PDF = PyPDF2.PdfFileReader(PDFFile)
    pages = PDF.getNumPages()

    for page in range(pages):
        pageSliced = PDF.getPage(page)
        pageObject = pageSliced.getObject()

        # ``in`` replaces the Python 2 only ``has_key``.
        if key not in pageObject:
            continue
        ann = pageObject[key]
        for a in ann:
            u = a.getObject()
            # Not every annotation has an action entry; skip those without one.
            if ank in u and uri in u[ank]:
                print(u[ank][uri])