# Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from PyPDF2 import PdfFileReader, PdfFileMerger, PdfFileWriter
import PyPDF2
from bs4 import BeautifulSoup
import os
import shutil
from urllib.parse import urljoin
from urllib.request import Request, urlopen
from io import StringIO, BytesIO
import cloudinary.uploader
import json

# Finding the right document(s) on the websites

Since the Sources Website ```https://clientebancario.bportugal.pt/precarios``` doesn't provide enough information on the interest rates on the banks (Parte II of the document), there is another approach to get the prices directly from the bank's homepages. 

## setting up Bank dictionary with manual input

In [10]:
# init banks
banks = {}

In [11]:
## Building an initial dictionary which is maintained in the backoffice of the app via input field.
def input_urls(banks, name_bank, url):
    if 'http' in url:
        pass
    else:
        url = 'https://' + url
    url = url.strip("/")
    banks[name_bank] = {'url':url}
    return banks

In [12]:
# saving bank urls
bankdict = {'abanca':'https://www.abanca.pt',
'bic':'https://www.bancobic.ao',
'ctt': 'www.bancoctt.pt',
'bankinter':'https://www.bankinter.pt',
# 'bai':'https://www.bancobai.ao'
    }
## quickly filling up the banks // later it will be done by user input
for k, v in bankdict.items():
    input_urls(banks, k, v)


In [13]:
banks

{'abanca': {'url': 'https://www.abanca.pt'},
 'bic': {'url': 'https://www.bancobic.ao'},
 'ctt': {'url': 'https://www.bancoctt.pt'},
 'bankinter': {'url': 'https://www.bankinter.pt'}}

## Finding the 'prices' page on the website

Once the rails app has provided our API with the website of the bank, we can start looking for the pricelists of the bank on their website.

In [6]:
def find_price_pages(banks=banks, search=['preçário', 'pricelist', 'precario']):    
    search = [x.lower() for x in search]
    for k, v in banks.items():
        url = v.get('url')
        print(f'parsing url: {url}')
        # only look for the pt page
        headers = {'Accept-Language': 'pt-PT'}
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        if r.status_code == 200:
            # going through every link on the page to see if 'precarios' is in the link
            for link in soup.find_all('a', href=True):
                url_prices = str(link.get('href').lower().strip())
                lower = str(link.string).lower().strip()
                title = str(link.get('title')).strip().lower()
                searchstring = ' '.join([url_prices, lower, title])
                if any([x in searchstring for x in search]):
                    print(f'found terms of {search} in string {searchstring}')
                    # some links in the source code are relative, some are absolute -- using urljoin
                    v['pricelist_url'] = urljoin(url,url_prices)
                    print(f'added link to banks: {urljoin(url,url_prices)}')
        else:
            print(f'could not reach page: {url}')
    return banks

In [7]:
banks = find_price_pages()
# ubdates the banks dict with the pricelist urls of the banks

parsing url: https://www.abanca.pt
found terms of ['preçário', 'pricelist', 'precario'] in string /pt/precario/ preçário none
added link to banks: https://www.abanca.pt/pt/precario/
parsing url: https://www.bancobic.ao
found terms of ['preçário', 'pricelist', 'precario'] in string /inicio/precario preçário none
added link to banks: https://www.bancobic.ao/inicio/precario
parsing url: https://www.bancoctt.pt
found terms of ['preçário', 'pricelist', 'precario'] in string https://www.bancoctt.pt/application/themes/pdfs/precario.pdf?language_id=1555597541833 preçário none
added link to banks: https://www.bancoctt.pt/application/themes/pdfs/precario.pdf?language_id=1555597541833
parsing url: https://www.bankinter.pt
found terms of ['preçário', 'pricelist', 'precario'] in string https://www.bankinter.pt/precario1 none preçário
added link to banks: https://www.bankinter.pt/precario1
parsing url: https://www.bancobai.ao
found terms of ['preçário', 'pricelist', 'precario'] in string /pt/preçári

In [8]:
# banks

## doing stuff with the sources

To download all PDFs we need to find their urls on the prices page on the websites

In [8]:
## todo: checkout PDF miner

def get_pdf_urls(banks=banks):
    for val in banks.values():
        url = val.get('pricelist_url')
        val['pdfs'] = list()
        # some banks direclty link to a pdf address
        if '.pdf' in url:
            print(f'url is already pdf for: {url}')
            val['pdfs'].append(f'{url}')
        # for other landing pages look for every pdf on page
        else:
            r = requests.get(url)
#             print(type(val['pdfs']))
            if r.status_code == 200:
                soup = BeautifulSoup(r.content, 'html.parser')
                print(f'looking for pdfs in: {url}')
                for link in soup.find_all('a', href=True):
                    if '.pdf' in link.get('href'):
                        # some pdf links are absolute links
                        ## tru out r.url
                        pdf = urljoin(val.get('url'),link.get('href'))
                        print(f'found and added pdf: {pdf}')
                        val['pdfs'].append(f'{pdf}')
            else:
                print(f'could not execute parsing for: {url}')
    return banks

In [9]:
banks = get_pdf_urls()
# adds all pdf urls to the banks dict

looking for pdfs in: https://www.abanca.pt/pt/precario/
found and added pdf: https://www.abanca.pt/files/documents/precario-folheto-comissoes-823a7663.pdf
found and added pdf: https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf
found and added pdf: https://www.abanca.pt/files/documents/precariodevaloresmobiliarios-23c714a2.pdf
found and added pdf: https://www.abanca.pt/files/documents/glossario-6479e89f.pdf
found and added pdf: https://www.abanca.pt/files/docs/leaflet-your-rights-payments-eu.pdf
looking for pdfs in: https://www.bancobic.ao/inicio/precario
found and added pdf: https://www.bancobic.ao/dotAsset/0f07d8a4-4ec3-4111-a453-c4ee14d5c66c.pdf
found and added pdf: https://www.bancobic.ao/dotAsset/7e1e6dd2-9d56-43ef-b398-ac3bac3cc70e.pdf
url is already pdf for: https://www.bancoctt.pt/application/themes/pdfs/precario.pdf?language_id=1555597541833
looking for pdfs in: https://www.bankinter.pt/precario1
found and added pdf: https://www.bankinter.pt/documents

In [11]:
# banks

Now we can download all the pdfs, merge them and save them in one file. Later they will be handed over to the scraping script. If you run this scrpit, make sure, you have a ```raw_data``` folder in the package.

In [12]:
def store_pdfs(banks=banks, #secrets missing):
    for key, val in banks.items():
            print(f'handling pdfs from {key}')
            pdfs = val.get('pdfs')
            len_pdfs = len(pdfs)
            merger = PdfFileMerger()
            for pdf in pdfs:
                print(f'looking for: {pdf}')
                # some files are not available => throws a 404
                try:
                    remoteFile = urlopen(Request(pdf)).read()
                except:
                    break
                # storing pdf in virtual memory as an object
                memoryFile = BytesIO(remoteFile)
                pdfFile = PdfFileReader(memoryFile)

                # checking for encryption
                if pdfFile.isEncrypted:
                    try:
                        pdfFile = pdfFile.decrypt('')
                        print(f'{pdf} - File Decrypted (with PyPDF2)')
                        merger.append(pdfFile)
                        print(f'appended file to merger with decrypt: {pdf}') 
                        
                    except:
                        print(f'no method found to deccrypt: {pdf}')
#     #                 except:
    #                     command = ("cp "+ pdf +
    #                         " temp.pdf; qpdf --password='' --decrypt temp.pdf " + pdf
    #                         + "; rm temp.pdf")
    #                     os.system(command)
    #                     print(f'{pdf} - File Decrypted (with qpdf)')
    #                     merger.append(memoryFile)
    #                     print(f'appended file to merger with qpdf decrypt: {pdf}')
                    
                else:
                    merger.append(pdfFile)
                    print(f'appended non encrypted file to merger: {pdf}') 
                        

            # saving file on cloudinary
            filename = f'../raw_data/{key}_all_prices.pdf'
            merger.write(filename)
            
            try:
                cloudinary.uploader.upload(filename, 
                  use_filename = 1, 
                  unique_filename = 0, 
                  folder = "pdfs", 
                  public_id = f'{key}_all_prices.pdf', 
                  api_key = api,
                  api_secret = secret,
                  cloud_name = cloud_name)

    #             filename = f'{key}_all_prices.pdf'
    #             merger.write(filename)
                cloud_dir = f'https://res.cloudinary.com/do59ghg7e/image/upload/v1615834323/pdfs/{key}_all_prices.pdf'
                print(f'saved merged file to: {cloud_dir}')
            except:
                print(f'could not upload file to cloudinary: {filename}')

            # adding address to banks
            val['cloud_pdf'] = cloud_dir

    return banks 

In [13]:
banks = store_pdfs()

handling pdfs from abanca
looking for: https://www.abanca.pt/files/documents/precario-folheto-comissoes-823a7663.pdf
no method found to deccrypt: https://www.abanca.pt/files/documents/precario-folheto-comissoes-823a7663.pdf
looking for: https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf
no method found to deccrypt: https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf
looking for: https://www.abanca.pt/files/documents/precariodevaloresmobiliarios-23c714a2.pdf
looking for: https://www.abanca.pt/files/documents/glossario-6479e89f.pdf
saved merged file to: couldinary
handling pdfs from bic
looking for: https://www.bancobic.ao/dotAsset/0f07d8a4-4ec3-4111-a453-c4ee14d5c66c.pdf
looking for: https://www.bancobic.ao/dotAsset/7e1e6dd2-9d56-43ef-b398-ac3bac3cc70e.pdf
saved merged file to: couldinary
handling pdfs from ctt
looking for: https://www.bancoctt.pt/application/themes/pdfs/precario.pdf?language_id=1555597541833
saved merged file to: cou



looking for: https://banco.bankinter.pt/particulares/pdfs/precario/informacao_geral.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/informacao_complementar.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_contas_deposito.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_operacoes_credito.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cartoes_credito_debito.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cheques.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_transferencias.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cobrancas.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_prestacao_servicos.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_contas_deposito.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_operacoes_credito.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cartoes_credito_debito.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cheques.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_transferencias.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cobrancas.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_prestacao_servicos.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_operacoes_estrangeiro.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_ftj_contas_deposito.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/p_ftj_operacoes_credito.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_ftj_contas_deposito.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/o_ftj_operacoes_credito.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/ptfs_a.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/ptfs_b.pdf
looking for: https://banco.bankinter.pt/particulares/pdfs/precario/ptfs_c.pdf




looking for: https://banco.bankinter.pt/particulares/pdfs/precario/ptfs_d.pdf






could not upload file to cloudinary: ../raw_data/bankinter_all_prices.pdf
handling pdfs from bai
looking for: https://www.bancobai.ao/media/2988/bai_pre_clientes-particulares_31-03-2021.pdf
looking for: https://www.bancobai.ao/media/2989/bai_pre_outros-clientes_31-03-2021.pdf
looking for: https://www.bancobai.ao/media/2823/bai_resumo-das-alteracoes-do-precario-bna-8-02-2021-cleaned.pdf
looking for: https://www.bancobai.ao/media/2278/termos-e-condições.pdf
saved merged file to: couldinary


In [14]:
with open('../raw_data/banks.json', 'w') as fp:
    json.dump(banks, fp)

In [19]:
banks

{'abanca': {'url': 'https://www.abanca.pt',
  'pricelist_url': 'https://www.abanca.pt/pt/precario/',
  'pdfs': ['https://www.abanca.pt/files/documents/precario-folheto-comissoes-823a7663.pdf',
   'https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf',
   'https://www.abanca.pt/files/documents/precariodevaloresmobiliarios-23c714a2.pdf',
   'https://www.abanca.pt/files/documents/glossario-6479e89f.pdf',
   'https://www.abanca.pt/files/docs/leaflet-your-rights-payments-eu.pdf'],
  'cloud_pdf': 'https://res.cloudinary.com/do59ghg7e/image/upload/v1615834323/pdfs/abanca_all_prices.pdf'},
 'bic': {'url': 'https://www.bancobic.ao',
  'pricelist_url': 'https://www.bancobic.ao/inicio/precario',
  'pdfs': ['https://www.bancobic.ao/dotAsset/0f07d8a4-4ec3-4111-a453-c4ee14d5c66c.pdf',
   'https://www.bancobic.ao/dotAsset/7e1e6dd2-9d56-43ef-b398-ac3bac3cc70e.pdf'],
  'cloud_pdf': 'https://res.cloudinary.com/do59ghg7e/image/upload/v1615834323/pdfs/bic_all_prices.pdf'},
 'ctt':

***The following part is not needed at the moment***

# reacting to requests

In [16]:
# key is bank id
    banks = {'1':{'url':"https://www.abanca.pt", 
                    "name":"ABANCA Corporación Bancaria, S.A.", 
                    "num_pdfs":"3", 
                    "last_updated":"210401", 
                    "sum_sizes":"40", 
                    "bp_bank_id":"0170"}
             ,
              '2':{'url':"https://www.abanca.pt", 
                    "name":"ABANCA Corporación Bancaria, S.A.", 
                    "num_pdfs":"3", 
                    "last_updated":"210401", 
                    "sum_sizes":"40", 
                    "bp_bank_id":"0170"}
              ,
              '3':{'url':"https://www.abanca.pt", 
                    "name":"ABANCA Corporación Bancaria, S.A.", 
                    "num_pdfs":"3", 
                    "last_updated":"210401", 
                    "sum_sizes":"40", 
                    "bp_bank_id":"0170"}
              }

In [17]:
for bank_id, vals in banks.items():
    print(bank_id)

1
2
3


# Downloading Files manually

In [15]:
bank_pdfs = {'BAI':{'id_pb':'0008',
                'url':'https://www.bancobaieuropa.pt/',
                'prices':['https://www.bancobai.ao/media/2870/precario_clientes-particulares_08_02_2021.pdf',
                         'https://www.bancobai.ao/media/2871/precario_outros-clientes_08_02_2021.pdf']
               },
       'ABANCA':{'id_pb':'0170',
                'url':'https://www.abanca.pt',
                'prices':['https://www.abanca.pt/files/documents/precario-folheto-comissoes-a6b9545f.pdf',
                         'https://www.abanca.pt/files/documents/folheto-taxa-juro-precario-23836e90.pdf'
                         ]
                },
       'BIC':{'id_pb':'0079',
                'url':'https://www.bancobic.ao',
                'prices':['https://www.bancobic.ao/dotAsset/0f07d8a4-4ec3-4111-a453-c4ee14d5c66c.pdf',
                         'https://www.bancobic.ao/dotAsset/7e1e6dd2-9d56-43ef-b398-ac3bac3cc70e.pdf'
                         ]
             },
       'CTT':{'id_pb':'0193',
                'url':'https://www.bancoctt.pt',
                'prices':['https://www.bancoctt.pt/application/themes/pdfs/precario.pdf'
                         ]
             },
       'BANKINTER':{'id_pb':'0269',
                'url':'https://www.bankinter.pt',
                'prices':[# Parte I
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_contas_deposito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_operacoes_credito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cartoes_credito_debito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cheques.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_transferencias.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_cobrancas.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/p_fcd_prestacao_servicos.pdf',
                         ## Parte II
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_contas_deposito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_operacoes_credito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cartoes_credito_debito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cheques.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_transferencias.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_cobrancas.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_prestacao_servicos.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_fcd_operacoes_estrangeiro.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_ftj_contas_deposito.pdf',
                         'https://banco.bankinter.pt/particulares/pdfs/precario/o_ftj_operacoes_credito.pdf',
                        ]
                   }
        }

In [16]:
## not further investigated

# old scraping form banco de portugal

Every PDF file has the same name structure ```'https://clientebancario.bportugal.pt/sites/default/files/precario/[BANK_ID]_/[BANK_ID]_PRE.pdf'```
We can use requests to load and save the pdf files locally.
Banks have a unique ID which are stored in the ```banks``` dictionary.

In [17]:
## Defining url prefix and suffix
url_pre = 'https://clientebancario.bportugal.pt/sites/default/files/precario/'
url_suff = '_PRE.pdf'

In [18]:
# scraping the latest files from the bankpage and saving it to ```raw_data```
for k, v in banks.items():
    r = requests.get(f'{url_pre}{v}_/{v}{url_suff}')
    with open(f'../raw_data/{k}.pdf', 'wb') as f:
        f.write(response.content)
    

NameError: name 'response' is not defined

The files can be inspected by the PyPDF2 library.

In [None]:
# example path
path = '../raw_data/CTT.pdf'

In [None]:
def text_extractor(path):
    fp = open(path, 'rb')
    pdfFile = PdfFileReader(fp)
    if pdfFile.isEncrypted:
        try:
            pdfFile = pdfFile.decrypt('')
            print('File Decrypted (PyPDF2)')
        except:
            command = ("cp "+ path +
                " temp.pdf; qpdf --password='' --decrypt temp.pdf " + path
                + "; rm temp.pdf")
            os.system(command)
            print('File Decrypted (qpdf)')
            fp = open(path)
            print(fp)
            pdfFile = PdfFileReader(fp)
    else:
        print('File Not Encrypted')
    
    pdfFile = PdfFileReader(fp)
#     meta = pdfFile.getXmpMetadata()
#     print(meta)
#     mode = pdfFile.getFields()
#     print(mode)
    page = pdfFile.getPage(4)
    text = page.extractText()
    return text.split("\n")


In [None]:
df = pd.read_html('https://en.wikipedia.org/wiki/Minnesota')

In [None]:
PDFFile = open(path,'rb')

PDF = PyPDF2.PdfFileReader(PDFFile)
pages = PDF.getNumPages()
key = '/Annots'
uri = '/URI'
ank = '/A'

for page in range(pages):
    pageSliced = PDF.getPage(page)
    pageObject = pageSliced.getObject()

    if pageObject.has_key(key):
        ann = pageObject[key]
        for a in ann:
            u = a.getObject()
            if u[ank].has_key(uri):
                print(u[ank][uri])