# Overview

This notebook downloads the chosen PDFs from a website and converts them into TXT files.

## Libraries

In [4]:
# Importing required libraries
!pip install pdfminer.six
!pip install requests beautifulsoup4
!pip install selenium

import requests
from bs4 import BeautifulSoup
import urllib.request
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re

Collecting pdfminer.six
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
                                              0.0/5.6 MB ? eta -:--:--
     --------                                 1.2/5.6 MB 26.1 MB/s eta 0:00:01
     ---------------------                    3.0/5.6 MB 31.8 MB/s eta 0:00:01
     ------------------------------           4.3/5.6 MB 34.5 MB/s eta 0:00:01
     ---------------------------------------  5.6/5.6 MB 35.9 MB/s eta 0:00:01
     ---------------------------------------- 5.6/5.6 MB 30.0 MB/s eta 0:00:00
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20221105


## Downloading the PDFs

In [2]:
url = 'http://ri-vale.mz-sites.com/informacoes-para-o-mercado/relatorios-anuais/relatorios-de-sustentabilidade/'

# Set up the Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in background without opening a browser window
driver = webdriver.Chrome(options=options)

# Scan every table row and keep (title, href) for rows whose title mentions
# "relato integrado". The try/finally guarantees the browser is shut down even
# if navigation or element lookup raises.
documents = []
try:
    # Navigate to the webpage
    driver.get(url)

    # Wait for the page to load (adjust the sleep time as necessary)
    time.sleep(5)

    for row in driver.find_elements(By.CSS_SELECTOR, 'tr'):
        # find_elements (plural) returns an empty list instead of raising when
        # a row has no matching cell, so such rows are skipped instead of
        # aborting the whole scrape.
        title_elements = row.find_elements(By.CSS_SELECTOR, 'td.title span')
        if not title_elements:
            continue
        title_text = title_elements[0].text
        if "relato integrado" not in title_text.lower():
            continue
        link_elements = row.find_elements(By.CSS_SELECTOR, 'td.download a[href]')
        if not link_elements:
            continue
        documents.append((title_text, link_elements[0].get_attribute('href')))
finally:
    # Close the browser window
    driver.quit()

print(documents)

# Download the PDFs from the hrefs
download_directory = r'C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2'
os.makedirs(download_directory, exist_ok=True)

for title, href in documents:
    # Create a valid filename from the title by stripping characters that are
    # not allowed in Windows file names
    filename = re.sub(r'[\\/:"*?<>|]+', '', title) + '.pdf'
    file_path = os.path.join(download_directory, filename)

    try:
        # The timeout keeps a stalled server from hanging the cell forever; the
        # context manager releases the streamed connection when done.
        with requests.get(href, stream=True, timeout=60) as response:
            if response.status_code != 200:
                print(f'Failed to download from {href}. Status code: {response.status_code}')
                continue
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except requests.RequestException as exc:
        # Report and move on so one bad link does not abort the remaining
        # downloads. NOTE: a partially written file may be left behind if the
        # error happened mid-stream.
        print(f'Failed to download from {href}. Error: {exc}')
        continue
    print(f'Downloaded: {file_path}')

[('Relato Integrado 2022', 'https://api.mziq.com/mzfilemanager/v2/d/53207d1c-63b4-48f1-96b7-19869fae19fe/c5492e73-1aa8-251e-a6c4-c66cdb474f9f?origin=1'), ('Relato Integrado 2021', 'https://api.mziq.com/mzfilemanager/v2/d/53207d1c-63b4-48f1-96b7-19869fae19fe/565d6188-78a5-44f2-a97d-10000e022116?origin=1'), ('Relato Integrado 2020', 'https://api.mziq.com/mzfilemanager/v2/d/53207d1c-63b4-48f1-96b7-19869fae19fe/30a864aa-8ea1-4176-b69f-976c138fcdaa?origin=1')]
Downloaded: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2022.pdf
Downloaded: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2021.pdf
Downloaded: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2020.pdf


## Transforming the PDFs into TXT 

In [6]:
import os
from pdfminer.high_level import extract_text

download_directory = r'C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2'

# Convert every PDF in the download directory to a UTF-8 TXT file written next
# to it. The PDF files themselves are left in place by this cell.
pdf_suffix = '.pdf'
for filename in os.listdir(download_directory):
    if not filename.endswith(pdf_suffix):
        continue

    pdf_path = os.path.join(download_directory, filename)
    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier '.pdf' occurring inside the file name.
    txt_filename = filename[:-len(pdf_suffix)] + '.txt'
    txt_path = os.path.join(download_directory, txt_filename)

    # Convert PDF to TXT
    text = extract_text(pdf_path)
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
    print(f'Converted: {pdf_path} to {txt_path}')

Converted: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2020.pdf to C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2020.txt
Converted: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2021.pdf to C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2021.txt
Converted: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2022.pdf to C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2022.txt


## Erasing the PDFs

In [12]:
import os

download_directory = r'C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2'

# Delete every file in the download directory whose name ends in '.pdf',
# printing each path after it has been removed.
pdf_names = [entry for entry in os.listdir(download_directory) if entry.endswith('.pdf')]
for pdf_name in pdf_names:
    target = os.path.join(download_directory, pdf_name)
    os.remove(target)
    print(f'Removed: {target}')


Removed: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2020.pdf
Removed: C:\Users\phabi\Desktop\MESTRADO\FGV\Materias\4 Quarto\AI Bisness\Downloaded_PDFs\Teste2\Relato Integrado 2021.pdf
