## Obtenção de dados

Scraping do repositório da UFPE

### Requirements

In [40]:
import requests
from bs4 import BeautifulSoup
import json
import os
from pathlib import Path

### Funções Auxiliares

In [41]:
def read_page_get_links(keywords: str) -> list:
    """
    Search the UFPE repository and return the links found on the first results page.

    Example query:

    "machine learning"

    Args:
        keywords: Free-text search terms.

    Returns:
        List of ``href`` values taken from the first ``<table>`` of the results
        page (site-relative paths such as ``/handle/123456789/29048``). An empty
        list is returned when the page contains no table.

    Raises:
        requests.HTTPError: If the repository answers with an error status.
        requests.RequestException: On connection failures or timeout.
    """
    # Let requests build the query string: it encodes spaces as "+" and "/" as
    # "%2F" exactly like the hand-written URL did, and additionally escapes
    # characters such as "&", "#" or "+" that would otherwise corrupt the query.
    response = requests.get(
        "https://repositorio.ufpe.br/simple-search",
        params={"location": "123456789/50", "query": keywords},
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text.strip(), 'html.parser')
    tables = soup.find_all(name='table')
    if not tables:
        # No table on the page: nothing to extract.
        return []

    anchors = tables[0].find_all(name='a')

    # Only every other anchor is kept, as in the original implementation;
    # presumably each result row holds two links and the first one points to
    # the item page -- TODO confirm against the live markup.
    # TODO (kept from the original author): handle a query that returns a single result.
    return [anchor["href"] for anchor in anchors[::2]]

In [42]:
def read_table_get_title_and_abstract(info_table) -> dict:
    """
    Extract the title and the abstract from an item's HTML metadata table.

    Args:
        info_table: Parsed HTML table (a BeautifulSoup tag) holding the item
            metadata rows.

    Returns:
        Dict with the keys ``'title'`` and ``'abstract'``. Each value is the
        complete text of the last ``<td>`` carrying the ``dc_title`` /
        ``dc_description_abstract`` class, or ``None`` when the table has no
        such cell.
    """
    def _last_cell_text(css_class):
        # Return the whole text of the last <td> with this class, or None.
        cells = info_table.find_all(name='td', attrs={'class': css_class})
        if not cells:
            return None
        # The label cell and the value cell of a row share the field class, so
        # the value is the last match (assumed to hold for the abstract row as
        # well). get_text() gathers every text node of the cell, so a value
        # containing inline markup is no longer cut at its first child.
        return cells[-1].get_text(separator=" ")

    return {
        'title': _last_cell_text('dc_title'),
        'abstract': _last_cell_text('dc_description_abstract'),
    }

### Scraping

In [43]:
# Search terms passed to read_page_get_links; change this value to scrape another topic.
query = "internet das coisas"

In [50]:
# Mount Google Drive at /content/drive (the google.colab module is only available inside Colab).
# NOTE(review): none of the visible cells reads from or writes to the mounted drive -- confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [44]:
# Run the search for `query` and keep the links returned for the first results page.
urls = read_page_get_links(query)

In [45]:
# Display the scraped links.
urls

['/handle/123456789/29048',
 '/handle/123456789/25843',
 '/handle/123456789/24694',
 '/handle/123456789/34459',
 '/handle/123456789/32121',
 '/handle/123456789/24888',
 '/handle/123456789/28006',
 '/handle/123456789/25680',
 '/handle/123456789/3997',
 '/handle/123456789/22418']

In [46]:
# Download every item page and collect its metadata table.
info_table = []

for url in urls:
    # The scraped hrefs start with "/"; strip it so the final URL does not
    # contain a double slash ("https://repositorio.ufpe.br//handle/...").
    item_url = "https://repositorio.ufpe.br/" + url.lstrip("/")
    response = requests.get(item_url, allow_redirects=True, timeout=30)
    # Stop on an HTTP error instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text.strip(), 'html.parser')
    tags_table = soup.find_all(name='table', attrs={'class': 'itemDisplayTable'})
    if not tags_table:
        # A page without the metadata table used to raise IndexError and
        # abort the whole run; report it and move on to the next item.
        print(f"No metadata table found at {item_url}; skipping")
        continue
    info_table.append(tags_table[0])

In [47]:
# Number of metadata tables collected by the loop above.
len(info_table)

10

In [48]:
# Preview only the first scraped table: printing the whole list floods the
# notebook output with the HTML of every item. The slice is safe on an empty list.
print(info_table[:1])

[<table class="table itemDisplayTable">
<tr><td class="metadataFieldLabel dc_title">Title: </td><td class="metadataFieldValue dc_title">Dependência de Internet: um estudo das propriedades psicométricas da versão adaptada ao português brasileiro do Internet Addiction Test (IAT)</td></tr>
<tr><td class="metadataFieldLabel dc_contributor_author">Authors: </td><td class="metadataFieldValue dc_contributor_author"><a class="author" href="/browse?type=author&amp;value=ALMO%C3%8ADO+DE+ASSIS%2C+Ra%C3%ADssa">ALMOÊDO DE ASSIS, Raíssa</a></td></tr>
<tr><td class="metadataFieldLabel dc_subject">Keywords: </td><td class="metadataFieldValue dc_subject">Dependência de internet; IAT; Psicometria; Avaliação psicológica</td></tr>
<tr><td class="metadataFieldLabel dc_date_issued">Issue Date: </td><td class="metadataFieldValue dc_date_issued">25-Feb-2014</td></tr>
<tr><td class="metadataFieldLabel dc_publisher">Publisher: </td><td class="metadataFieldValue dc_publisher">Universidade Federal de Pernambuco</

### Saving files

In [49]:
# Save one JSON file per thesis under ./output/<query with underscores>/.
path = Path("./output")

# Build the folder name in a new variable instead of rebinding `query`:
# the search cell can then be re-run afterwards with the original text.
query_dir = path / query.replace(" ", "_")

# Creates ./output and the query folder when missing; no error if they already exist.
query_dir.mkdir(parents=True, exist_ok=True)

# Files are numbered from 1: thesis1.json, thesis2.json, ...
for counter, table in enumerate(info_table, start=1):
    output = read_table_get_title_and_abstract(table)

    # Write inside the folder created above rather than a separately hardcoded path.
    with open(query_dir / f"thesis{counter}.json", 'w', encoding="utf-8") as json_file:
        json.dump(output, json_file)  # Store the extracted texts as JSON

In [52]:
# Shell command: recursively zip /content/output into /content/output.zip.
# NOTE(review): the absolute /content paths assume the notebook's working directory is /content (Colab) -- confirm.
!zip -r /content/output.zip /content/output

  adding: content/output/ (stored 0%)
  adding: content/output/doenças_hereditárias/ (stored 0%)
  adding: content/output/doenças_hereditárias/thesis9.json (deflated 57%)
  adding: content/output/doenças_hereditárias/thesis1.json (deflated 62%)
  adding: content/output/doenças_hereditárias/thesis10.json (deflated 58%)
  adding: content/output/doenças_hereditárias/thesis2.json (deflated 52%)
  adding: content/output/doenças_hereditárias/thesis3.json (deflated 57%)
  adding: content/output/doenças_hereditárias/thesis4.json (deflated 61%)
  adding: content/output/doenças_hereditárias/thesis7.json (deflated 60%)
  adding: content/output/doenças_hereditárias/thesis5.json (deflated 52%)
  adding: content/output/doenças_hereditárias/thesis6.json (deflated 56%)
  adding: content/output/doenças_hereditárias/thesis8.json (deflated 59%)
  adding: content/output/reconhecimento_facial/ (stored 0%)
  adding: content/output/reconhecimento_facial/thesis9.json (deflated 57%)
  adding: content/output/re