
Fecam #135

Merged
merged 11 commits on Nov 29, 2019
7 changes: 6 additions & 1 deletion processing/Dockerfile
@@ -4,9 +4,14 @@ WORKDIR /mnt/code

RUN adduser --system -u ${LOCAL_USER_ID:-1000} gazette \
&& apt-get update \
&& apt-get -y install poppler-utils postgresql-client wait-for-it
&& apt-get -y install poppler-utils postgresql-client wait-for-it default-jre

COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# install Apache Tika
ADD http://ftp.unicamp.br/pub/apache/tika/tika-app-1.22.jar /

RUN chmod 755 /tika-app-1.22.jar

USER gazette
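
The jar added above is what the new ExtractTextPipeline (pipelines.py below) shells out to for doc/docx files, which is why default-jre is now installed. A minimal sketch of that invocation, assuming a downloaded file at the hypothetical path /mnt/data/example.doc:

import subprocess

doc_path = "/mnt/data/example.doc"  # hypothetical input file
text_path = doc_path + ".txt"
# Tika writes the extracted text to stdout, which is redirected into a .txt
# file next to the original document, mirroring the pipeline code below.
with open(text_path, "w") as output:
    subprocess.run(
        ["java", "-jar", "/tika-app-1.22.jar", "--text", doc_path],
        check=True,
        stdout=output,
    )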
91 changes: 70 additions & 21 deletions processing/data_collection/gazette/pipelines.py
@@ -1,5 +1,6 @@
import os
import subprocess
import hashlib

from database.models import Gazette, initialize_database
from scrapy.exceptions import DropItem
@@ -9,27 +10,6 @@
from gazette.settings import FILES_STORE


class PdfParsingPipeline:
def process_item(self, item, spider):
item["source_text"] = self.pdf_source_text(item)
for key, value in item["files"][0].items():
item[f"file_{key}"] = value
item.pop("files")
item.pop("file_urls")
return item

def pdf_source_text(self, item):
pdf_path = os.path.join(FILES_STORE, item["files"][0]["path"])
command = f"pdftotext -layout {pdf_path}"
subprocess.run(command, shell=True, check=True)
if ".pdf" in pdf_path:
text_path = pdf_path.replace(".pdf", ".txt")
else:
text_path = pdf_path + ".txt"
with open(text_path) as file:
return file.read()


class PostgreSQLPipeline:
def __init__(self):
engine = initialize_database()
@@ -61,3 +41,72 @@ def process_item(self, item, spider):
if spider.start_date > item.get("date"):
raise DropItem("Droping all items before {}".format(spider.start_date))
return item


class ExtractTextPipeline:
"""
Identify the file format and call the right tool to extract the text from it
"""

def process_item(self, item, spider):
if self.is_doc(item["files"][0]["path"]):
item["source_text"] = self.doc_source_text(item)
elif self.is_pdf(item["files"][0]["path"]):
item["source_text"] = self.pdf_source_text(item)
else:
raise Exception(
"Unsupported file type: " + self.get_extension(item["files"][0]["path"])
)

for key, value in item["files"][0].items():
item[f"file_{key}"] = value
item.pop("files")
item.pop("file_urls")
return item

def pdf_source_text(self, item):
"""
Gets the text from pdf files
"""
pdf_path = os.path.join(FILES_STORE, item["files"][0]["path"])
text_path = pdf_path + ".txt"
command = f"pdftotext -layout {pdf_path} {text_path}"
subprocess.run(command, shell=True, check=True)
with open(text_path) as file:
return file.read()

def doc_source_text(self, item):
"""
Gets the text from doc and docx files using Apache Tika
"""
doc_path = os.path.join(FILES_STORE, item["files"][0]["path"])
text_path = doc_path + ".txt"
command = f"java -jar /tika-app-1.22.jar --text {doc_path}"
with open(text_path, "w") as f:
subprocess.run(command, shell=True, check=True, stdout=f)
with open(text_path, "r") as f:
return f.read()

@staticmethod
def is_pdf(filepath):
"""
If the file path ends with pdf returns True. Otherwise,
returns False
"""
return filepath.lower().endswith("pdf")

@staticmethod
def is_doc(filepath):
"""
If the file path ends with doc or docx returns True. Otherwise,
returns False
"""
filepath = filepath.lower()
return filepath.endswith("doc") or filepath.endswith("docx")

@staticmethod
def get_extension(filename):
"""
Returns the file's extension
"""
return filename[filename.rfind(".") :]
2 changes: 1 addition & 1 deletion processing/data_collection/gazette/settings.py
@@ -6,7 +6,7 @@
"gazette.pipelines.GazetteDateFilteringPipeline": 50,
"gazette.parser.GazetteFilesPipeline": 60,
"scrapy.pipelines.files.FilesPipeline": 100,
"gazette.pipelines.PdfParsingPipeline": 200,
"gazette.pipelines.ExtractTextPipeline": 200,
"gazette.pipelines.PostgreSQLPipeline": 300,
}
FILES_STORE = "/mnt/data/"
77 changes: 77 additions & 0 deletions processing/data_collection/gazette/spiders/base.py
@@ -1,7 +1,12 @@
# -*- coding: utf-8 -*-
import re
from datetime import datetime

import dateparser
import scrapy

from gazette.items import Gazette


class BaseGazetteSpider(scrapy.Spider):
def __init__(self, start_date=None, *args, **kwargs):
@@ -11,3 +16,75 @@ def __init__(self, start_date=None, *args, **kwargs):
parsed_data = dateparser.parse(start_date)
if parsed_data is not None:
self.start_date = parsed_data.date()


class FecamGazetteSpider(scrapy.Spider):

URL = "https://www.diariomunicipal.sc.gov.br/site/"
total_pages = None

def start_requests(self):
if self.total_pages is None:
yield scrapy.Request(
f"{self.URL}?q={self.FECAM_QUERY}", callback=self.parse
)

def parse(self, response):
if self.total_pages is None:
self.total_pages = self.get_last_page(response)
# Get gazette info
documents = self.get_documents_links_date(response)
for d in documents:
yield self.get_gazette(d)
if self.total_pages > 1:
yield scrapy.Request(
f"{self.URL}?q={self.FECAM_QUERY}&Search_page={self.total_pages}",
callback=self.parse,
)
self.total_pages = self.total_pages - 1

def get_documents_links_date(self, response):
"""
Get the links and dates of all relevant documents listed on the page
"""
documents = []
elements = response.xpath('/html/body/div[1]/div[3]/div[5]/p[@class="quiet"]')
for e in elements:
if "Visualizar" in e.xpath("a[1]/text()").get():
# This element does not contain the link to the file,
# so the URL is taken from the preceding title element
link = e.xpath("preceding-sibling::h4[1]/a/@href").get().strip()
else:
link = e.xpath("a[1]/@href").get().strip()
date = e.re_first("\d{2}/\d{2}/\d{4}").strip()
documents.append((link, date))
return documents

@staticmethod
def get_last_page(response):
"""
Get the last page number available in the pages navigation menu
"""
href = response.xpath(
"/html/body/div[1]/div[3]/div[4]/div/div/ul/li[14]/a/@href"
).get()
result = re.search("Search_page=(\d+)", href)
if result is not None:
return int(result.groups()[0])

def get_gazette(self, document):
"""
Transform the (link, date) tuple returned by get_documents_links_date
into a Gazette item
"""
if document[1] is None or len(document[1]) == 0:
raise ValueError("Missing document date")
if document[0] is None or len(document[0]) == 0:
raise ValueError("Missing document URL")

return Gazette(
date=dateparser.parse(document[1], languages=("pt",)).date(),
file_urls=(document[0],),
territory_id=self.TERRITORY_ID,
scraped_at=datetime.utcnow(),
)
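
A minimal sketch (not in the PR) of the contract between get_documents_links_date and get_gazette; the subclass, query, territory id and link below are hypothetical stand-ins for a real spider such as the one added in sc_gaspar.py:

from gazette.spiders.base import FecamGazetteSpider

class ExampleFecamSpider(FecamGazetteSpider):
    name = "sc_example"                                          # hypothetical
    FECAM_QUERY = 'entidade:"Prefeitura municipal de Exemplo"'   # hypothetical
    TERRITORY_ID = "0000000"                                     # hypothetical

spider = ExampleFecamSpider()
# get_documents_links_date returns (link, date) tuples scraped from the page.
document = ("https://www.diariomunicipal.sc.gov.br/site/arquivo.pdf", "29/11/2019")
gazette = spider.get_gazette(document)
# gazette["date"] is a datetime.date, gazette["file_urls"] keeps the link,
# and territory_id / scraped_at come from the spider.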
7 changes: 7 additions & 0 deletions processing/data_collection/gazette/spiders/sc_gaspar.py
@@ -0,0 +1,7 @@
from gazette.spiders.base import FecamGazetteSpider


class ScGasparSpider(FecamGazetteSpider):
name = "sc_gaspar"
FECAM_QUERY = 'entidade:"Prefeitura municipal de Gaspar"'
TERRITORY_ID = "4205902"