
Fecam #135

Merged
merged 11 commits on Nov 29, 2019
3 changes: 2 additions & 1 deletion processing/Dockerfile
@@ -4,7 +4,8 @@ WORKDIR /mnt/code

RUN adduser --system -u ${LOCAL_USER_ID:-1000} gazette \
&& apt-get update \
&& apt-get -y install poppler-utils postgresql-client wait-for-it
&& apt-get -y install poppler-utils postgresql-client wait-for-it libreoffice
# libreoffice is required to convert doc-like files (doc/docx) to pdf

COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
49 changes: 49 additions & 0 deletions processing/data_collection/gazette/pipelines.py
@@ -1,5 +1,6 @@
import os
import subprocess
import hashlib

from database.models import Gazette, initialize_database
from scrapy.exceptions import DropItem
@@ -61,3 +62,51 @@ def process_item(self, item, spider):
        if spider.start_date > item.get("date"):
            raise DropItem("Dropping all items before {}".format(spider.start_date))
        return item


class DocToPdfPipeline:
    """
    Convert a doc or docx file to pdf.
    """

    def process_item(self, item, spider):
        # if the item is not a doc, skip it
        if not self.is_doc(item["files"][0]["path"]):
            return item
        # it is a doc[x]; convert it to pdf
        doc_path = os.path.join(FILES_STORE, item["files"][0]["path"])
        # use LibreOffice Writer to convert
        command = f"lowriter --convert-to pdf --outdir {FILES_STORE}/full {doc_path}"
        subprocess.run(command, shell=True, check=True)
        if doc_path.endswith("doc"):
            pdf_path = doc_path[:-3] + "pdf"
        elif doc_path.endswith("docx"):
            pdf_path = doc_path[:-4] + "pdf"
        else:
            pdf_path = doc_path + ".pdf"
        os.unlink(doc_path)
        # update the item with the new file path and its checksum
        item["files"][0]["path"] = pdf_path
        item["files"][0]["checksum"] = self.calculate_md5sum(pdf_path)
        return item

    @staticmethod
    def is_doc(filepath):
        """
        Return True if the file path ends with doc or docx; otherwise
        return False.
        """
        return filepath.endswith("doc") or filepath.endswith("docx")

    @staticmethod
    def calculate_md5sum(filepath):
        """
        Calculate the md5sum of the given file.

        Returns the md5sum as a hex string.
        """
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
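
A minimal usage sketch of the new pipeline (the item contents, the spider value, and the example file name are hypothetical; FILES_STORE is the project's files directory setting):

    item = {"files": [{"path": "full/example.doc", "checksum": "..."}]}
    pipeline = DocToPdfPipeline()
    item = pipeline.process_item(item, spider=None)  # spider is not used by this pipeline
    # lowriter writes full/example.pdf, the original .doc is deleted, and the
    # item now carries the pdf path plus the MD5 checksum of the converted file

Note that after conversion the stored path is the full path built from FILES_STORE, while FilesPipeline originally recorded a store-relative path.
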
1 change: 1 addition & 0 deletions processing/data_collection/gazette/settings.py
@@ -6,6 +6,7 @@
    "gazette.pipelines.GazetteDateFilteringPipeline": 50,
    "gazette.parser.GazetteFilesPipeline": 60,
    "scrapy.pipelines.files.FilesPipeline": 100,
    "gazette.pipelines.DocToPdfPipeline": 150,
    "gazette.pipelines.PdfParsingPipeline": 200,
    "gazette.pipelines.PostgreSQLPipeline": 300,
}
76 changes: 76 additions & 0 deletions processing/data_collection/gazette/spiders/base.py
@@ -1,6 +1,10 @@
# -*- coding: utf-8 -*-
import dateparser
import scrapy
import re
from datetime import datetime

from gazette.items import Gazette


class BaseGazetteSpider(scrapy.Spider):
@@ -11,3 +15,75 @@ def __init__(self, start_date=None, *args, **kwargs):
        parsed_data = dateparser.parse(start_date)
        if parsed_data is not None:
            self.start_date = parsed_data.date()


class FecamGazetteSpider(scrapy.Spider):

    URL = "https://www.diariomunicipal.sc.gov.br/site/"
    total_pages = None

    def start_requests(self):
        if self.total_pages is None:
            yield scrapy.Request(
                f"{self.URL}?q={self.FECAM_QUERY}", callback=self.parse
            )

    def parse(self, response):
        if self.total_pages is None:
            self.total_pages = self.get_last_page(response)
        # get the gazette info
        documents = self.get_documents_links_date(response)
        for d in documents:
            yield self.get_gazzete(d)
        if self.total_pages > 1:
            yield scrapy.Request(
                f"{self.URL}?q={self.FECAM_QUERY}&Search_page={self.total_pages}",
                callback=self.parse,
            )
            self.total_pages = self.total_pages - 1

    def get_documents_links_date(self, response):
        """
        Get the list of relevant document links and their dates from the page.
        """
        documents = []
        elements = response.xpath('/html/body/div[1]/div[3]/div[5]/p[@class="quiet"]')
        for e in elements:
            if "Visualizar" in e.xpath("a[1]/text()").get():
                # This element does not contain the file URL; it is in the
                # preceding title instead.
                link = e.xpath("preceding-sibling::h4[1]/a/@href").get().strip()
            else:
                link = e.xpath("a[1]/@href").get().strip()
            date = e.re_first(r"\d{2}/\d{2}/\d{4}").strip()
            documents.append((link, date))
        return documents

    @staticmethod
    def get_last_page(response):
        """
        Get the last page number available in the pages navigation menu.
        """
        href = response.xpath(
            "/html/body/div[1]/div[3]/div[4]/div/div/ul/li[14]/a/@href"
        ).get()
        result = re.search(r"Search_page=(\d+)", href)
        if result is not None:
            return int(result.groups()[0])

    def get_gazzete(self, document):
        """
        Transform a (link, date) tuple returned by get_documents_links_date
        into a Gazette item.
        """
        if document[1] is None or len(document[1]) == 0:
            raise ValueError("Missing document date")
        if document[0] is None or len(document[0]) == 0:
            raise ValueError("Missing document URL")

        return Gazette(
            date=dateparser.parse(document[1], languages=("pt",)).date(),
            file_urls=(document[0],),
            territory_id=self.TERRITORY_ID,
            scraped_at=datetime.utcnow(),
        )
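
For reference, a sketch of the request sequence FecamGazetteSpider produces when a hypothetical query has three result pages (the last page number is read from the navigation menu on the first response and then counted down):

    https://www.diariomunicipal.sc.gov.br/site/?q=<FECAM_QUERY>                  first request; parse sets total_pages = 3
    https://www.diariomunicipal.sc.gov.br/site/?q=<FECAM_QUERY>&Search_page=3
    https://www.diariomunicipal.sc.gov.br/site/?q=<FECAM_QUERY>&Search_page=2

Every response goes through the same parse callback, which yields one Gazette item per (link, date) pair found on the page.
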
14 changes: 14 additions & 0 deletions processing/data_collection/gazette/spiders/sc_gaspar.py
@@ -0,0 +1,14 @@
import re
from datetime import date, datetime

from dateparser import parse
from dateutil.relativedelta import relativedelta

from gazette.items import Gazette
from gazette.spiders.base import FecamGazetteSpider


class ScGasparSpider(FecamGazetteSpider):
    name = "sc_gaspar"
    FECAM_QUERY = 'entidade:"Prefeitura municipal de Gaspar"'
    TERRITORY_ID = "4205902"
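
With only FECAM_QUERY and TERRITORY_ID defined, the spider inherits all crawling and parsing behaviour from FecamGazetteSpider. As a usage note (assuming the usual Scrapy project layout under processing/data_collection), it can be run with the standard command:

    scrapy crawl sc_gaspar
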