
Fecam #135

Merged
merged 11 commits on Nov 29, 2019
3 changes: 2 additions & 1 deletion processing/Dockerfile
@@ -4,7 +4,8 @@ WORKDIR /mnt/code

RUN adduser --system -u ${LOCAL_USER_ID:-1000} gazette \
&& apt-get update \
&& apt-get -y install poppler-utils postgresql-client wait-for-it
&& apt-get -y install poppler-utils postgresql-client wait-for-it libreoffice
# libreoffice is required to convert doc-like files (doc/docx) to pdf

COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
49 changes: 49 additions & 0 deletions processing/data_collection/gazette/pipelines.py
@@ -1,5 +1,6 @@
import os
import subprocess
import hashlib

from database.models import Gazette, initialize_database
from scrapy.exceptions import DropItem
@@ -61,3 +62,51 @@ def process_item(self, item, spider):
        if spider.start_date > item.get("date"):
            raise DropItem("Dropping all items before {}".format(spider.start_date))
        return item


class DocToPdfPipeline:
    """
    Convert a doc or docx file to pdf.
    """

    def process_item(self, item, spider):
        # if the item is not a doc, skip it
        if not self.is_doc(item["files"][0]["path"]):
            return item
        # it is a doc[x]; convert it to pdf
        doc_path = os.path.join(FILES_STORE, item["files"][0]["path"])
        # use LibreOffice Writer to convert
        command = f"lowriter --convert-to pdf --outdir {FILES_STORE}/full {doc_path}"
        subprocess.run(command, shell=True, check=True)
        if doc_path.endswith("doc"):
            pdf_path = doc_path[:-3] + "pdf"
        elif doc_path.endswith("docx"):
            pdf_path = doc_path[:-4] + "pdf"
        else:
            pdf_path = doc_path + ".pdf"
        os.unlink(doc_path)
        # update the item with the new file path and its checksum
        item["files"][0]["path"] = pdf_path
        item["files"][0]["checksum"] = self.calculate_md5sum(pdf_path)
        return item

    @staticmethod
    def is_doc(filepath):
        """
        Return True if the file path ends with doc or docx; otherwise
        return False.
        """
        return filepath.endswith("doc") or filepath.endswith("docx")

    @staticmethod
    def calculate_md5sum(filepath):
        """
        Calculate the md5sum of the given file.

        Returns the md5sum as a hex string.
        """
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
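
A minimal usage sketch of the new pipeline (the item contents, the spider value, and the example file name are hypothetical; FILES_STORE is the project's files directory setting):

    item = {"files": [{"path": "full/example.doc", "checksum": "..."}]}
    pipeline = DocToPdfPipeline()
    item = pipeline.process_item(item, spider=None)  # spider is not used by this pipeline
    # lowriter writes full/example.pdf, the original .doc is deleted, and the
    # item now carries the pdf path plus the MD5 checksum of the converted file

Note that after conversion the stored path is the full path built from FILES_STORE, while FilesPipeline originally recorded a store-relative path.
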
1 change: 1 addition & 0 deletions processing/data_collection/gazette/settings.py
@@ -6,6 +6,7 @@
    "gazette.pipelines.GazetteDateFilteringPipeline": 50,
    "gazette.parser.GazetteFilesPipeline": 60,
    "scrapy.pipelines.files.FilesPipeline": 100,
    "gazette.pipelines.DocToPdfPipeline": 150,
    "gazette.pipelines.PdfParsingPipeline": 200,
    "gazette.pipelines.PostgreSQLPipeline": 300,
}
76 changes: 76 additions & 0 deletions processing/data_collection/gazette/spiders/base.py
@@ -1,6 +1,10 @@
# -*- coding: utf-8 -*-
import dateparser
import scrapy
import re
from datetime import datetime

from gazette.items import Gazette


class BaseGazetteSpider(scrapy.Spider):
@@ -11,3 +15,75 @@ def __init__(self, start_date=None, *args, **kwargs):
        parsed_data = dateparser.parse(start_date)
        if parsed_data is not None:
            self.start_date = parsed_data.date()


class FecamGazetteSpider(scrapy.Spider):

    URL = "https://www.diariomunicipal.sc.gov.br/site/"
    total_pages = None

    def start_requests(self):
        if self.total_pages is None:
            yield scrapy.Request(
                f"{self.URL}?q={self.FECAM_QUERY}", callback=self.parse
            )

    def parse(self, response):
        if self.total_pages is None:
            self.total_pages = self.get_last_page(response)
        # get the gazette info
        documents = self.get_documents_links_date(response)
        for d in documents:
            yield self.get_gazzete(d)
        if self.total_pages > 1:
            yield scrapy.Request(
                f"{self.URL}?q={self.FECAM_QUERY}&Search_page={self.total_pages}",
                callback=self.parse,
            )
            self.total_pages = self.total_pages - 1

    def get_documents_links_date(self, response):
        """
        Get the list of relevant document links and their dates from the page.
        """
        documents = []
        elements = response.xpath('/html/body/div[1]/div[3]/div[5]/p[@class="quiet"]')
        for e in elements:
            if "Visualizar" in e.xpath("a[1]/text()").get():
                # This element does not contain the file URL; it is in the
                # preceding title instead.
                link = e.xpath("preceding-sibling::h4[1]/a/@href").get().strip()
            else:
                link = e.xpath("a[1]/@href").get().strip()
            date = e.re_first(r"\d{2}/\d{2}/\d{4}").strip()
            documents.append((link, date))
        return documents

    @staticmethod
    def get_last_page(response):
        """
        Get the last page number available in the pages navigation menu.
        """
        href = response.xpath(
            "/html/body/div[1]/div[3]/div[4]/div/div/ul/li[14]/a/@href"
        ).get()
        result = re.search(r"Search_page=(\d+)", href)
        if result is not None:
            return int(result.groups()[0])

    def get_gazzete(self, document):
        """
        Transform a (link, date) tuple returned by get_documents_links_date
        into a Gazette item.
        """
        if document[1] is None or len(document[1]) == 0:
            raise ValueError("Missing document date")
        if document[0] is None or len(document[0]) == 0:
            raise ValueError("Missing document URL")

        return Gazette(
            date=dateparser.parse(document[1], languages=("pt",)).date(),
            file_urls=(document[0],),
            territory_id=self.TERRITORY_ID,
            scraped_at=datetime.utcnow(),
        )
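
For reference, a sketch of the request sequence FecamGazetteSpider produces when a hypothetical query has three result pages (the last page number is read from the navigation menu on the first response and then counted down):

    https://www.diariomunicipal.sc.gov.br/site/?q=<FECAM_QUERY>                  first request; parse sets total_pages = 3
    https://www.diariomunicipal.sc.gov.br/site/?q=<FECAM_QUERY>&Search_page=3
    https://www.diariomunicipal.sc.gov.br/site/?q=<FECAM_QUERY>&Search_page=2

Every response goes through the same parse callback, which yields one Gazette item per (link, date) pair found on the page.
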
14 changes: 14 additions & 0 deletions processing/data_collection/gazette/spiders/sc_gaspar.py
@@ -0,0 +1,14 @@
import re
from datetime import date, datetime

from dateparser import parse
from dateutil.relativedelta import relativedelta

from gazette.items import Gazette
from gazette.spiders.base import FecamGazetteSpider


class ScGasparSpider(FecamGazetteSpider):
    name = "sc_gaspar"
    FECAM_QUERY = 'entidade:"Prefeitura municipal de Gaspar"'
    TERRITORY_ID = "4205902"
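
With only FECAM_QUERY and TERRITORY_ID defined, the spider inherits all crawling and parsing behaviour from FecamGazetteSpider. As a usage note (assuming the usual Scrapy project layout under processing/data_collection), it can be run with the standard command:

    scrapy crawl sc_gaspar
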