
Fecam #135

Merged
merged 11 commits on Nov 29, 2019
7 changes: 6 additions & 1 deletion processing/Dockerfile
@@ -4,9 +4,14 @@ WORKDIR /mnt/code

RUN adduser --system -u ${LOCAL_USER_ID:-1000} gazette \
&& apt-get update \
&& apt-get -y install poppler-utils postgresql-client wait-for-it
&& apt-get -y install poppler-utils postgresql-client wait-for-it default-jre

COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# install Apache Tika
ADD http://ftp.unicamp.br/pub/apache/tika/tika-app-1.22.jar /

RUN chmod 755 /tika-app-1.22.jar

USER gazette
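
The jar added above is what the new ExtractTextPipeline (pipelines.py below) shells out to for doc/docx files, which is why default-jre is now installed. A minimal sketch of that invocation, assuming a downloaded file at the hypothetical path /mnt/data/example.doc:

import subprocess

doc_path = "/mnt/data/example.doc"  # hypothetical input file
text_path = doc_path + ".txt"
# Tika writes the extracted text to stdout, which is redirected into a .txt
# file next to the original document, mirroring the pipeline code below.
with open(text_path, "w") as output:
    subprocess.run(
        ["java", "-jar", "/tika-app-1.22.jar", "--text", doc_path],
        check=True,
        stdout=output,
    )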
91 changes: 70 additions & 21 deletions processing/data_collection/gazette/pipelines.py
@@ -1,5 +1,6 @@
import os
import subprocess
import hashlib

from database.models import Gazette, initialize_database
from scrapy.exceptions import DropItem
@@ -9,27 +10,6 @@
from gazette.settings import FILES_STORE


class PdfParsingPipeline:
def process_item(self, item, spider):
item["source_text"] = self.pdf_source_text(item)
for key, value in item["files"][0].items():
item[f"file_{key}"] = value
item.pop("files")
item.pop("file_urls")
return item

def pdf_source_text(self, item):
pdf_path = os.path.join(FILES_STORE, item["files"][0]["path"])
command = f"pdftotext -layout {pdf_path}"
subprocess.run(command, shell=True, check=True)
if ".pdf" in pdf_path:
text_path = pdf_path.replace(".pdf", ".txt")
else:
text_path = pdf_path + ".txt"
with open(text_path) as file:
return file.read()


class PostgreSQLPipeline:
def __init__(self):
engine = initialize_database()
@@ -61,3 +41,72 @@ def process_item(self, item, spider):
if spider.start_date > item.get("date"):
raise DropItem("Droping all items before {}".format(spider.start_date))
return item


class ExtractTextPipeline:
"""
Identify the file format and call the right tool to extract the text from it
"""

def process_item(self, item, spider):
if self.is_doc(item["files"][0]["path"]):
item["source_text"] = self.doc_source_text(item)
elif self.is_pdf(item["files"][0]["path"]):
item["source_text"] = self.pdf_source_text(item)
else:
raise Exception(
"Unsupported file type: " + self.get_extension(item["files"][0]["path"])
)

for key, value in item["files"][0].items():
item[f"file_{key}"] = value
item.pop("files")
item.pop("file_urls")
return item

def pdf_source_text(self, item):
"""
Gets the text from pdf files
"""
pdf_path = os.path.join(FILES_STORE, item["files"][0]["path"])
text_path = pdf_path + ".txt"
command = f"pdftotext -layout {pdf_path} {text_path}"
subprocess.run(command, shell=True, check=True)
with open(text_path) as file:
return file.read()

def doc_source_text(self, item):
"""
Gets the text from doc and docx files using Apache Tika
"""
doc_path = os.path.join(FILES_STORE, item["files"][0]["path"])
text_path = doc_path + ".txt"
command = f"java -jar /tika-app-1.22.jar --text {doc_path}"
with open(text_path, "w") as f:
subprocess.run(command, shell=True, check=True, stdout=f)
with open(text_path, "r") as f:
return f.read()

@staticmethod
def is_pdf(filepath):
"""
If the file path ends with pdf returns True. Otherwise,
returns False
"""
return filepath.lower().endswith("pdf")

@staticmethod
def is_doc(filepath):
"""
If the file path ends with doc or docx returns True. Otherwise,
returns False
"""
filepath = filepath.lower()
return filepath.endswith("doc") or filepath.endswith("docx")

@staticmethod
def get_extension(filename):
"""
Returns the file's extension
"""
return filename[filename.rfind(".") :]
2 changes: 1 addition & 1 deletion processing/data_collection/gazette/settings.py
@@ -6,7 +6,7 @@
"gazette.pipelines.GazetteDateFilteringPipeline": 50,
"gazette.parser.GazetteFilesPipeline": 60,
"scrapy.pipelines.files.FilesPipeline": 100,
"gazette.pipelines.PdfParsingPipeline": 200,
"gazette.pipelines.ExtractTextPipeline": 200,
"gazette.pipelines.PostgreSQLPipeline": 300,
}
FILES_STORE = "/mnt/data/"
77 changes: 77 additions & 0 deletions processing/data_collection/gazette/spiders/base.py
@@ -1,7 +1,12 @@
# -*- coding: utf-8 -*-
import re
from datetime import datetime

import dateparser
import scrapy

from gazette.items import Gazette


class BaseGazetteSpider(scrapy.Spider):
def __init__(self, start_date=None, *args, **kwargs):
@@ -11,3 +16,75 @@ def __init__(self, start_date=None, *args, **kwargs):
parsed_data = dateparser.parse(start_date)
if parsed_data is not None:
self.start_date = parsed_data.date()


class FecamGazetteSpider(scrapy.Spider):

URL = "https://www.diariomunicipal.sc.gov.br/site/"
total_pages = None

def start_requests(self):
if self.total_pages is None:
yield scrapy.Request(
f"{self.URL}?q={self.FECAM_QUERY}", callback=self.parse
)

def parse(self, response):
if self.total_pages is None:
self.total_pages = self.get_last_page(response)
# Get gazette info
documents = self.get_documents_links_date(response)
for d in documents:
yield self.get_gazette(d)
if self.total_pages > 1:
yield scrapy.Request(
f"{self.URL}?q={self.FECAM_QUERY}&Search_page={self.total_pages}",
callback=self.parse,
)
self.total_pages = self.total_pages - 1

def get_documents_links_date(self, response):
"""
Get the links and dates of all relevant documents listed on the page
"""
documents = []
elements = response.xpath('/html/body/div[1]/div[3]/div[5]/p[@class="quiet"]')
for e in elements:
if "Visualizar" in e.xpath("a[1]/text()").get():
# This element does not contain the link to the file,
# so the URL is taken from the preceding title element
link = e.xpath("preceding-sibling::h4[1]/a/@href").get().strip()
else:
link = e.xpath("a[1]/@href").get().strip()
date = e.re_first("\d{2}/\d{2}/\d{4}").strip()
documents.append((link, date))
return documents

@staticmethod
def get_last_page(response):
"""
Get the last page number available in the pages navigation menu
"""
href = response.xpath(
"/html/body/div[1]/div[3]/div[4]/div/div/ul/li[14]/a/@href"
).get()
result = re.search("Search_page=(\d+)", href)
if result is not None:
return int(result.groups()[0])

def get_gazette(self, document):
"""
Transform the (link, date) tuple returned by get_documents_links_date
into a Gazette item
"""
if document[1] is None or len(document[1]) == 0:
raise ValueError("Missing document date")
if document[0] is None or len(document[0]) == 0:
raise ValueError("Missing document URL")

return Gazette(
date=dateparser.parse(document[1], languages=("pt",)).date(),
file_urls=(document[0],),
territory_id=self.TERRITORY_ID,
scraped_at=datetime.utcnow(),
)
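
A minimal sketch (not in the PR) of the contract between get_documents_links_date and get_gazette; the subclass, query, territory id and link below are hypothetical stand-ins for a real spider such as the one added in sc_gaspar.py:

from gazette.spiders.base import FecamGazetteSpider

class ExampleFecamSpider(FecamGazetteSpider):
    name = "sc_example"                                          # hypothetical
    FECAM_QUERY = 'entidade:"Prefeitura municipal de Exemplo"'   # hypothetical
    TERRITORY_ID = "0000000"                                     # hypothetical

spider = ExampleFecamSpider()
# get_documents_links_date returns (link, date) tuples scraped from the page.
document = ("https://www.diariomunicipal.sc.gov.br/site/arquivo.pdf", "29/11/2019")
gazette = spider.get_gazette(document)
# gazette["date"] is a datetime.date, gazette["file_urls"] keeps the link,
# and territory_id / scraped_at come from the spider.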
7 changes: 7 additions & 0 deletions processing/data_collection/gazette/spiders/sc_gaspar.py
@@ -0,0 +1,7 @@
from gazette.spiders.base import FecamGazetteSpider


class ScGasparSpider(FecamGazetteSpider):
name = "sc_gaspar"
FECAM_QUERY = 'entidade:"Prefeitura municipal de Gaspar"'
TERRITORY_ID = "4205902"