Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cria worker para converter arquivos PDF para TXT #119

Merged
merged 10 commits into from Jul 18, 2023
3 changes: 3 additions & 0 deletions apps/cotacoes/lib/cotacoes/handlers/cotacao_handler.ex
Expand Up @@ -6,6 +6,9 @@ defmodule Cotacoes.Handlers.CotacaoHandler do
@impl true
defdelegate list_cotacao, to: Repository

@impl true
defdelegate fetch_cotacao_by_link(link), to: Repository

@impl true
def find_cotacoes_not_ingested do
Repository.find_all_cotacao_by_not_ingested()
Expand Down
Expand Up @@ -3,6 +3,7 @@ defmodule Cotacoes.Handlers.IManageCotacaoHandler do

@callback find_cotacoes_not_ingested :: list(Cotacao.t())
@callback find_cotacoes_not_downloaded :: list(Cotacao.t())
@callback fetch_cotacao_by_link(String.t()) :: {:ok, Cotacao.t()} | {:error, :not_found}
@callback get_cotacao_file_base_name(Cotacao.t()) :: String.t()
@callback ingest_cotacoes(list(Cotacao.t())) :: :ok
@callback insert_cotacoes!(list(Cotacao.t())) :: :ok
Expand Down
1 change: 1 addition & 0 deletions apps/cotacoes/lib/cotacoes/i_manage_repository.ex
Expand Up @@ -7,6 +7,7 @@ defmodule Cotacoes.IManageRepository do

@callback find_all_cotacao_by_not_ingested :: list(Cotacao.t())
@callback find_all_cotacao_by_not_downloaded :: list(Cotacao.t())
@callback fetch_cotacao_by_link(String.t()) :: {:ok, Cotacao.t()} | {:error, :not_found}
@callback insert_all_cotacao(list(map)) :: :ok
@callback list_cotacao :: list(Cotacao.t())
@callback update_all_cotacao(list(Cotacao.t()), keyword) :: {:ok, list(Cotacao.t()) | nil}
Expand Down
5 changes: 5 additions & 0 deletions apps/cotacoes/lib/cotacoes/repository.ex
Expand Up @@ -13,6 +13,11 @@ defmodule Cotacoes.Repository do
Repo.Replica.all(query)
end

# Looks up a Cotacao by its `link` value by delegating to Database.fetch_by/2.
# NOTE(review): the IManageRepository callback declares
# `{:ok, Cotacao.t()} | {:error, :not_found}` for this function; presumably
# Database.fetch_by/2 already returns that shape — confirm.
@impl true
def fetch_cotacao_by_link(link) do
Database.fetch_by(Cotacao, link: link)
end

@impl true
def find_all_cotacao_by_not_downloaded do
query = from c in Cotacao, where: not c.baixada?, select: c
Expand Down
17 changes: 15 additions & 2 deletions apps/cotacoes_etl/lib/cotacoes_etl/application.ex
@@ -1,13 +1,22 @@
defmodule CotacoesETL.Application do
use Application

alias CotacoesETL.Workers.PDFConverter
alias CotacoesETL.Workers.Pesagro.BoletimDownloader
alias CotacoesETL.Workers.Pesagro.BoletinsFetcher
alias CotacoesETL.Workers.ZIPExtractor

@impl true
def start(_, _) do
children =
if config_env() != :test do
[BoletinsFetcher, {Finch, name: PescarteHTTPClient}]
if config_env() != :test or should_fetch_pesagro_cotacoes?() do
[
PDFConverter,
ZIPExtractor,
BoletinsFetcher,
BoletimDownloader,
{Finch, name: PescarteHTTPClient}
]
else
[{Finch, name: PescarteHTTPClient}]
end
Expand All @@ -19,4 +28,8 @@ defmodule CotacoesETL.Application do
# Returns the value stored under :config_env in the :cotacoes_etl application
# environment (nil when the key is not set).
defp config_env, do: Application.get_env(:cotacoes_etl, :config_env)

# Reads the :fetch_pesagro_cotacoes flag from the :cotacoes_etl application
# environment, defaulting to false when the key is not set.
defp should_fetch_pesagro_cotacoes?,
  do: Application.get_env(:cotacoes_etl, :fetch_pesagro_cotacoes, false)
end
17 changes: 17 additions & 0 deletions apps/cotacoes_etl/lib/cotacoes_etl/handlers.ex
@@ -0,0 +1,17 @@
defmodule CotacoesETL.Handlers do
  @moduledoc """
  Resolves which handler module the ETL uses for each concern.

  Every function reads an optional override from the `:cotacoes_etl`
  application environment and falls back to the default implementation
  aliased below when no override is configured.
  """

  alias CotacoesETL.Handlers.PDFConverterHandler
  alias CotacoesETL.Handlers.PesagroHandler
  alias CotacoesETL.Handlers.ZIPExtractorHandler

  @doc """
  Returns the module configured under `:pesagro_handler`
  (default: `CotacoesETL.Handlers.PesagroHandler`).
  """
  def pesagro_handler, do: configured(:pesagro_handler, PesagroHandler)

  @doc """
  Returns the module configured under `:pdf_converter_handler`
  (default: `CotacoesETL.Handlers.PDFConverterHandler`).
  """
  def pdf_converter_handler, do: configured(:pdf_converter_handler, PDFConverterHandler)

  @doc """
  Returns the module configured under `:zip_extractor_handler`
  (default: `CotacoesETL.Handlers.ZIPExtractorHandler`).
  """
  def zip_extractor_handler, do: configured(:zip_extractor_handler, ZIPExtractorHandler)

  # Single lookup point: value stored under `key` for :cotacoes_etl, or `default`.
  defp configured(key, default), do: Application.get_env(:cotacoes_etl, key, default)
end
@@ -0,0 +1,4 @@
defmodule CotacoesETL.Handlers.IManagePDFConverterHandler do
  @moduledoc """
  Behaviour for handlers that convert PDF files to plain text.
  """

  @doc """
  Requests the conversion of the PDF at `file_path`, with `dest_path` as the
  destination and `caller` as the requesting process. Returns `:ok`.
  """
  @callback trigger_pdf_conversion_to_txt(
              file_path :: Path.t(),
              dest_path :: Path.t(),
              caller :: pid()
            ) :: :ok

  @doc """
  Converts the PDF at `file_path` and returns its text content as a binary.
  """
  @callback convert_to_txt!(file_path :: Path.t()) :: binary()
end
@@ -0,0 +1,7 @@
defmodule CotacoesETL.Handlers.IManagePesagroHandler do
  @moduledoc """
  Behaviour for handlers that deal with Pesagro bulletins (boletins).
  """

  alias Cotacoes.Models.Cotacao
  alias CotacoesETL.Schemas.Pesagro.BoletimEntry

  @doc """
  Tells whether the given bulletin entry refers to a ZIP file.
  """
  @callback is_zip_file?(boletim :: BoletimEntry.t()) :: boolean()

  @doc """
  Downloads the bulletin of `cotacao` into `storage_path` and returns the path
  of the stored file.
  """
  @callback download_boletim_from_pesagro!(storage_path :: Path.t(), cotacao :: Cotacao.t()) ::
              Path.t()
end
@@ -0,0 +1,4 @@
defmodule CotacoesETL.Handlers.IManageZIPExtractorHandler do
  @moduledoc """
  Behaviour for handlers that extract ZIP archives.
  """

  @doc """
  Requests the extraction of the archive at `file_path` into `dest_path`, with
  `caller` as the requesting process. Returns `:ok`.
  """
  @callback trigger_extract_zip_to_path(
              file_path :: Path.t(),
              dest_path :: Path.t(),
              caller :: pid()
            ) :: :ok

  @doc """
  Extracts the archive at `zip_path` into `storage_path` and returns the list
  of paths that were written.
  """
  @callback extract_zip_to!(zip_path :: Path.t(), storage_path :: Path.t()) :: list(Path.t())
end
@@ -0,0 +1,31 @@
defmodule CotacoesETL.Handlers.PDFConverterHandler do
  @moduledoc """
  Default PDF converter handler.

  Conversion requests are cast to the `CotacoesETL.Workers.PDFConverter`
  process; the conversion itself shells out to ghostscript (`gs`).
  """

  alias CotacoesETL.Handlers.IManagePDFConverterHandler
  alias CotacoesETL.Workers.PDFConverter

  @behaviour IManagePDFConverterHandler

  # Ghostscript must be installed first (on macOS: `brew install ghostscript`).
  #   -sDEVICE=txtwrite  use the text writer device
  #   -sOutputFile=-     write to stdout instead of a file
  #   -q                 quiet: keep normal messages out of the output
  #   -dNOPAUSE          no prompt/pause at the end of each page
  #   -dBATCH            batch operation: exit when processing finishes
  @ghostscript_args ~w(-sDEVICE=txtwrite -sOutputFile=- -q -dNOPAUSE -dBATCH)

  @doc """
  Casts a `{:convert, opts}` request to the `PDFConverter` worker, where `opts`
  carries the caller pid, source path, destination path and the `:txt` format.
  Returns `:ok`.
  """
  @impl IManagePDFConverterHandler
  def trigger_pdf_conversion_to_txt(file_path, dest_path, caller) do
    request = {:convert, caller: caller, from: file_path, to: dest_path, format: :txt}
    GenServer.cast(PDFConverter, request)
  end

  @doc """
  Runs `gs` on `file_path` and returns whatever it printed to stdout.

  The match on exit status `0` is deliberate: any other status raises a
  `MatchError`.
  """
  @impl IManagePDFConverterHandler
  def convert_to_txt!(file_path) do
    # The input file goes last, after the fixed ghostscript switches.
    {output, 0} = System.cmd("gs", @ghostscript_args ++ [file_path])
    output
  end
end
22 changes: 22 additions & 0 deletions apps/cotacoes_etl/lib/cotacoes_etl/handlers/pesagro_handler.ex
@@ -0,0 +1,22 @@
defmodule CotacoesETL.Handlers.PesagroHandler do
  @moduledoc """
  Default Pesagro handler: classifies bulletin entries and downloads bulletin
  files through the configured Pesagro API integration.
  """

  alias Cotacoes.Handlers.CotacaoHandler
  alias CotacoesETL.Handlers.IManagePesagroHandler
  alias CotacoesETL.Integrations

  @behaviour IManagePesagroHandler

  @doc """
  Returns `true` when the entry's `tipo` field is `:zip`.
  """
  @impl true
  def is_zip_file?(boletim), do: boletim.tipo == :zip

  @doc """
  Downloads the file behind `cotacao.link`, writes it inside `storage_path`
  under the cotacao's base file name, marks the cotacao as downloaded and
  returns the path of the written file.

  Raises if the write fails (`File.write!/2`) or if marking the cotacao as
  downloaded does not return `{:ok, _}`.
  """
  @impl true
  def download_boletim_from_pesagro!(storage_path, cotacao) do
    content = Integrations.pesagro_api().download_file!(cotacao.link)
    base_name = CotacaoHandler.get_cotacao_file_base_name(cotacao)

    # Path.join/2 inserts the separator when `storage_path` has no trailing
    # slash; plain concatenation would yield e.g. "/tmp/storagefile.pdf".
    file_path = Path.join(storage_path, base_name)

    File.write!(file_path, content)
    {:ok, _cotacao} = CotacaoHandler.set_cotacao_downloaded(cotacao)

    file_path
  end
end
@@ -0,0 +1,33 @@
defmodule CotacoesETL.Handlers.ZIPExtractorHandler do
  @moduledoc """
  Default ZIP extractor handler.

  Extraction requests are cast to the `CotacoesETL.Workers.ZIPExtractor`
  process; the extraction itself uses the `Unzip` library.
  """

  alias CotacoesETL.Handlers.IManageZIPExtractorHandler
  alias CotacoesETL.Workers.ZIPExtractor

  @behaviour IManageZIPExtractorHandler

  @doc """
  Casts an `{:extract, file_path, dest_path, caller}` request to the
  `ZIPExtractor` worker. Returns `:ok`.
  """
  @impl true
  def trigger_extract_zip_to_path(file_path, dest_path, caller) do
    GenServer.cast(ZIPExtractor, {:extract, file_path, dest_path, caller})
  end

  @doc """
  Extracts every file entry of the archive at `zip_path` into `storage_path`
  and returns the list of written paths.

  Directory entries are skipped and missing parent directories are created.
  Raises `ArgumentError` when an entry name would resolve outside
  `storage_path`, and `File.Error` when a directory or file cannot be written.
  """
  @impl true
  def extract_zip_to!(zip_path, storage_path) do
    zip_file = Unzip.LocalFile.open(zip_path)

    try do
      {:ok, unzip} = Unzip.new(zip_file)

      # Entries whose name ends with "/" are directories: nothing to write.
      for entry <- Unzip.list_entries(unzip), not String.ends_with?(entry.file_name, "/") do
        path = destination_path!(storage_path, entry.file_name)
        File.mkdir_p!(Path.dirname(path))

        chunks =
          unzip
          |> Unzip.file_stream!(entry.file_name)
          |> Enum.to_list()

        # File.write!/2 accepts iodata, so the chunks need no flattening.
        File.write!(path, chunks)

        path
      end
    after
      # The file handle belongs to the calling process; without an explicit
      # close it stays open for as long as that process lives.
      Unzip.LocalFile.close(zip_file)
    end
  end

  # Builds the destination path for an entry and rejects names (e.g. with
  # "..") that would resolve outside `storage_path`.
  defp destination_path!(storage_path, file_name) do
    path = Path.join(storage_path, file_name)
    root = String.trim_trailing(Path.expand(storage_path), "/") <> "/"

    if String.starts_with?(Path.expand(path), root) do
      path
    else
      raise ArgumentError,
            "zip entry #{inspect(file_name)} resolves outside #{inspect(storage_path)}"
    end
  end
end
5 changes: 0 additions & 5 deletions apps/cotacoes_etl/lib/cotacoes_etl/integrations.ex
@@ -1,12 +1,7 @@
# Resolves the integration (external API client) modules used by the ETL.
# Each function reads an optional override from the :cotacoes_etl application
# environment and falls back to the aliased default module.
# NOTE(review): the hunk header for this file reports 5 deletions (12 -> 7
# lines); the ZamzarAPI alias and zamzar_api/0 look like the removed lines —
# confirm before relying on them.
defmodule CotacoesETL.Integrations do
alias CotacoesETL.Integrations.PesagroAPI
alias CotacoesETL.Integrations.ZamzarAPI

# Module configured under :pesagro_api, defaulting to PesagroAPI.
def pesagro_api do
Application.get_env(:cotacoes_etl, :pesagro_api, PesagroAPI)
end

# Module configured under :zamzar_api, defaulting to ZamzarAPI.
def zamzar_api do
Application.get_env(:cotacoes_etl, :zamzar_api, ZamzarAPI)
end
end

This file was deleted.

69 changes: 0 additions & 69 deletions apps/cotacoes_etl/lib/cotacoes_etl/integrations/zamzar_api.ex

This file was deleted.

38 changes: 0 additions & 38 deletions apps/cotacoes_etl/lib/cotacoes_etl/schemas/zamzar/file.ex

This file was deleted.

44 changes: 0 additions & 44 deletions apps/cotacoes_etl/lib/cotacoes_etl/schemas/zamzar/job.ex

This file was deleted.