-
Notifications
You must be signed in to change notification settings - Fork 306
/
common.rb
298 lines (251 loc) · 11.5 KB
/
common.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
require "logstash/outputs/elasticsearch/template_manager"
module LogStash; module Outputs; class ElasticSearch;
  # Shared behavior for the Elasticsearch output plugin: builds the client,
  # converts Logstash events into bulk-API action tuples, submits them, and
  # retries transient failures with interruptible exponential backoff.
  module Common
    attr_reader :client, :hosts

    # These are codes for temporary recoverable conditions
    # 429 just means that ES has too much traffic ATM
    # 503 means it, or a proxy, is temporarily unavailable
    RETRYABLE_CODES = [429, 503]

    # Per-item statuses routed to the dead letter queue (when core supports it).
    DLQ_CODES = [400, 404]
    SUCCESS_CODES = [200, 201]
    CONFLICT_CODE = 409

    # When you use external versioning, you are communicating that you want
    # to ignore conflicts. More obviously, since an external version is a
    # constant part of the incoming document, we should not retry, as retrying
    # will never succeed.
    VERSION_TYPES_PERMITTING_CONFLICT = ["external", "external_gt", "external_gte"]

    def register
      @stopping = Concurrent::AtomicBoolean.new(false)
      # To support BWC, we check if DLQ exists in core (< 5.4). If it doesn't, we use nil to resort to previous behavior.
      @dlq_writer = supports_dlq? ? execution_context.dlq_writer : nil

      setup_hosts # properly sets @hosts
      build_client
      install_template
      check_action_validity

      @logger.info("New Elasticsearch output", :class => self.class.name, :hosts => @hosts.map(&:sanitized))
    end

    # Receive an array of events and immediately attempt to index them (no buffering)
    def multi_receive(events)
      if @flush_size
        events.each_slice(@flush_size) do |slice|
          retrying_submit(slice.map { |e| event_action_tuple(e) })
        end
      else
        retrying_submit(events.map { |e| event_action_tuple(e) })
      end
    end

    # Convert the event into a 3-tuple of action, params, and event
    def event_action_tuple(event)
      params = event_action_params(event)
      action = event.sprintf(@action)
      [action, params, event]
    end

    # Normalizes @hosts to an Array and defaults to localhost when unset.
    def setup_hosts
      @hosts = Array(@hosts)
      if @hosts.empty?
        @logger.info("No 'host' set in elasticsearch output. Defaulting to localhost")
        @hosts.replace(["localhost"])
      end
    end

    def install_template
      TemplateManager.install_template(self)
    end

    # Validates the configured @action: it must be either a known action or a
    # sprintf-style template that gets resolved per event.
    # @raise [LogStash::ConfigurationError] when the action is missing/invalid
    def check_action_validity
      raise LogStash::ConfigurationError, "No action specified!" unless @action

      # If we're using string interpolation, we're good!
      return if @action =~ /%{.+}/
      return if valid_actions.include?(@action)

      raise LogStash::ConfigurationError, "Action '#{@action}' is invalid! Pick one of #{valid_actions} or use a sprintf style statement"
    end

    # To be overridden by the -java version
    VALID_HTTP_ACTIONS=["index", "delete", "create", "update"]
    def valid_actions
      VALID_HTTP_ACTIONS
    end

    # Submits the actions, then keeps re-submitting whatever failed with a
    # retryable status, sleeping with exponential backoff between rounds.
    # The sleep is interruptible so plugin shutdown is not blocked.
    def retrying_submit(actions)
      # Initially we submit the full list of actions
      submit_actions = actions

      sleep_interval = @retry_initial_interval

      while submit_actions && submit_actions.length > 0
        # We retry with whatever didn't succeed
        begin
          submit_actions = submit(submit_actions)
          if submit_actions && submit_actions.size > 0
            @logger.info("Retrying individual bulk actions that failed or were rejected by the previous bulk request.", :count => submit_actions.size)
          end
        rescue => e
          # An unexpected error here must not kill the pipeline; log and retry.
          @logger.error("Encountered an unexpected error submitting a bulk request! Will retry.",
                        :error_message => e.message,
                        :class => e.class.name,
                        :backtrace => e.backtrace)
        end

        # Everything was a success!
        break if !submit_actions || submit_actions.empty?

        # If we're retrying the action sleep for the recommended interval
        # Double the interval for the next time through to achieve exponential backoff
        sleep_interval = sleep_for_interval(sleep_interval)
      end
    end

    # Sleeps interruptibly (wakes early on shutdown) and returns the next
    # backoff interval to use.
    def sleep_for_interval(sleep_interval)
      Stud.stoppable_sleep(sleep_interval) { @stopping.true? }
      next_sleep_interval(sleep_interval)
    end

    # Doubles the interval, capped at @retry_max_interval.
    def next_sleep_interval(current_interval)
      doubled = current_interval * 2
      doubled > @retry_max_interval ? @retry_max_interval : doubled
    end

    # Performs one bulk submission and classifies each item of the response.
    # @return [Array, nil] the subset of actions that should be retried, or
    #   nil when the whole request succeeded or was aborted during shutdown
    def submit(actions)
      bulk_response = safe_bulk(actions)

      # If the response is nil that means we were in a retry loop
      # and aborted since we're shutting down
      # If it did return and there are no errors we're good as well
      return if bulk_response.nil? || !bulk_response["errors"]

      actions_to_retry = []
      bulk_response["items"].each_with_index do |response, idx|
        _action_type, action_props = response.first

        status = action_props["status"]
        failure = action_props["error"] # may be nil when ES sends no error object
        action = actions[idx]

        # Retry logic: If it is success, we move on. If it is a failure, we have 3 paths:
        # - For 409, we log and drop. there is nothing we can do
        # - For a mapping error, we send to dead letter queue for a human to intervene at a later point.
        # - For everything else there's mastercard. Yep, and we retry indefinitely. This should fix #572 and other transient network issues
        if SUCCESS_CODES.include?(status)
          next
        elsif CONFLICT_CODE == status
          @logger.warn "Failed action.", status: status, action: action, response: response if log_failure_type?(failure)
          next
        elsif DLQ_CODES.include?(status)
          action_event = action[2]
          # To support bwc, we check if DLQ exists. otherwise we log and drop event (previous behavior)
          if @dlq_writer
            # TODO: Change this to send a map with { :status => status, :action => action } in the future
            @dlq_writer.write(action_event, "Could not index event to Elasticsearch. status: #{status}, action: #{action}, response: #{response}")
          else
            @logger.warn "Could not index event to Elasticsearch.", status: status, action: action, response: response
          end
          next
        else
          # only log what the user whitelisted
          @logger.info "retrying failed action with response code: #{status} (#{failure})" if log_failure_type?(failure)
          actions_to_retry << action
        end
      end

      actions_to_retry
    end

    # True when the failure should be logged: either the bulk item carried no
    # "error" object at all, or its "type" is not whitelisted by the user.
    # Guards against a nil failure, which previously raised NoMethodError and
    # caused the whole batch to be retried blindly.
    def log_failure_type?(failure)
      failure.nil? || !failure_type_logging_whitelist.include?(failure["type"])
    end

    # get the action parameters for the given event
    def event_action_params(event)
      type = get_event_type(event)

      params = {
        :_id => @document_id ? event.sprintf(@document_id) : nil,
        :_index => event.sprintf(@index),
        :_type => type,
        :_routing => @routing ? event.sprintf(@routing) : nil
      }

      if @pipeline
        params[:pipeline] = event.sprintf(@pipeline)
      end

      if @parent
        params[:parent] = event.sprintf(@parent)
      end

      if @action == 'update'
        params[:_upsert] = LogStash::Json.load(event.sprintf(@upsert)) if @upsert != ""
        params[:_script] = event.sprintf(@script) if @script != ""
        params[:_retry_on_conflict] = @retry_on_conflict
      end

      if @version
        params[:version] = event.sprintf(@version)
      end

      if @version_type
        params[:version_type] = event.sprintf(@version_type)
      end

      params
    end

    # Determine the correct value for the 'type' field for the given event
    def get_event_type(event)
      # Set the 'type' value for the index.
      type = if @document_type
               event.sprintf(@document_type)
             else
               event.get("type") || "logs"
             end

      if !(type.is_a?(String) || type.is_a?(Numeric))
        @logger.warn("Bad event type! Non-string/integer type value set!", :type_class => type.class, :type_value => type.to_s, :event => event)
      end

      type.to_s
    end

    # Rescue retryable errors during bulk submission.
    # Retries indefinitely (interruptible on shutdown) for connectivity errors
    # and retryable HTTP codes; drops the request for non-retryable codes.
    def safe_bulk(actions)
      sleep_interval = @retry_initial_interval
      begin
        es_actions = actions.map { |action_type, params, event| [action_type, params, event.to_hash] }
        response = @client.bulk(es_actions)
        response
      rescue ::LogStash::Outputs::ElasticSearch::HttpClient::Pool::HostUnreachableError => e
        # If we can't even connect to the server let's just print out the URL (:hosts is actually a URL)
        # and let the user sort it out from there
        @logger.error(
          "Attempted to send a bulk request to elasticsearch" +
            " but Elasticsearch appears to be unreachable or down!",
          :error_message => e.message,
          :class => e.class.name,
          :will_retry_in_seconds => sleep_interval
        )
        @logger.debug("Failed actions for last bad bulk request!", :actions => actions)

        # We retry until there are no errors! Errors should all go to the retry queue
        sleep_interval = sleep_for_interval(sleep_interval)
        retry unless @stopping.true?
      rescue ::LogStash::Outputs::ElasticSearch::HttpClient::Pool::NoConnectionAvailableError => e
        @logger.error(
          "Attempted to send a bulk request to elasticsearch, but there are no living connections in the connection pool. Perhaps Elasticsearch is unreachable or down?",
          :error_message => e.message,
          :class => e.class.name,
          :will_retry_in_seconds => sleep_interval
        )
        sleep_interval = sleep_for_interval(sleep_interval)
        retry unless @stopping.true?
      rescue ::LogStash::Outputs::ElasticSearch::HttpClient::Pool::BadResponseCodeError => e
        if RETRYABLE_CODES.include?(e.response_code)
          log_hash = {:code => e.response_code, :url => e.url.sanitized}
          log_hash[:body] = e.body if @logger.debug? # Generally this is too verbose
          message = "Encountered a retryable error. Will Retry with exponential backoff "

          # We treat 429s as a special case because these really aren't errors, but
          # rather just ES telling us to back off a bit, which we do.
          # The other retryable code is 503, which are true errors
          # Even though we retry the user should be made aware of these
          if e.response_code == 429
            logger.debug(message, log_hash)
          else
            logger.error(message, log_hash)
          end

          sleep_interval = sleep_for_interval(sleep_interval)
          retry
        else
          log_hash = {:code => e.response_code,
                      :response_body => e.response_body}
          log_hash[:request_body] = e.request_body if @logger.debug?
          @logger.error("Got a bad response code from server, but this code is not considered retryable. Request will be dropped", log_hash)
        end
      rescue => e
        # Stuff that should never happen
        # For all other errors print out full connection issues
        @logger.error(
          "An unknown error occurred sending a bulk request to Elasticsearch. We will retry indefinitely",
          :error_message => e.message,
          :error_class => e.class.name,
          :backtrace => e.backtrace
        )
        @logger.debug("Failed actions for last bad bulk request!", :actions => actions)

        # We retry until there are no errors! Errors should all go to the retry queue
        sleep_interval = sleep_for_interval(sleep_interval)
        retry unless @stopping.true?
      end
    end

    # DLQ support only exists in Logstash core >= 5.4.
    def supports_dlq?
      respond_to?(:execution_context) && execution_context.respond_to?(:dlq_writer)
    end
  end
end; end; end