Skip to content

Commit

Permalink
Adapter refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
nbulaj committed Nov 24, 2017
1 parent 6493936 commit 7ddfc50
Show file tree
Hide file tree
Showing 24 changed files with 230 additions and 148 deletions.
2 changes: 1 addition & 1 deletion .rubocop.yml
@@ -1,7 +1,7 @@
LineLength:
Max: 120
AllCops:
TargetRubyVersion: 2.4
TargetRubyVersion: 2.1
Exclude:
- 'spec/**/*'
- 'bin/*'
Expand Down
3 changes: 3 additions & 0 deletions Gemfile
Expand Up @@ -2,6 +2,9 @@ source 'https://rubygems.org'

gemspec

gem 'nokogiri', '~> 1.8'
gem 'oga', '~> 2.0'

group :test do
gem 'coveralls', require: false
gem 'evil-proxy'
Expand Down
9 changes: 5 additions & 4 deletions lib/proxy_fetcher.rb
@@ -1,6 +1,5 @@
require 'uri'
require 'net/https'
require 'nokogiri'

require File.dirname(__FILE__) + '/proxy_fetcher/exceptions'
require File.dirname(__FILE__) + '/proxy_fetcher/configuration'
Expand All @@ -15,9 +14,11 @@
require File.dirname(__FILE__) + '/proxy_fetcher/client/request'
require File.dirname(__FILE__) + '/proxy_fetcher/client/proxies_registry'

require File.dirname(__FILE__) + '/proxy_fetcher/parser/document'
require File.dirname(__FILE__) + '/proxy_fetcher/parser/node'
require File.dirname(__FILE__) + '/proxy_fetcher/parser/adapters/abstract'
require File.dirname(__FILE__) + '/proxy_fetcher/document'
require File.dirname(__FILE__) + '/proxy_fetcher/document/node'
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/abstract'
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/nokogiri'
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/oga'

module ProxyFetcher
module Providers
Expand Down
17 changes: 15 additions & 2 deletions lib/proxy_fetcher/configuration.rb
@@ -1,7 +1,7 @@
module ProxyFetcher
class Configuration
attr_accessor :providers, :timeout, :pool_size, :user_agent
attr_accessor :http_client, :proxy_validator
attr_accessor :timeout, :pool_size, :user_agent
attr_reader :http_client, :proxy_validator, :providers, :adapter

# rubocop:disable Metrics/LineLength
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112 Safari/537.36'.freeze
Expand Down Expand Up @@ -33,6 +33,19 @@ def reset!
@proxy_validator = ProxyValidator

self.providers = self.class.registered_providers
self.adapter = :nokogiri
end

# Selects the document parsing adapter and prepares it for use.
#
# A Symbol or String is converted to a constant name and looked up inside
# ProxyFetcher::Document::Adapters (+:nokogiri+ -> +Nokogiri+,
# +:fast_html+ -> +FastHtml+). Any other value is used as the adapter as is.
# After the adapter is resolved its +setup!+ class method is invoked.
#
# @param name_or_class [Symbol, String, Object] adapter name or adapter class
# @return [Object] the configured adapter
# @raise [NameError] if no adapter constant matches the given name
def adapter=(name_or_class)
  @adapter = case name_or_class
             when Symbol, String
               # Capitalize every underscore-separated part so that multi-word
               # names map to CamelCase constants instead of "Fast_html".
               const_name = name_or_class.to_s.split('_').map(&:capitalize).join

               # inherit = false keeps the lookup inside the Adapters namespace;
               # otherwise a name such as :string would resolve to ::String.
               ProxyFetcher::Document::Adapters.const_get(const_name, false)
             else
               name_or_class
             end

  @adapter.setup!
  @adapter
end

def providers=(value)
Expand Down
@@ -1,8 +1,8 @@
module ProxyFetcher
class Document
class << self
def parse(data, adapter:, options: {})
new(adapter.parse(data, options))
def parse(data, adapter:)
new(adapter.parse(data))
end
end

Expand All @@ -17,7 +17,7 @@ def xpath(*args)
end

def css(*args)
backend.css(*args).map { |node| backend.proxy_node.new(node)}
backend.css(*args).map { |node| backend.proxy_node.new(node) }
end
end
end
27 changes: 27 additions & 0 deletions lib/proxy_fetcher/document/adapters/abstract.rb
@@ -0,0 +1,27 @@
module ProxyFetcher
  class Document
    module Adapters
      # Base class for document parsing adapters.
      #
      # An instance wraps a document object produced by a parsing backend and
      # forwards XPath / CSS queries to it. Concrete adapters subclass this
      # class and provide the class-level +setup!+ and +parse+ hooks.
      class Abstract
        # @return [Object] the wrapped backend document
        attr_reader :doc

        # Prepares the backend for use (for example, requires its library).
        # The default implementation does nothing, so adapters without
        # external requirements do not have to define it.
        #
        # @return [nil]
        def self.setup!(*)
        end

        # Builds an adapter instance from raw markup.
        # Must be overridden in a descendant class.
        #
        # @param _data [String] raw document markup
        # @raise [RuntimeError] always, in this base implementation
        def self.parse(_data)
          raise "#{__method__} must be implemented in descendant class!"
        end

        # @param doc [Object] backend document responding to +xpath+ and +css+
        def initialize(doc)
          @doc = doc
        end

        # Runs an XPath query against the wrapped document.
        # You can override this method in your class.
        #
        # @param selector [String] XPath expression
        # @return [Object] whatever the backend returns for the query
        def xpath(selector)
          doc.xpath(selector)
        end

        # Runs a CSS query against the wrapped document.
        # You can override this method in your class.
        #
        # @param selector [String] CSS selector
        # @return [Object] whatever the backend returns for the query
        def css(selector)
          doc.css(selector)
        end

        # Class used to wrap the nodes found by the backend.
        #
        # @return [Class] ProxyFetcher::Document::Node unless overridden
        def proxy_node
          ::ProxyFetcher::Document::Node
        end
      end
    end
  end
end
45 changes: 45 additions & 0 deletions lib/proxy_fetcher/document/adapters/nokogiri.rb
@@ -0,0 +1,45 @@
module ProxyFetcher
  class Document
    module Adapters
      # Document adapter backed by the Nokogiri library.
      class Nokogiri < Abstract
        # Loads the Nokogiri library. The require is deliberately done here
        # rather than at file load time so the library is only needed when
        # this adapter is actually set up.
        def self.setup!(*)
          require 'nokogiri'
        end

        # Parses HTML markup and wraps the resulting document in an adapter.
        #
        # @param data [String] HTML markup
        # @return [ProxyFetcher::Document::Adapters::Nokogiri]
        def self.parse(data)
          # The leading :: selects the top-level Nokogiri module, not this class.
          new(::Nokogiri::HTML(data))
        end

        # @return [Class] the Nokogiri-specific node wrapper defined below
        def proxy_node
          Node
        end

        # Wrapper around a node returned by Nokogiri queries.
        class Node < ProxyFetcher::Document::Node
          # Finds the first node matching an XPath expression and wraps it.
          def at_xpath(*args)
            self.class.new(node.at_xpath(*args))
          end

          # Finds the first node matching a CSS selector and wraps it.
          def at_css(*args)
            self.class.new(node.at_css(*args))
          end

          # Returns the value of the given attribute of the wrapped node.
          def attr(*args)
            node.attr(*args)
          end

          # Returns the cleaned text content of the node found by +find+.
          #
          # @return [String, nil] nil when nothing was found or the text is empty
          def content_at(*args)
            clear(find(*args).content)
          end

          # Text content of the wrapped node.
          #
          # +find+ wraps a missing match as a node holding nil, so nil is
          # returned here instead of raising NoMethodError; +clear+ already
          # treats nil text as "no content".
          #
          # @return [String, nil]
          def content
            node && node.content
          end

          # Inner HTML of the wrapped node.
          def html
            node.inner_html
          end
        end
      end
    end
  end
end
45 changes: 45 additions & 0 deletions lib/proxy_fetcher/document/adapters/oga.rb
@@ -0,0 +1,45 @@
module ProxyFetcher
  class Document
    module Adapters
      # Document adapter backed by the Oga library.
      class Oga < Abstract
        # Loads the Oga library. The require is deliberately done here rather
        # than at file load time so the library is only needed when this
        # adapter is actually set up.
        def self.setup!(*)
          require 'oga'
        end

        # Parses HTML markup and wraps the resulting document in an adapter.
        #
        # @param data [String] HTML markup
        # @return [ProxyFetcher::Document::Adapters::Oga]
        def self.parse(data)
          # The leading :: selects the top-level Oga module, not this class.
          new(::Oga.parse_html(data))
        end

        # @return [Class] the Oga-specific node wrapper defined below
        def proxy_node
          Node
        end

        # Wrapper around a node returned by Oga queries.
        class Node < ProxyFetcher::Document::Node
          # Finds the first node matching an XPath expression and wraps it.
          def at_xpath(*args)
            self.class.new(node.at_xpath(*args))
          end

          # Finds the first node matching a CSS selector and wraps it.
          def at_css(*args)
            self.class.new(node.at_css(*args))
          end

          # Returns the value of the given attribute of the wrapped node.
          #
          # @return [String, nil] nil when the lookup yields no attribute,
          #   mirroring what the Nokogiri adapter's +attr+ returns
          def attr(*args)
            attribute = node.attribute(*args)
            attribute && attribute.value
          end

          # Returns the cleaned text content of the node found by +find+.
          #
          # @return [String, nil] nil when nothing was found or the text is empty
          def content_at(*args)
            clear(find(*args).content)
          end

          # Text content of the wrapped node.
          #
          # +find+ wraps a missing match as a node holding nil, so nil is
          # returned here instead of raising NoMethodError; +clear+ already
          # treats nil text as "no content".
          #
          # @return [String, nil]
          def content
            node && node.text
          end

          # XML serialization of the wrapped node.
          def html
            node.to_xml
          end
        end
      end
    end
  end
end
31 changes: 31 additions & 0 deletions lib/proxy_fetcher/document/node.rb
@@ -0,0 +1,31 @@
module ProxyFetcher
  class Document
    # Backend-independent wrapper around a single document node.
    #
    # Descendant classes supply +content+ and +html+ for a concrete parsing
    # backend; lookup and text normalization are shared here.
    class Node
      # @return [Object] the wrapped backend node
      attr_reader :node

      # @param node [Object] backend node to wrap
      def initialize(node)
        @node = node
      end

      # Looks up a child node and wraps it in the same node class.
      #
      # @param selector [String] selector passed to the backend node
      # @param method [Symbol] name of the backend lookup method to call
      # @return [ProxyFetcher::Document::Node] wrapper around the result
      #   (the wrapped value is whatever the backend returned)
      def find(selector, method = :at_xpath)
        self.class.new(node.public_send(method, selector))
      end

      # Returns the normalized text content of the node found by +find+.
      # Defined here so every node class offers it, including custom ones
      # that only implement +content+ and +html+.
      #
      # @param args [Array] arguments forwarded to +find+
      # @return [String, nil] nil when the content is nil or empty
      def content_at(*args)
        clear(find(*args).content)
      end

      # Text content of the node; must be provided by a descendant class.
      #
      # @raise [RuntimeError] always, in this base implementation
      def content
        raise "#{__method__} must be implemented in descendant class!"
      end

      # Markup of the node; must be provided by a descendant class.
      #
      # @raise [RuntimeError] always, in this base implementation
      def html
        raise "#{__method__} must be implemented in descendant class!"
      end

      protected

      # Normalizes text by stripping it and removing every space and tab.
      #
      # @param text [String, nil]
      # @return [String, nil] nil for nil or empty input
      def clear(text)
        return if text.nil? || text.empty?

        text.strip.gsub(/[ \t]/, '')
      end
    end
  end
end
9 changes: 0 additions & 9 deletions lib/proxy_fetcher/parser.rb

This file was deleted.

23 changes: 0 additions & 23 deletions lib/proxy_fetcher/parser/adapters/abstract.rb

This file was deleted.

7 changes: 0 additions & 7 deletions lib/proxy_fetcher/parser/adapters/nokogiri/node.rb

This file was deleted.

24 changes: 0 additions & 24 deletions lib/proxy_fetcher/parser/adapters/nokogiri/nokogiri.rb

This file was deleted.

21 changes: 0 additions & 21 deletions lib/proxy_fetcher/parser/node.rb

This file was deleted.

7 changes: 4 additions & 3 deletions lib/proxy_fetcher/providers/base.rb
Expand Up @@ -28,7 +28,8 @@ def load_document(url, filters = {})
uri = URI.parse(url)
uri.query = URI.encode_www_form(filters) if filters && filters.any?

Nokogiri::HTML(ProxyFetcher.config.http_client.fetch(uri.to_s))
html = ProxyFetcher.config.http_client.fetch(uri.to_s)
ProxyFetcher::Document.parse(html, adapter: ProxyFetcher.config.adapter)
end

# Get HTML elements with proxy info
Expand All @@ -42,8 +43,8 @@ def to_proxy(*)
end

# Return normalized HTML element content by selector
def parse_element(parent, selector, method = :at_xpath)
clear(parent.public_send(method, selector).content)
# Finds an element below +node+ and returns its normalized text content.
#
# @param node [Object] node responding to the lookup +method+
# @param selector [String] selector passed to the lookup method
# @param method [Symbol] name of the lookup method to call on +node+
# @return [Object] the result of +clear+ applied to the element content
def extract_content(node, selector, method = :at_xpath)
  # Read the element's text via +content+; document nodes expose no
  # zero-argument +parse+ method, so calling it here would always fail.
  clear(node.public_send(method, selector).content)
end
end
end
Expand Down
16 changes: 8 additions & 8 deletions lib/proxy_fetcher/providers/free_proxy_list.rb
Expand Up @@ -9,20 +9,20 @@ def load_proxy_list(*)
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
end

def to_proxy(html_element)
def to_proxy(html_node)
ProxyFetcher::Proxy.new.tap do |proxy|
proxy.addr = parse_element(html_element, 'td[1]')
proxy.port = convert_to_int(parse_element(html_element, 'td[2]'))
proxy.country = parse_element(html_element, 'td[4]')
proxy.anonymity = parse_element(html_element, 'td[5]')
proxy.type = parse_type(html_element)
proxy.addr = html_node.content_at('td[1]')
proxy.port = convert_to_int(html_node.content_at('td[2]'))
proxy.country = html_node.content_at('td[4]')
proxy.anonymity = html_node.content_at('td[5]')
proxy.type = parse_type(html_node)
end
end

private

def parse_type(element)
https = parse_element(element, 'td[6]')
# Determines the proxy type from the text of the row's +td[6]+ cell.
#
# @param html_node [Object] row node responding to +content_at+
# @return [Object] ProxyFetcher::Proxy::HTTPS when the cell text equals
#   "yes" (case-insensitively), ProxyFetcher::Proxy::HTTP otherwise,
#   including when the cell has no content
def parse_type(html_node)
  https_flag = html_node.content_at('td[6]')
  return ProxyFetcher::Proxy::HTTP unless https_flag

  if https_flag.casecmp('yes').zero?
    ProxyFetcher::Proxy::HTTPS
  else
    ProxyFetcher::Proxy::HTTP
  end
end
end
Expand Down
10 changes: 5 additions & 5 deletions lib/proxy_fetcher/providers/free_proxy_list_ssl.rb
Expand Up @@ -9,12 +9,12 @@ def load_proxy_list(*)
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
end

def to_proxy(html_element)
def to_proxy(html_node)
ProxyFetcher::Proxy.new.tap do |proxy|
proxy.addr = parse_element(html_element, 'td[1]')
proxy.port = convert_to_int(parse_element(html_element, 'td[2]'))
proxy.country = parse_element(html_element, 'td[4]')
proxy.anonymity = parse_element(html_element, 'td[5]')
proxy.addr = html_node.content_at('td[1]')
proxy.port = convert_to_int(html_node.content_at('td[2]'))
proxy.country = html_node.content_at('td[4]')
proxy.anonymity = html_node.content_at('td[5]')
proxy.type = ProxyFetcher::Proxy::HTTPS
end
end
Expand Down

0 comments on commit 7ddfc50

Please sign in to comment.