Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Trying to use tribalvibes' fork of Ohm

  • Loading branch information...
commit 1e272325c0cca1d6c8eee3c78d2f9d6c1bc6379c 1 parent 0d35240
@mcmire authored
View
12 Gemfile
@@ -5,11 +5,16 @@ source :rubygems
# right version of Bundler
gem 'bundler', '~> 1.1.0'
-gem 'rails', '3.2.3'
+# gem 'rails', '3.2.3'
+gem 'tzinfo' # activesupport should be requiring this but it doesn't :(
+gem 'activesupport', '3.2.3'
+gem 'actionpack', '3.2.3'
gem 'redis', '3.0.0'
-gem 'ohm', '0.1.5'
-gem 'ohm-contrib', '0.1.2', :require => 'ohm/contrib'
+# gem 'ohm', '0.1.5'
+gem 'ohm-contrib', '1.0.1', :require => false
+# CAN'T USE THIS... IT'S A FORK OF 0.1 AND OHM IS UP TO 1.0 NOW :(
+gem 'ohm', :git => 'http://github.com/tribalvibes/ohm'
#---
@@ -25,7 +30,6 @@ gem 'active_hash', '0.9.10'
gem 'colored', '1.2', :require => false
# Provides String#to_ascii which is useful when screenscraping
gem 'stringex', '1.4.0'
-# gem 'queue_classic', '2.0.0'
gem 'map', '5.7.0'
gem 'logging', '1.7.2'
gem 'logging-rails', '0.4.0'
View
43 Gemfile.lock
@@ -1,9 +1,13 @@
+GIT
+ remote: http://github.com/tribalvibes/ohm
+ revision: d862f19ffbb60d5958757c9a39ba09a73c37305c
+ specs:
+ ohm (0.1.3)
+ nest (~> 1.0)
+
GEM
remote: http://rubygems.org/
specs:
- actionmailer (3.2.3)
- actionpack (= 3.2.3)
- mail (~> 2.4.4)
actionpack (3.2.3)
activemodel (= 3.2.3)
activesupport (= 3.2.3)
@@ -19,19 +23,10 @@ GEM
activemodel (3.2.3)
activesupport (= 3.2.3)
builder (~> 3.0.0)
- activerecord (3.2.3)
- activemodel (= 3.2.3)
- activesupport (= 3.2.3)
- arel (~> 3.0.2)
- tzinfo (~> 0.3.29)
- activeresource (3.2.3)
- activemodel (= 3.2.3)
- activesupport (= 3.2.3)
activesupport (3.2.3)
i18n (~> 0.6)
multi_json (~> 1.0)
addressable (2.2.8)
- arel (3.0.2)
awesome_print (1.0.2)
bourbon (2.1.0)
sass (>= 3.1)
@@ -77,10 +72,6 @@ GEM
little-plugger (>= 1.1.3)
logging-rails (0.4.0)
logging (~> 1.6)
- mail (2.4.4)
- i18n (>= 0.4.0)
- mime-types (~> 1.16)
- treetop (~> 1.4.8)
map (5.7.0)
method_source (0.7.1)
mime-types (1.18)
@@ -90,11 +81,8 @@ GEM
redis
netrc (0.7.1)
nokogiri (1.5.2)
- ohm (0.1.5)
- nest (~> 1.0)
ohm-contrib (0.1.2)
ohm
- polyglot (0.3.3)
pry (0.9.9.6)
coderay (~> 1.0.5)
method_source (~> 0.7.1)
@@ -106,14 +94,6 @@ GEM
rack
rack-test (0.6.1)
rack (>= 1.0)
- rails (3.2.3)
- actionmailer (= 3.2.3)
- actionpack (= 3.2.3)
- activerecord (= 3.2.3)
- activeresource (= 3.2.3)
- activesupport (= 3.2.3)
- bundler (~> 1.0)
- railties (= 3.2.3)
railties (3.2.3)
actionpack (= 3.2.3)
activesupport (= 3.2.3)
@@ -150,9 +130,6 @@ GEM
rack (>= 1.0.0)
thor (0.14.6)
tilt (1.3.3)
- treetop (1.4.10)
- polyglot
- polyglot (>= 0.3.1)
tzinfo (0.3.33)
uglifier (1.2.4)
execjs (>= 0.3.0)
@@ -163,7 +140,9 @@ PLATFORMS
ruby
DEPENDENCIES
+ actionpack (= 3.2.3)
active_hash (= 0.9.10)
+ activesupport (= 3.2.3)
awesome_print
bourbon (~> 2.1.0)
bundler (~> 1.1.0)
@@ -178,10 +157,9 @@ DEPENDENCIES
map (= 5.7.0)
mustache (= 0.99.4)
nokogiri (= 1.5.2)
- ohm (= 0.1.5)
+ ohm!
ohm-contrib (= 0.1.2)
pry
- rails (= 3.2.3)
redis (= 3.0.0)
sass-rails (~> 3.2.3)
simple-navigation (= 3.7.0)
@@ -189,5 +167,6 @@ DEPENDENCIES
stringex (= 1.4.0)
therubyracer (~> 0.9.8)
thin (~> 1.3.1)
+ tzinfo
uglifier (>= 1.0.3)
yajl-ruby (= 1.1.0)
View
17 app/models/category.rb
@@ -1,22 +1,19 @@
class Category < Ohm::Model
include Hardwarepedia::ModelMixins::RequiresFields
- include Ohm::DataTypes
- include Ohm::Timestamps
+ include Ohm::Serialized
+
+ def self.with_chipsets
+ ["Graphics Cards"]
+ end
attribute :name
- attribute :webkey
- attribute :state, Type::Integer
+ attribute :webkey, :default => lambda {|c| c.name.parameterize }
+ attribute :state, :default => 0
unique :name
unique :webkey
requires_fields :name, :webkey, :state
-
- def initialize(attrs={})
- super(attrs)
- self.webkey ||= name.parameterize
- self.state ||= 0
- end
end
View
9 app/models/image.rb
@@ -1,13 +1,16 @@
class Image < Ohm::Model
- include Hardwarepedia::ModelMixins::RequiresFields
- include Ohm::DataTypes
+ include Ohm::Serialized
include Ohm::Timestamps
+ include Hardwarepedia::ModelMixins::RequiresFields
reference :reviewable, :Reviewable
+ reference :reviewable_url, :Url
attribute :url
attribute :caption
- requires_fields :reviewable_id, :url
+ unique :url
+
+ requires_fields :reviewable_id, :reviewable_url, :url
end
View
13 app/models/manufacturer.rb
@@ -1,20 +1,19 @@
class Manufacturer < Ohm::Model
+ include Hardwarepedia::ModelMixins::RequiresFields
+ include Ohm::Serialized
+
collection :reviewables
collection :products
collection :chipsets
- validates_presence_of :name
- validates_uniqueness_of :name
+ attribute :name
+ attribute :webkey, :default => lambda {|m| m.name.parameterize }
- before_save :_set_webkey
+ requires_fields :name, :webkey
def to_param
webkey
end
-
- def _set_webkey
- self.webkey = name.parameterize
- end
end
View
14 app/models/price.rb
@@ -1,5 +1,17 @@
-class Price < ActiveRecord::Base
+class Price < Ohm::Model
+ include Ohm::Serialized
+ include Ohm::Timestamps
+ include Hardwarepedia::ModelMixins::RequiresFields
+
+ reference :reviewable, :Reviewable
+ reference :reviewable_url, :Url
+ attribute :amount, Integer
+
+ unique :reviewable_url
+
+ requires_fields :reviewable_id, :reviewable_url, :amount
+
def retailer_name
@retailer_name ||= begin
host = URI.parse(url).host.sub(%r{^www\.}, "")
View
9 app/models/product.rb
@@ -1,12 +1,11 @@
class Product < Reviewable
- belongs_to :category
- belongs_to :manufacturer
- belongs_to :chipset
+ reference :chipset, :Chipset
- validates_presence_of :chipset_id, :if => [:complete?, :_chipset_needed?]
+ requires_fields :chipset_id,
+ :if => [:complete?, :_chipset_needed?]
def _chipset_needed?
- category.name == "Graphics Cards"
+ Category.with_chipsets.include?(category.name)
end
end
View
19 app/models/rating.rb
@@ -1,9 +1,24 @@
-class Rating < ActiveRecord::Base
- belongs_to :reviewable
+class Rating < Ohm::Model
+ include Ohm::Timestamps
+ include Hardwarepedia::ModelMixins::RequiresFields
+
+ reference :reviewable, :Reviewable
+ attribute :reviewable_url, :Url
+ attribute :raw_value
+ attribute :value, Float
+ attribute :num_reviews, Integer
+
+ unique :reviewable_url
+
+ requires_fields :reviewable_id, :reviewable_url, :raw_value, :value, :num_reviews
before_save :_interpret_raw_value
+ def before_save
+ _interpret_raw_value
+ end
+
def retailer_name
@retailer_name ||= begin
host = URI.parse(url).host.sub(%r{^www\.}, "")
View
33 app/models/reviewable.rb
@@ -5,44 +5,37 @@ class Reviewable < Ohm::Model
# field that holds info scraped from a URL
include Hardwarepedia::ModelMixins::RequiresFields
- include Ohm::DataTypes
+ include Ohm::Serialized
include Ohm::Timestamps
reference :manufacturer, :Manufacturer
reference :category, :Category
- reference :chipset, :Chipset
attribute :type # one of Chipset or Product
attribute :name
- attribute :full_name
- attribute :webkey
+ attribute :full_name, :default => lambda {|r| manufacturer && [manufacturer.name, name].join(" ") }
+ attribute :webkey, :default => lambda {|r| r.full_name.try(:parameterize) }
attribute :summary
- attribute :num_reviews, Type::Integer
- attribute :specs, Type::Hash
+ attribute :num_reviews, Integer
+ attribute :specs, Hash
set :content_urls
set :official_urls
set :mention_urls
- attribute :market_release_date, Type::Date
- attribute :state, Type::Integer
+ attribute :market_release_date, Date
+ attribute :state, Integer, :default => 0
- collection :images
- collection :prices
- collection :ratings
- collection :reviews
+ set :images, :Image
+ set :prices, :Price
+ set :ratings, :Rating
+ set :reviews, :Review
requires_fields \
:manufacturer_id, :category_id, :name, :first_price, :specs, :content_urls,
:if => :complete?
fails_save_with("Must have one price") {|r| r.prices.empty? }
- def initialize(attrs={})
- super(attrs)
- self.full_name ||= (
- manufacturer && [manufacturer.name, name].join(" ")
- )
- self.webkey ||= full_name.try(:parameterize)
- self.state ||= 0
- end
+ unique :full_name
+ unique :webkey
def incomplete?
state == 0
View
10 app/models/url.rb
@@ -3,9 +3,9 @@
class Url
include Hardwarepedia::ModelMixins::RequiresFields
- include Ohm::DataTypes
+ include Ohm::Serialized
include Ohm::Timestamps
- include Ohm::Expiration
+ include Ohm::Expiration # our extension
# Delete all urls, or urls of a certain type
def self.delete_all(opts={})
@@ -20,8 +20,10 @@ def self.delete_all(opts={})
attribute :url
attribute :content_html
attribute :content_digest
- attribute :state, Type::Integer
- attribute :last_fetched_at, Type::Time
+ attribute :state, Integer
+ attribute :last_fetched_at, Time
+
+ unique :url
expire_in 2.hours
View
4 config/application.rb
@@ -40,13 +40,13 @@ class Application < Rails::Application
# your app. As such, your models will need to explicitly whitelist or
# blacklist accessible parameters by using an attr_accessible or
# attr_protected declaration.
- config.active_record.whitelist_attributes = false
+ # config.active_record.whitelist_attributes = false
# Enable the asset pipeline
config.assets.enabled = true
# Version of your assets, change this if you want to expire all your assets
config.assets.version = '1.0'
- config.cache_store = [:file_store, Rails.root.join('tmp/cache'), :expires_in => 1.day]
+ # config.cache_store = [:file_store, Rails.root.join('tmp/cache'), :expires_in => 1.day]
end
end
View
6 config/environments/development.rb
@@ -14,7 +14,7 @@
config.action_controller.perform_caching = false
# Don't care if the mailer can't send
- config.action_mailer.raise_delivery_errors = false
+ # config.action_mailer.raise_delivery_errors = false
# Print deprecation notices to the Rails logger
config.active_support.deprecation = :log
@@ -23,11 +23,11 @@
config.action_dispatch.best_standards_support = :builtin
# Raise exception on mass assignment protection for Active Record models
- config.active_record.mass_assignment_sanitizer = :strict
+ # config.active_record.mass_assignment_sanitizer = :strict
# Log the query plan for queries taking more than this (works
# with SQLite, MySQL, and PostgreSQL)
- config.active_record.auto_explain_threshold_in_seconds = 0.5
+ # config.active_record.auto_explain_threshold_in_seconds = 0.5
# Do not compress assets
config.assets.compress = false
View
20 lib/hardwarepedia/extensions/vendor/ohm.rb
@@ -1,7 +1,23 @@
class Ohm::Model
- def self.find_or_create(attrs)
- find(attrs) || create(attrs)
+ def self.first_or_create(opts, attrs)
+ object = find(method, opts).first
+ if object
+ object.update_attributes(attrs)
+ object.save
+ else
+ create(attrs.merge(opts))
+ end
+ end
+
+ def self.with_or_create(opts, attrs)
+ object = with(method, opts)
+ if object
+ object.update_attributes(attrs)
+ object.save
+ else
+ create(attrs.merge(opts))
+ end
end
end
View
2  lib/hardwarepedia/model_mixins/requires_fields.rb
@@ -1,4 +1,6 @@
+require 'ohm/callbacks'
+
module Hardwarepedia
module ModelMixins
module RequiresFields
View
8 lib/hardwarepedia/scraper.rb
@@ -45,7 +45,7 @@ def visiting(page, url, type, &block)
page.preprocess!(node_set) if page.respond_to?(:preprocess!)
content_html = node_set.to_html
u2 = Url.new(content_html)
- if u = Url.find(url)
+ if u = Url.with(:url => url)
# logger.info "Url: #{url}"
# require 'diffy'
# diff = Diffy::Diff.new(u.content_html, content_html)
@@ -75,7 +75,9 @@ def visiting(page, url, type, &block)
else
# We haven't scraped this URL yet, so add it to the database.
logger.info "Haven't scraped <#{url}> yet, content md5 is #{content_digest}"
- u = Url.create(type, url,
+ u = Url.create(
+ :type => type,
+ :url => url,
:content_html => content_html,
:content => content_digest
)
@@ -88,7 +90,7 @@ def visiting(page, url, type, &block)
end
def find_or_create_category(category_name)
- Category.find_or_create(category_name)
+ Category.with_or_create(:name => category_name)
end
def scrape_products
View
58 lib/hardwarepedia/scraper/category_page_scraper.rb
@@ -3,12 +3,22 @@ module Hardwarepedia
class Scraper
class CategoryPageScraper
- def self.cache_key_namespace
- 'retailer_category_product_urls'
- end
+ KEY_NS = 'retailer_category_product_urls'
+ EXPIRES_IN = 1.day
def self.clear_cache
- Rails.cache.delete_matched(/^#{cache_key_namespace}::/)
+ db.hkeys(key[:retailers]).each do |rname|
+ db.del(key[:retailers][rname]
+ end
+ db.del(key[:retailers])
+ end
+
+ def self.db
+ Ohm.redis
+ end
+
+ def self.key
+ @key ||= Nest.new(KEY_NS, db)
end
attr_reader :scraper, :page, :retailer, :category
@@ -26,16 +36,8 @@ def call
_scrape_product_urls(product_urls)
end
- def _cache_key
- Hardwarepedia::Util.cache_key(
- self.class.cache_key_namespace,
- @retailer.name,
- @category.name,
- )
- end
-
def _get_category_product_urls
- Rails.cache.fetch(_cache_key) {
+ cached {
# Visit the first page to get the total number of pages, then use the
# pagination links on that page to go through the rest of the pages
# and get a list of product urls
@@ -67,9 +69,7 @@ def _collect_remaining_product_urls!(doc, all_product_urls)
old_each_url = each_url
each_url = proc do |page_url|
threads << Thread.new do
- ActiveRecord::Base.connection_pool.with_connection do
- old_each_url.call(page_url)
- end
+ old_each_url.call(page_url)
end
end
old_each_urls = each_urls
@@ -109,9 +109,7 @@ def _scrape_product_urls(all_product_urls)
old_each_url = each_url
each_url = proc do |url|
threads << Thread.new do
- ActiveRecord::Base.connection_pool.with_connection do
- old_each_url.call(url)
- end
+ old_each_url.call(url)
end
end
old_each_urls = each_urls
@@ -129,6 +127,28 @@ def _scrape_product_urls(all_product_urls)
each_urls.call(all_product_urls)
end
+
+ def cached(&block)
+ rname = @retailer.name
+ cname = @category.name
+ if json = db.hget(key[:retailers][rname], cname)
+ Yajl::Parser.parse(json)
+ else
+ all_product_urls = block.call
+ db.sadd(key[:retailers], rname)
+ db.hset(key[:retailers][rname], cname, Yajl::Encoder.encode(json))
+ db.expire(key[:retailer][rname], EXPIRES_IN)
+ all_product_urls
+ end
+ end
+
+ def key
+ self.class.key
+ end
+
+ def db
+ self.class.db
+ end
end
end
View
103 lib/hardwarepedia/scraper/product_page_scraper.rb
@@ -44,20 +44,20 @@ def call
@product.chipset = @chipset =
_find_or_create_chipset_product(chipset_model_name)
- product.images = _scrape_images
- product.prices = _scrape_prices
- product.ratings = _scrape_ratings
+ _scrape_images
+ _scrape_prices
+ _scrape_ratings
- logger.info "Saving product record for '#{product.full_name}'"
- product.state = 1
+ logger.info "Saving product record for '#{@product.full_name}'"
+ @product.state = 1
# Double check that the product doesn't already exist in the event that
# it has already been processed in another thread
- if existing_product = Product.where(:full_name => product.full_name, :state => 1).first
- existing_product.update_attributes!(product.attributes)
- else
- product.save!
- end
+ # if existing_product = Product.find(:full_name => product.full_name, :state => 1).first
+ # existing_product.update_attributes!(product.attributes)
+ # else
+ @product.save!
+ # end
end
rescue Error => e
logger.error e
@@ -77,32 +77,16 @@ def _scrape_manufacturer_name
end
def _find_or_create_product
- if product = Product.where(:full_name => @full_name).first
- logger.info "(Found product '#{@full_name}', updating)"
- product.state = 0
- product.save!
- else
- logger.info "Creating product '#{@full_name}'"
- product = Product.create!(
- :category => @category,
- :name => @model_name,
- :full_name => @full_name,
- :state => 0
- )
- end
- product
+ Product.with_or_create({:full_name => @full_name},
+ :category => @category,
+ :name => @model_name,
+ :full_name => @full_name,
+ :state => 0
+ )
end
- def _find_or_create_manufacturer(manufacturer_name, opts={})
- type = "manufacturer"
- type = "chipset #{type}" if opts[:chipset]
- if manufacturer = Manufacturer.where(:name => manufacturer_name).first
- logger.info "(Reading #{type} '#{manufacturer_name}' from cache)"
- else
- logger.info "Creating #{type} '#{manufacturer_name}'"
- manufacturer = Manufacturer.create!(:name => manufacturer_name)
- end
- manufacturer
+ def _find_or_create_manufacturer(manufacturer_name)
+ Manufacturer.with_or_create(:name => manufacturer_name)
end
def _scrape_chipset_manufacturer_name
@@ -114,24 +98,17 @@ def _scrape_chipset_model_name
end
def _find_or_create_chipset_product(chipset_model_name)
- chipset_full_name = "#{@chipset_manufacturer.name} #{chipset_model_name}"
- if chipset = @chipset_manufacturer.products.where(:name => chipset_model_name).first
- logger.info "(Found chipset product '#{chipset_full_name}')"
- else
- logger.info "Creating chipset product '#{chipset_full_name}'"
- chipset = Chipset.create!(
- :manufacturer => @chipset_manufacturer,
- :category => @category,
- :name => chipset_model_name,
- :state => 0
- )
+ Chipset.with_or_create({:full_name => chipset_full_name},
+ :manufacturer => @chipset_manufacturer,
+ :category => @category,
+ :name => chipset_model_name,
+ :state => 0
# Eventually we will want to copy some of the attributes from this
# implementation product...
- end
+ )
end
def _scrape_images
- images = []
thumb_links = @doc.xpath('.//ul[contains(@class, "navThumbs")]//a')
for thumb_link in thumb_links
# this will give me back xml - i can read the fset element and get dx and dy to get the image dimensions
@@ -145,38 +122,35 @@ def _scrape_images
# This image won't have an onmouseover, so we will want to ignore it.
if thumb_link["onmouseover"]
thumb_url = thumb_link["onmouseover"].
+ # TODO: Does this really have two d's?
sub(/^Biz\.Product\.DetailPage\.swapProductImageWithLoadding\('/, "").
sub(/',this\.href,''\);$/, "")
caption = thumb_link["title"]
# We have the url of the thumbnail but we need a url of the entire image
url = thumb_url.sub(/\?.+$/, "") + "?scl=2.4"
- unless @product.images.where(:url => url).exists?
- images << Image.create!(
- :product => @product,
- :url => url,
- :caption => caption
- )
- end
+ image = Image.with_or_create({:url => url},
+ :reviewable => @product,
+ :reviewable_url => @product_url,
+ :caption => caption
+ )
+ @product.images.add(image)
end
- images
end
end
def _scrape_prices
- prices = []
- # Are you serious
+ # Are you serious...
sku = @doc.at_xpath('.//div[@id="bcaBreadcrumbTop"]//dd[last()]').
text.sub(/^Item[ ]*#:[ ]*/, "").to_ascii.strip
javascript = fetch("http://content.newegg.com/LandingPage/ItemInfo4ProductDetail.aspx?Item=#{sku}")
json = javascript.sub(/^\s*var Product={};\s*var rawItemInfo=/m, "").sub(/;\s*Product=rawItemInfo;\s*$/m, "")
hash = JSON.parse(json)
- amount = hash['finalPrice']
- prices << Price.create!(
+ amount = (hash['finalPrice'].to_f * 100).to_i
+ price = Price.with_or_create({:reviewable_url => @product_url},
:product => @product,
- :url => product_url,
:amount => amount
)
- prices
+ @product.prices.add(price)
end
def _scrape_ratings
@@ -185,15 +159,14 @@ def _scrape_ratings
rating_node = @doc.at_xpath('.//div[contains(@class, "grpRating")]//a[contains(@class, "itmRating")]/span')
# Some products will naturally not have any reviews yet, so there is no rating.
if rating_node && rating_raw_value = rating_node.text.presence
- num_reviews = rating_node.next.text.scan(/\d+/).first
- ratings << Rating.create!(
+ num_reviews = rating_node.next.text.scan(/\d+/).first.to_i
+ rating = Rating.with_or_create({:reviewable_url => product_url},
:product => @product,
- :url => product_url,
:raw_value => rating_raw_value,
:num_reviews => num_reviews
)
+ @product.ratings.add(rating)
end
- ratings
end
end
View
2  lib/tasks/scrape.rake
@@ -3,6 +3,7 @@ namespace :scrape do
task :init => :environment do
$stdout.sync = true # disable buffering
+=begin
if Rails.env.production?
# Load all the eager load paths since Rails does not do this when running
# Rake tasks, even in production mode
@@ -14,6 +15,7 @@ namespace :scrape do
# "LoadError: Expected app/models/product.rb to define Product"
Dir[ Rails.root.join("app/models/**/*.rb") ].each {|fn| require_dependency fn }
end
+=end
end
def clear_all_the_things
Please sign in to comment.
Something went wrong with that request. Please try again.