-
Notifications
You must be signed in to change notification settings - Fork 2
/
productpage.py
91 lines (79 loc) · 2.62 KB
/
productpage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import re, helpers, traceback
#Regex Compile
regex_dollarsign = re.compile(r'\$')
regex_by = re.compile(r'by')
regex_comma = re.compile(r',')
def process_product_page(product_url):
"""Scrapes and returns all the data in the given product url"""
if helpers.get_debug_config():
print "[DEBUG] Extracting manufacturer, price, and sold by from \
product page: " + product_url
product_page_lxml = helpers.get_page(product_url)
if product_page_lxml == False:
print "Scrape of product page failed!!! URL: " + product_url
helpers.log("Scrape of product page failed!!! URL: " + product_url)
return 'Error', 'Error', 'Error'
try:
manufacturer = extract_manufacturer(product_page_lxml)
price = extract_price(product_page_lxml)
sold_by = extract_sold_by(product_page_lxml)
except Exception, e:
print e
print traceback.print_exc()
helpers.log('Error with url: ' + product_url)
raise
return manufacturer, price, sold_by
def extract_sold_by(product_page_lxml):
"""Extracts who the product is sold by given a product page html in
unicode"""
sold_by = u'Not Sold by Amazon'
try:
results = product_page_lxml.cssselect('div.buying')
for search_result in results:
search_result = search_result.text_content()
if 'Fulfilled by Amazon' in search_result:
sold_by = u'Fulfilled by Amazon'
break
elif 'Ships from and sold by Amazon.com' in search_result:
sold_by = u'Sold by Amazon'
break
elif 'Ships from and sold by Amazon Digital Services' in \
search_result:
sold_by = u'Sold by Amazon'
break
else:
continue
except:
helpers.log('Sold by error on page')
sold_by = 'Error'
return sold_by
def extract_price(product_page_lxml):
"""Extracts the price of object given a product page html in unicode"""
try:
price = product_page_lxml.cssselect('span#actualPriceValue')
if price != [] and "$" in price[0].text_content():
price = re.sub(regex_dollarsign, '', price[0].text_content())
if ',' in price:
price = re.sub(regex_comma, '', price)
price = float(price)
else:
price = 0.0
except:
helpers.log('Price error on page')
price = 'Error'
return price
def extract_manufacturer(product_page_lxml):
"""Extracts the manufacturer given a product page html in unicode"""
manufacturer = 'Error'
try:
results = product_page_lxml.cssselect('div.buying')
for result in results:
test = result.cssselect('h1.parseasinTitle')
if test != []:
manufacturer = re.sub(regex_by, '', \
result.cssselect('span')[1].text_content()).strip()
break
except:
helpers.log('Manufacturer error on page')
manufacturer = 'Error'
return helpers.remove_unsafe_chars(manufacturer)