-
Notifications
You must be signed in to change notification settings - Fork 0
/
hannah_test.py
73 lines (67 loc) · 2.93 KB
/
hannah_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
import scrapy
import locale
import time
from datetime import datetime
#import datetime
locale.setlocale(locale.LC_ALL,("de_DE.utf8"))
# Helper functions to parse the given date format (German month names, no explicit year given)
from dateutil import parser
from dateutil.relativedelta import relativedelta
def parse_future(timestr, **parse_kwargs):
    """Parse *timestr* like ``dateutil.parser.parse()`` but prefer future dates.

    The string is parsed with ``GermanParserInfo``. If the parsed date lies
    before today, it is moved one year ahead.

    Args:
        timestr: Date string to parse.
        **parse_kwargs: Extra keyword arguments forwarded unchanged to
            ``dateutil.parser.parse()``.

    Returns:
        A ``datetime.date``. If the date is still in the past after being
        shifted by one year, "No Future date found" is printed and the
        shifted date is returned anyway.

    Raises:
        ValueError: If *timestr* cannot be parsed.
    """
    today = datetime.now().date()
    # Deliberately no try/except here: swallowing the ValueError used to leave
    # ``dt`` unbound, so callers got an UnboundLocalError that hid the real
    # cause of the failure.
    dt = parser.parse(timestr, parserinfo=GermanParserInfo(), **parse_kwargs).date()
    if dt < today:
        # The parsed date is already over; try the same day one year later.
        dt += relativedelta(years=+1)
        if dt < today:
            print("No Future date found")
    return dt
class GermanParserInfo(parser.parserinfo):
    """``dateutil`` parser info that recognises German month names."""

    # One tuple per month, January first. Every spelling inside a tuple is an
    # alias for that month. The first entry of each tuple is the abbreviation
    # this class has always accepted; the full names are additional aliases.
    # NOTE(review): spellings containing an umlaut ('Mär', 'März') are not
    # listed to keep this file ASCII-only; add them if the input uses them.
    MONTHS = [
        ('Jan', 'Januar'),
        ('Feb', 'Februar'),
        ('Mrz',),
        ('Apr', 'April'),
        ('Mai',),
        ('Jun', 'Juni'),
        ('Jul', 'Juli'),
        ('Aug', 'August'),
        ('Sep', 'September'),
        ('Okt', 'Oktober'),
        ('Nov', 'November'),
        ('Dez', 'Dezember'),
    ]
#testdate = datetime.date(2013,3,3)
#print(testdate.strftime("%d.%b %Y"))
class HannahSpider(scrapy.Spider):
    """Spider for the item pages on hannah-lastenrad.de.

    One start URL is generated per entry in ``hannah_ids``; ``parse`` yields
    one dict per downloaded page.
    """

    name = 'hannah_v1'
    allowed_domains = ['hannah-lastenrad.de']
    start_urls_prefix = "https://www.hannah-lastenrad.de/cb-items/hannah-"
    # list(...) keeps the concatenation valid on Python 3, where range() no
    # longer returns a list.
    hannah_ids = list(range(1, 19)) + [21]
    start_urls = []
    # Iterate hannah_ids (not a second hard-coded range) so that id 21 is
    # crawled too. A plain loop is used because a comprehension in a class body
    # cannot see ``start_urls_prefix`` on Python 3.
    for i in hannah_ids:
        start_url = ''.join([start_urls_prefix, str(i), '//'])
        start_urls.append(start_url)

    def parse(self, response):
        """Extract name, locations and dates from one item page.

        Args:
            response: Scrapy response for one of ``start_urls``.

        Yields:
            A dict with the keys ``hannah_name``, ``location_names``,
            ``location_gmaps_links``, ``location_timeframe`` and
            ``bookable_dates``. The last two hold the dates returned by
            ``parse_future``.
        """
        self.logger.debug("URL: %s", response.url)

        # The span texts are consumed two at a time: the first is zero-padded
        # to three characters and joined to the second with a space
        # (presumably day such as "1." and month such as "Jan" -- verify
        # against the page markup). A trailing unpaired text is joined with ''.
        span_texts = iter(
            response.xpath('//li[contains(@class, "bookable")]//span/text()').extract()
        )
        # The page gives no year. Append the current one (instead of a fixed
        # '2018') and let parse_future roll dates that have passed into the
        # next year.
        current_year = str(datetime.now().year)
        bookable_dates = [
            first.zfill(3) + ' ' + next(span_texts, '') + current_year
            for first in span_texts
        ]
        self.logger.debug("bookable date strings: %s", bookable_dates)

        # Parse once and reuse the result. A parse failure propagates to
        # Scrapy, which logs it for this response; no time.sleep() here, as it
        # would block the whole crawler.
        bookable_dates_parsed = [parse_future(text) for text in bookable_dates]
        self.logger.debug("bookable dates parsed: %s", bookable_dates_parsed)

        # Only the last 8 characters of each cb-date text are parsed.
        # NOTE(review): parse_future is called without dayfirst=True; confirm
        # that is right for the format of these strings.
        timeframe_ends = [
            text[-8:].strip()
            for text in response.xpath("//span[@class='cb-date']/text()").extract()
        ]

        yield {
            'hannah_name': response.xpath('//h1/text()').extract_first(),
            'location_names': [
                text.strip()
                for text in response.xpath(
                    "//div[@class='cb-location-name cb-big']/text()"
                ).extract()
            ],
            'location_gmaps_links': [
                href.strip()
                for href in response.xpath(
                    "//div[@class='cb-address cb-row']/a/@href"
                ).extract()
            ],
            'location_timeframe': [parse_future(text) for text in timeframe_ends],
            'bookable_dates': bookable_dates_parsed,
        }
#[datetime.strptime(t, '%d.%m.%Y') for t in