sohoa.py
# -*- coding: utf-8 -*-
import json

import scrapy

from vnexpress.items import VnexpressItem
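
# For reference, a minimal sketch of what VnexpressItem (imported from
# vnexpress/items.py) is assumed to look like, declaring exactly the
# fields this spider fills:
#
#   class VnexpressItem(scrapy.Item):
#       url = scrapy.Field()
#       date = scrapy.Field()
#       intro = scrapy.Field()
#       title = scrapy.Field()
#       content = scrapy.Field()
#       tags = scrapy.Field()
#       comments = scrapy.Field()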


class SohoaSpider(scrapy.Spider):
    """Crawl articles and their reader comments from sohoa.vnexpress.net."""

    name = "sohoa"
    allowed_domains = ["sohoa.vnexpress.net",
                       "usi.saas.vnexpress.net"]
    start_urls = (
        "http://sohoa.vnexpress.net/",
    )

    def parse(self, response):
        # Follow links that look like article pages: URLs on
        # sohoa.vnexpress.net whose slug ends in a 7+ digit ID
        for article_url in response.xpath(
            '//a[contains(@href, "sohoa.vnexpress.net")]/@href'
        ).re(r'.*-\d{7,}\.html$'):
            yield scrapy.Request(article_url,
                                 callback=self.parse_contents)
        # Follow pagination links; urljoin resolves relative hrefs
        for page in response.xpath('//a[contains(@href, "page")]/@href'
                                   ).extract():
            yield scrapy.Request(response.urljoin(page), self.parse)
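
    # For illustration, a URL shape the article regex above accepts; the
    # slug here is hypothetical, only the trailing "-<7+ digits>.html"
    # part matters:
    #
    #   http://sohoa.vnexpress.net/vi-du-bai-viet-3412345.html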

    def parse_contents(self, response):
        item = VnexpressItem()
        # Fill in everything we can extract immediately from the page
        post_date = response.css('div.block_timer::text').extract()
        item['url'] = response.url
        item['date'] = [p.strip() for p in post_date[:2]]
        item['intro'] = response.css('div.short_intro::text'
                                     ).extract_first(default='').strip()
        item['title'] = response.xpath('//div[@class="title_news"]/h1/text()'
                                       ).extract_first(default='').strip()
        item['content'] = response.xpath(
            '//div[contains(@class, "fck_detail")]//p//text()'
        ).extract()
        item['tags'] = response.xpath('//a[@class="tag_item"]//text()'
                                      ).extract()
        # Site and article IDs that the comment service needs
        site_id = response.xpath('//meta[@name="tt_site_id"]/@content'
                                 ).extract_first()
        article_id = response.xpath('//meta[@name="tt_article_id"]/@content'
                                    ).extract_first()
        # Maximum number of comments to fetch in one call
        limit = 24
        # AJAX endpoint that serves the article's comments as JSON
        URL = ('http://usi.saas.vnexpress.net/index/get?offset=0&limit={limit}'
               '&sort=like&objectid={article_id}&objecttype=1&siteid={site_id}'
               ).format(limit=limit, article_id=article_id, site_id=site_id)
        # Queue the comment request, passing the partially filled item
        # along in the request meta so parse_comment can complete it
        request_comment = scrapy.Request(URL,
                                         callback=self.parse_comment)
        request_comment.meta['item'] = item
        yield request_comment

    def parse_comment(self, response):
        # Complete the item from parse_contents with the fetched comments
        item = response.meta['item']
        item['comments'] = self._extract_comment(response.body)
        yield item
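
    # _extract_comment below assumes the comment endpoint responds with
    # JSON shaped roughly like this (inferred from the parsing logic;
    # the service may return additional fields that are ignored):
    #
    #   {"error": 0,
    #    "data": {"items": [
    #        {"content": "a top-level comment",
    #         "replys": {"items": [{"content": "a reply"}]}}]}}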

    @staticmethod
    def _extract_comment(cont):
        cont = json.loads(cont)
        # The endpoint signals success with error == 0; on any other
        # value, return no comments
        if cont.get('error') != 0:
            return []
        # Collect top-level comments and, where present, their replies
        lis = []
        for i in cont['data']['items']:
            lis.append(i['content'])
            if i.get('replys'):
                lis.extend(r['content'] for r in i['replys']['items'])
        return lis
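

# A minimal usage sketch, assuming this file lives in a standard Scrapy
# project named "vnexpress": run the spider from the project root and
# export the scraped items as JSON lines:
#
#   scrapy crawl sohoa -o articles.jl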