
Commit 996265a
reconstructed project directories
Mingshen Sun committed Mar 11, 2014
1 parent eabd519 commit 996265a
Showing 11 changed files with 26 additions and 20 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -19,6 +19,8 @@ def __init__(self):
         for d in settings.ALLOWED_DOMAINS:
             self.filename += d
         self.filename += ".db"
+        self.filename = path.join(settings.DATABASE_DIR, self.filename)
+        print self.filename
         self.conn = None
         dispatcher.connect(self.initialize, signals.engine_started)
         dispatcher.connect(self.initialize, signals.engine_stopped)
@@ -39,15 +41,15 @@ def initialize(self):
             self.conn = sqlite3.connect(self.filename)
         else:
             self.create_table()
-        self.conn.execute("PRAGMA journal_mode=WAL;")
+        # self.conn.execute("PRAGMA journal_mode=WAL;")
         self.conn.commit()

     def finalize(self):
         if self.conn is not None:
             self.conn.commit()
             self.conn.close()
             self.conn = None

     def create_table(self):
         self.conn = sqlite3.connect(self.filename)
         self.conn.execute("create table apps( \
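The two added lines place the SQLite database file under a configurable DATABASE_DIR (defined at the end of the settings diff below) instead of the working directory. A minimal, hypothetical sketch of how the touched pieces of this pipeline plausibly fit together, assuming "path" is os.path and that settings exposes ALLOWED_DOMAINS and DATABASE_DIR as the hunks suggest:

    # Hypothetical sketch, not the repository's exact file.
    import sqlite3
    from os import path

    from android_apps_crawler import settings

    class SQLitePipeline(object):
        def __init__(self):
            # One database file named after the crawled domains,
            # stored under the configurable database directory.
            filename = "".join(settings.ALLOWED_DOMAINS) + ".db"
            self.filename = path.join(settings.DATABASE_DIR, filename)
            self.conn = None

        def initialize(self):
            self.conn = sqlite3.connect(self.filename)
            # WAL journaling lets readers coexist with a single writer;
            # with the pragma commented out, SQLite falls back to its
            # default rollback journal.
            # self.conn.execute("PRAGMA journal_mode=WAL;")
            self.conn.commit()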
@@ -11,10 +11,10 @@
 SPIDER_MODULES = ['android_apps_crawler.spiders']
 NEWSPIDER_MODULE = 'android_apps_crawler.spiders'
 USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11(KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"
-ITEM_PIPELINES = [
-    'android_apps_crawler.pipelines.AppPipeline',
-    'android_apps_crawler.pipelines.SQLitePipeline'
-]
+ITEM_PIPELINES = {
+    'android_apps_crawler.pipelines.AppPipeline': 1,
+    'android_apps_crawler.pipelines.SQLitePipeline': 2,
+}
 LOG_LEVEL = 'INFO'
 DOWNLOADER_MIDDLEWARES = {
     'android_apps_crawler.middlewares.DownloaderMiddleware': 1,
@@ -27,23 +27,25 @@
     #"appchina.com",
     #"hiapk.com",
     #"anzhi.com",
-    "android.d.cn",
+    #"android.d.cn",
     #"mumayi.com",
     #"gfan.com",
     #"nduoa.com",
     #"3gyu.com",
     #"angeeks.com",
+    "appfun.cn",
 ]
 START_URLS = [
     #"http://www.appchina.com",
     #"http://apk.hiapk.com",
     #"http://www.anzhi.com",
-    "http://android.d.cn",
+    #"http://android.d.cn",
     #"http://www.mumayi.com",
     #"http://apk.gfan.com",
     #"http://www.nduoa.com",
     #"http://www.3gyu.com",
     #"http://www.angeeks.com",
+    "http://www.appfun.cn",
 ]
 SCRAPE_RULES = {
     "xpath" : {
@@ -55,8 +57,10 @@
         "nduoa" : "//a[@class='d_pc_normal']/@href",
         "3gyu" : "//a[@class='ldownload']/@href",
         "angeeks" : "//div[@class='rgmainsrimg'][1]/a/@href",
+        "appfun" : "//a[@class='downcp']/@href",
     },
     "custom_parser" : {
         "anzhi" : "parse_anzhi",
     },
 }
+DATABASE_DIR = "../repo/databases/"
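The ITEM_PIPELINES edit tracks Scrapy's switch from a plain list to a dict: the integer values order the pipelines (lower runs first; Scrapy documents the 0-1000 range), so AppPipeline sees each item before SQLitePipeline. A sketch of that two-stage flow, with hypothetical pipeline bodies:

    # Hypothetical bodies illustrating the ordering only.
    class AppPipeline(object):
        def process_item(self, item, spider):
            # Runs first (order value 1): normalize the scraped item.
            item['url'] = item['url'].strip()
            return item  # returning the item hands it to the next pipeline

    class SQLitePipeline(object):
        def process_item(self, item, spider):
            # Runs second (order value 2): persist the normalized item.
            spider.log("storing %s" % item['url'])
            return item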
File renamed without changes.
@@ -1,7 +1,7 @@
 import re

-from scrapy.spider import BaseSpider
-from scrapy.selector import HtmlXPathSelector
+from scrapy.spider import Spider
+from scrapy.selector import Selector
 from scrapy.http import Request
 from scrapy.http import HtmlResponse
 from scrapy import log
@@ -14,7 +14,7 @@
 from android_apps_crawler import custom_parser


-class AndroidAppsSpider(BaseSpider):
+class AndroidAppsSpider(Spider):
     name = "android_apps_spider"
     allowed_domains = settings.ALLOWED_DOMAINS
     start_urls = settings.START_URLS
@@ -34,7 +34,7 @@ def parse(self, response):
         for key in custom_parser_rule.keys():
             if key in response_domain:
                 appItemList.extend(
-                        getattr(custom_parser, custom_parser_rule[key])(response))
+                    getattr(custom_parser, custom_parser_rule[key])(response))
                 break
         #if "appchina" in response_domain:
         #    xpath = "//a[@id='pc-download' and @class='free']/@href"
@@ -50,8 +50,8 @@ def parse(self, response):
         #    appItemList.extend(self.parse_anzhi(response, xpath))
         #else:
         #    pass
-        hxs = HtmlXPathSelector(response)
-        for url in hxs.select('//a/@href').extract():
+        sel = Selector(response)
+        for url in sel.xpath('//a/@href').extract():
             url = urljoin(response.url, url)
             yield Request(url, meta=cookie, callback=self.parse)

@@ -71,18 +71,18 @@ def parse(self, response):
     #        appItem['url'] = url
     #        appItemList.append(appItem)
     #    return appItemList

     def parse_xpath(self, response, xpath):
         appItemList = []
-        hxs = HtmlXPathSelector(response)
-        for url in hxs.select(xpath).extract():
+        sel = Selector(response)
+        for url in sel.xpath(xpath).extract():
             url = urljoin(response.url, url)
             log.msg("Catch an application: %s" % url, level=log.INFO)
             appItem = AppItem()
             appItem['url'] = url
             appItemList.append(appItem)
         return appItemList

     #def parse_anzhi(self, response, xpath):
     #    appItemList = []
     #    hxs = HtmlXPathSelector(response)
@@ -94,4 +94,4 @@ def parse_xpath(self, response, xpath):
     #        appItemList.append(appItem)
     #    return appItemList

-
+
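Taken together, this file's hunks apply one API migration uniformly: BaseSpider becomes Spider, and HtmlXPathSelector(response).select(...) becomes Selector(response).xpath(...), matching the renames Scrapy introduced around its 0.20-0.24 releases. A minimal, hypothetical spider showing the "after" side of that pattern:

    # Hypothetical minimal spider using the new-style API from this diff.
    from scrapy.spider import Spider            # was: BaseSpider
    from scrapy.selector import Selector        # was: HtmlXPathSelector

    class ExampleSpider(Spider):
        name = "example"
        start_urls = ["http://www.appfun.cn"]   # domain taken from the settings diff

        def parse(self, response):
            sel = Selector(response)            # was: hxs = HtmlXPathSelector(response)
            for url in sel.xpath('//a/@href').extract():   # was: hxs.select(...)
                self.log("found link: %s" % url)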
File renamed without changes.
File renamed without changes.
Empty file added repo/apps/README.md