
Commit 996265a
reconstructed project directories
Mingshen Sun committed Mar 11, 2014
1 parent eabd519 commit 996265a
Showing 11 changed files with 26 additions and 20 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -19,6 +19,8 @@ def __init__(self):
         for d in settings.ALLOWED_DOMAINS:
             self.filename += d
         self.filename += ".db"
+        self.filename = path.join(settings.DATABASE_DIR, self.filename)
+        print self.filename
         self.conn = None
         dispatcher.connect(self.initialize, signals.engine_started)
         dispatcher.connect(self.initialize, signals.engine_stopped)
@@ -39,15 +41,15 @@ def initialize(self):
             self.conn = sqlite3.connect(self.filename)
         else:
             self.create_table()
-        self.conn.execute("PRAGMA journal_mode=WAL;")
+        # self.conn.execute("PRAGMA journal_mode=WAL;")
         self.conn.commit()

     def finalize(self):
         if self.conn is not None:
             self.conn.commit()
             self.conn.close()
             self.conn = None

     def create_table(self):
         self.conn = sqlite3.connect(self.filename)
         self.conn.execute("create table apps( \
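The two added lines place the SQLite database file under a configurable DATABASE_DIR (defined at the end of the settings diff below) instead of the working directory. A minimal, hypothetical sketch of how the touched pieces of this pipeline plausibly fit together, assuming "path" is os.path and that settings exposes ALLOWED_DOMAINS and DATABASE_DIR as the hunks suggest:

    # Hypothetical sketch, not the repository's exact file.
    import sqlite3
    from os import path

    from android_apps_crawler import settings

    class SQLitePipeline(object):
        def __init__(self):
            # One database file named after the crawled domains,
            # stored under the configurable database directory.
            filename = "".join(settings.ALLOWED_DOMAINS) + ".db"
            self.filename = path.join(settings.DATABASE_DIR, filename)
            self.conn = None

        def initialize(self):
            self.conn = sqlite3.connect(self.filename)
            # WAL journaling lets readers coexist with a single writer;
            # with the pragma commented out, SQLite falls back to its
            # default rollback journal.
            # self.conn.execute("PRAGMA journal_mode=WAL;")
            self.conn.commit()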
@@ -11,10 +11,10 @@
 SPIDER_MODULES = ['android_apps_crawler.spiders']
 NEWSPIDER_MODULE = 'android_apps_crawler.spiders'
 USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11(KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"
-ITEM_PIPELINES = [
-    'android_apps_crawler.pipelines.AppPipeline',
-    'android_apps_crawler.pipelines.SQLitePipeline'
-]
+ITEM_PIPELINES = {
+    'android_apps_crawler.pipelines.AppPipeline': 1,
+    'android_apps_crawler.pipelines.SQLitePipeline': 2,
+}
 LOG_LEVEL = 'INFO'
 DOWNLOADER_MIDDLEWARES = {
     'android_apps_crawler.middlewares.DownloaderMiddleware': 1,
@@ -27,23 +27,25 @@
     #"appchina.com",
     #"hiapk.com",
     #"anzhi.com",
-    "android.d.cn",
+    #"android.d.cn",
     #"mumayi.com",
     #"gfan.com",
     #"nduoa.com",
     #"3gyu.com",
     #"angeeks.com",
+    "appfun.cn",
 ]
 START_URLS = [
     #"http://www.appchina.com",
     #"http://apk.hiapk.com",
     #"http://www.anzhi.com",
-    "http://android.d.cn",
+    #"http://android.d.cn",
     #"http://www.mumayi.com",
     #"http://apk.gfan.com",
     #"http://www.nduoa.com",
     #"http://www.3gyu.com",
     #"http://www.angeeks.com",
+    "http://www.appfun.cn",
 ]
 SCRAPE_RULES = {
     "xpath" : {
@@ -55,8 +57,10 @@
         "nduoa" : "//a[@class='d_pc_normal']/@href",
         "3gyu" : "//a[@class='ldownload']/@href",
         "angeeks" : "//div[@class='rgmainsrimg'][1]/a/@href",
+        "appfun" : "//a[@class='downcp']/@href",
     },
     "custom_parser" : {
         "anzhi" : "parse_anzhi",
     },
 }
+DATABASE_DIR = "../repo/databases/"
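The ITEM_PIPELINES edit tracks Scrapy's switch from a plain list to a dict: the integer values order the pipelines (lower runs first; Scrapy documents the 0-1000 range), so AppPipeline sees each item before SQLitePipeline. A sketch of that two-stage flow, with hypothetical pipeline bodies:

    # Hypothetical bodies illustrating the ordering only.
    class AppPipeline(object):
        def process_item(self, item, spider):
            # Runs first (order value 1): normalize the scraped item.
            item['url'] = item['url'].strip()
            return item  # returning the item hands it to the next pipeline

    class SQLitePipeline(object):
        def process_item(self, item, spider):
            # Runs second (order value 2): persist the normalized item.
            spider.log("storing %s" % item['url'])
            return item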
File renamed without changes.
@@ -1,7 +1,7 @@
 import re

-from scrapy.spider import BaseSpider
-from scrapy.selector import HtmlXPathSelector
+from scrapy.spider import Spider
+from scrapy.selector import Selector
 from scrapy.http import Request
 from scrapy.http import HtmlResponse
 from scrapy import log
@@ -14,7 +14,7 @@
 from android_apps_crawler import custom_parser


-class AndroidAppsSpider(BaseSpider):
+class AndroidAppsSpider(Spider):
     name = "android_apps_spider"
     allowed_domains = settings.ALLOWED_DOMAINS
     start_urls = settings.START_URLS
@@ -34,7 +34,7 @@ def parse(self, response):
         for key in custom_parser_rule.keys():
             if key in response_domain:
                 appItemList.extend(
-                        getattr(custom_parser, custom_parser_rule[key])(response))
+                    getattr(custom_parser, custom_parser_rule[key])(response))
                 break
         #if "appchina" in response_domain:
         #    xpath = "//a[@id='pc-download' and @class='free']/@href"
@@ -50,8 +50,8 @@ def parse(self, response):
         #    appItemList.extend(self.parse_anzhi(response, xpath))
         #else:
         #    pass
-        hxs = HtmlXPathSelector(response)
-        for url in hxs.select('//a/@href').extract():
+        sel = Selector(response)
+        for url in sel.xpath('//a/@href').extract():
             url = urljoin(response.url, url)
             yield Request(url, meta=cookie, callback=self.parse)

@@ -71,18 +71,18 @@ def parse(self, response):
     #        appItem['url'] = url
     #        appItemList.append(appItem)
     #    return appItemList

     def parse_xpath(self, response, xpath):
         appItemList = []
-        hxs = HtmlXPathSelector(response)
-        for url in hxs.select(xpath).extract():
+        sel = Selector(response)
+        for url in sel.xpath(xpath).extract():
             url = urljoin(response.url, url)
             log.msg("Catch an application: %s" % url, level=log.INFO)
             appItem = AppItem()
             appItem['url'] = url
             appItemList.append(appItem)
         return appItemList

     #def parse_anzhi(self, response, xpath):
     #    appItemList = []
     #    hxs = HtmlXPathSelector(response)
@@ -94,4 +94,4 @@ def parse_xpath(self, response, xpath):
     #        appItemList.append(appItem)
     #    return appItemList

-
+
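Taken together, this file's hunks apply one API migration uniformly: BaseSpider becomes Spider, and HtmlXPathSelector(response).select(...) becomes Selector(response).xpath(...), matching the renames Scrapy introduced around its 0.20-0.24 releases. A minimal, hypothetical spider showing the "after" side of that pattern:

    # Hypothetical minimal spider using the new-style API from this diff.
    from scrapy.spider import Spider            # was: BaseSpider
    from scrapy.selector import Selector        # was: HtmlXPathSelector

    class ExampleSpider(Spider):
        name = "example"
        start_urls = ["http://www.appfun.cn"]   # domain taken from the settings diff

        def parse(self, response):
            sel = Selector(response)            # was: hxs = HtmlXPathSelector(response)
            for url in sel.xpath('//a/@href').extract():   # was: hxs.select(...)
                self.log("found link: %s" % url)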
File renamed without changes.
File renamed without changes.
Empty file added repo/apps/README.md