Skip to content

Commit

Permalink
Messing with threading
Browse files Browse the repository at this point in the history
  • Loading branch information
nmichalov committed Mar 9, 2012
1 parent df4550a commit 991212a
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
22 changes: 15 additions & 7 deletions Components/Crawl/CrawlDirector.py
Expand Up @@ -4,6 +4,7 @@
import Pyro4
import os
import cPickle
import threading

class Director:

Expand Down Expand Up @@ -31,21 +32,28 @@ def update_record(self):

def _split_batches(urls, count):
    """Split *urls* into *count* contiguous, non-overlapping batches.

    Every batch except the last holds ``len(urls) // count`` items; the last
    batch additionally takes whatever remainder is left over.
    """
    size = len(urls) // count
    batches = [urls[i * size:(i + 1) * size] for i in range(count - 1)]
    batches.append(urls[(count - 1) * size:])
    return batches


def _run_crawler(crawler_uri, batch):
    """Thread body: open a proxy for *crawler_uri* and hand it *batch*."""
    # The proxy is created inside the worker so that each thread uses its
    # own connection to its crawler.
    crawler = Pyro4.Proxy(crawler_uri)
    crawler.start_crawl(batch)


def main():
    """Distribute the URLs listed in ``URLlist`` across remote crawlers.

    Prompts for the name server address and the number of crawler
    instances, loads the URL file into a ``Director``, splits the
    director's new URLs into one batch per crawler, and starts one thread
    per crawler that calls its remote ``start_crawl`` with that batch.
    Each crawler is looked up on the name server as ``'Crawler' + ident``,
    where ``ident`` is typed in by the operator.  Once every thread has
    finished, the director's record is updated.

    Raises:
        ValueError: if the requested crawler count is not a positive
            integer.
    """
    ns_host = raw_input('enter nameserver ip: ')
    crawler_count = int(raw_input('enter the number of crawler instances: '))
    if crawler_count < 1:
        # Guards the batch-size division below against zero/negative counts.
        raise ValueError('the number of crawler instances must be at least 1')
    director = Director()
    ns = Pyro4.naming.locateNS(ns_host)
    with open('URLlist', 'r') as url_file:
        for line in url_file:
            director.add_new(line.strip())
    # NOTE(review): assumes new_urls() returns a sliceable sequence -- confirm.
    target_urls = director.new_urls()
    batches = _split_batches(target_urls, crawler_count)
    workers = []
    for i, batch in enumerate(batches):
        print('Enter identifier for crawler %s' % (str(i)))
        crawler_ident = raw_input(': ')
        crawler_uri = ns.lookup('Crawler' + crawler_ident)
        worker = threading.Thread(target=_run_crawler,
                                  args=(crawler_uri, batch))
        worker.start()
        workers.append(worker)
    # Wait for every crawler before persisting, so the record is written
    # only after the crawls have completed.
    for worker in workers:
        worker.join()
    director.update_record()

if __name__ == "__main__":
Expand Down
9 changes: 8 additions & 1 deletion Components/Crawl/DistCrawler.py
Expand Up @@ -20,6 +20,12 @@ def __init__(self):
self.br = mechanize.Browser()
self.br.addheaders = [('user-agent', 'https://github.com/nmichalov')]

def start_crawl(self, target_list):
    """Load a batch of URLs and begin crawling from the last one.

    Stores a copy of *target_list* on ``self.internal_urls``, prints it,
    then pops the final entry and passes it to ``self.crawl``.  An empty
    batch is a no-op apart from the print.

    NOTE(review): the remaining entries are presumably consumed by
    ``crawl`` itself -- confirm against its full body.
    """
    # Copy so the caller's list is not mutated by the pop below.
    self.internal_urls = list(target_list)
    print(self.internal_urls)
    if not self.internal_urls:
        # Nothing to crawl; pop() on an empty list would raise IndexError.
        return
    first_target = self.internal_urls.pop()
    self.crawl(first_target)

def crawl(self, target):
self.visited.append(target)
current_url_parts = urlparse.urlparse(target)
Expand Down Expand Up @@ -54,10 +60,11 @@ def crawl(self, target):

if __name__ == '__main__':
    # Serve one Crawler instance through Pyro4, registered on the name
    # server under 'Crawler<ident>' so that several crawler processes can
    # run side by side under distinct names.
    hostname = raw_input('enter host ip: ')
    ident = raw_input('enter crawler identifier: ')
    crawler = Crawler()
    Pyro4.Daemon.serveSimple(
        {
            # A single entry per object: a second entry for the same key
            # would be overridden anyway, so only the suffixed name is kept.
            crawler: 'Crawler%s' % (ident),
        },
        host=hostname,
        ns=True, verbose=True)

0 comments on commit 991212a

Please sign in to comment.