In [1]:
import queue

# NOTE(review): this module-level queue is not referenced by any other visible cell;
# Crawler.crawl creates its own queue (self.q) each time it is called.
q = queue.Queue()

In [2]:
import time
import datetime
import pymongo
from urllib.request import Request, urlopen
import Parser

class Crawler:
    """Breadth-first crawler for 1001tracklists.com.

    Pages are taken from a FIFO queue. A page is read from the local MongoDB
    collection ``url_html_map`` when it is flagged in ``page_hash``; otherwise
    it is fetched over HTTP (rate limited). Tracklist pages fetched over HTTP
    are handed to the parser. Every page is then scanned for further links.

    Attributes:
        url_html_map: MongoDB collection queried by ``url`` for stored HTML.
        Parser: ``Parser.Parser`` instance; its ``tracklist_num`` attribute is
            compared with ``batch_limit`` to decide when to stop.
        page_hash: dict ``url -> 1`` for pages treated as available in the db.
        urls_visited: dict ``href -> 1`` for every link already enqueued
            (keyed by the href exactly as found in the page).
        stop_search: set to True once the batch limit has been reached.
        batch_limit: parser tracklist count at which crawling stops.
        request_delay: minimum number of seconds between HTTP requests.
        last_request: ``time.time()`` of the most recent HTTP attempt.
    """

    BASE_URL = 'https://www.1001tracklists.com'
    LINK_MARKER = 'a href="'

    def __init__(self, batch_limit=5000, request_delay=5):
        """Connect to MongoDB, build the parser and preload known urls.

        Args:
            batch_limit: stop once ``Parser.tracklist_num`` reaches this value.
            request_delay: minimum gap in seconds between HTTP requests.
        """
        # Connect to mongo instance
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        db = myclient['1001']
        self.url_html_map = db['url_html_map']

        # Instantiate parser
        self.Parser = Parser.Parser()

        # Flag every url already stored in the db. Only the 'url' field is
        # projected so the stored html bodies are not transferred just to
        # build this lookup table.
        self.page_hash = {}
        for page in self.url_html_map.find({}, {'url': 1, '_id': 0}):
            self.page_hash[page['url']] = 1

        # Stopping indicator
        self.stop_search = False
        self.batch_limit = batch_limit

        # Hrefs already enqueued - prevents crawl loops
        self.urls_visited = {}

        self.request_delay = request_delay
        self.last_request = time.time()

    def find_str(self, s, char, start_index=0):
        """Return the index of the first occurrence of ``char`` in ``s``
        located strictly AFTER ``start_index``, or -1 if there is none.

        Note the exclusive start: with the default ``start_index=0`` a match
        at position 0 is not reported. ``crawl`` relies on this to advance
        past the previous match.

        Args:
            s: string to search in.
            char: substring to look for (an empty substring yields -1).
            start_index: position after which the search begins.

        Returns:
            Absolute index into ``s`` of the match, or -1.
        """
        if not char:
            return -1
        # str.find avoids copying the tail of the (potentially large) page.
        return s.find(char, max(start_index + 1, 0))

    def request(self, url):
        """Fetch ``url`` over HTTP and return the body as a string.

        The body is converted with ``str()`` on the raw bytes, so the result
        is the ``b'...'`` repr of the payload, not decoded text.
        NOTE(review): kept as-is because stored pages and the parser
        presumably expect this format - confirm before switching to decode().

        Raises:
            Whatever ``urllib.request.urlopen`` raises (URLError, HTTPError, ...).
        """
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Context manager guarantees the connection is closed.
        with urlopen(req) as response:
            return str(response.read())

    def request_db(self, url):
        """Return the stored html for ``url``, or False if no document exists.

        If several documents share the url, the first one returned by the
        database is used.
        """
        page = self.url_html_map.find_one({'url': url})
        if page is None:
            return False
        return page['html']

    def parse(self, url, html):
        """Hand a fetched page to the parser."""
        self.Parser.parse(url, html)

    def crawl(self, url):
        """Crawl breadth-first starting from ``url``.

        Stops when the queue is empty or when the parser's tracklist count
        has reached ``batch_limit`` (``stop_search`` is then set and stays set).

        Args:
            url: absolute url of the page to start from.
        """
        # Insert start url into queue
        self.q = queue.Queue()
        self.q.put(url)
        # Mark the seed so an absolute link back to it is not enqueued again.
        self.urls_visited[url] = 1

        # Iterate until queue is empty
        while not self.q.empty():

            # Pop from queue (fifo)
            url = self.q.get()

            # Check for stopping conditions (>= so an overshoot still stops)
            if self.Parser.tracklist_num >= self.batch_limit:
                print('STOPPING SEARCH')
                self.stop_search = True
            print(url, self.page_hash.get(url, 0))
            if self.stop_search:
                return

            html = self._fetch_html(url)
            if html is None:
                continue

            print('finding links')
            for url_chunk in self._iter_links(html):
                new_page = self._resolve_link(url_chunk)
                if new_page is not None:
                    # Only the href as found in the page is stored
                    self.urls_visited[url_chunk] = 1
                    self.q.put(new_page)

    def _fetch_html(self, url):
        """Return the html for ``url`` from the db or over HTTP; None to skip."""
        # If flagged as stored then pull html from the db
        if self.page_hash.get(url, 0):
            print('requesting db')
            html = self.request_db(url)
            if html is False:
                print('NOT FOUND IN DB:')
                print(url)
                return None
            return html

        # Otherwise make a rate-limited http request
        print('requesting http')
        # Compute the remaining wait once so time.sleep never gets a
        # negative argument.
        wait = self.request_delay - (time.time() - self.last_request)
        if wait > 0:
            time.sleep(wait)

        try:
            html = self.request(url)
        except Exception as exc:  # keep KeyboardInterrupt/SystemExit working
            print('REQUEST FAILED', exc)
            return None
        finally:
            # Failed attempts also count towards the rate limit.
            self.last_request = time.time()

        # Tracklist pages are parsed; afterwards they are flagged as stored.
        # NOTE(review): assumes Parser.parse persists the page to
        # url_html_map - confirm.
        if '/tracklist/' in url:
            self.parse(url, html)
            self.page_hash[url] = 1
        return html

    def _iter_links(self, html):
        """Yield the value of every ``a href="..."`` found in ``html``."""
        marker = self.LINK_MARKER
        index = self.find_str(html, marker, 0)
        while index != -1:
            start = index + len(marker)
            end = html.find('"', start)
            # An unterminated attribute yields the remainder of the page.
            yield html[start:] if end == -1 else html[start:end]
            index = self.find_str(html, marker, index)

    def _resolve_link(self, url_chunk):
        """Return the absolute url to enqueue for ``url_chunk``, or None."""
        if '#tlp' in url_chunk or self.urls_visited.get(url_chunk, 0):
            return None

        # Relative tracklist / dj links are prefixed with the site root
        if 'http' not in url_chunk and \
           ('/tracklist/' in url_chunk or '/dj/' in url_chunk):
            return self.BASE_URL + url_chunk

        # Absolute links mentioning the site are enqueued unchanged.
        # NOTE(review): this is a substring test, so an external url that
        # merely contains the host name also matches.
        if 'www.1001tracklists.com' in url_chunk and '.xml' not in url_chunk:
            return url_chunk

        return None
      

In [3]:
# Build the crawler (this connects to the MongoDB instance at localhost:27017).
# batch_limit is the parser tracklist count at which the crawl stops.
crawler = Crawler(batch_limit=100000)

In [None]:
# Start the crawl from the site's home page; pages are processed in FIFO order.
crawler.crawl('https://www.1001tracklists.com/')

https://www.1001tracklists.com/ 0
requesting http
finding links
https://www.1001tracklists.com/tracklist/2tmgf49t/hardwell-hardwell-on-air-430-2019-08-16.html 0
requesting http
Len tracklist docs: 1
Len played docs: 17
Len sequential docs: 16
Len track docs: 136
Len artist docs: 136
finding links
https://www.1001tracklists.com/tracklist/12p3x9pt/gorgon-city-factory-93-la-la-land-united-states-2019-08-17.html 0
requesting http
Len tracklist docs: 2
Len played docs: 34
Len sequential docs: 36
Len track docs: 346
Len artist docs: 215
finding links
https://www.1001tracklists.com/tracklist/2hcmuhvk/james-grant-the-anjunadeep-edition-265-2019-08-15.html 0
requesting http
Len tracklist docs: 3
Len played docs: 83
Len sequential docs: 90
Len track docs: 1831
Len artist docs: 1331
finding links
https://www.1001tracklists.com/tracklist/2wdvs1z9/henry-fong-1001tracklists-exclusive-mix-2019-08-20.html 0
requesting http
Len tracklist docs: 4
Len played docs: 104
Len sequential docs: 110
Len track d

Len tracklist docs: 32
Len played docs: 688
Len sequential docs: 752
Len track docs: 16604
Len artist docs: 10850
finding links
https://www.1001tracklists.com/tracklist/2spp34wt/marc-depulse-wilde-mohre-festival-germany-2019-08-10.html 0
requesting http
Len tracklist docs: 33
Len played docs: 705
Len sequential docs: 769
Len track docs: 16757
Len artist docs: 10995
finding links
https://www.1001tracklists.com/tracklist/1ncctlfk/tom-staar-f-me-im-famous-hi-ibiza-spain-2019-08-16.html 0
requesting http
Len tracklist docs: 34
Len played docs: 721
Len sequential docs: 798
Len track docs: 17192
Len artist docs: 11158
finding links
https://www.1001tracklists.com/tracklist/luhjnjk/steve-void-strange-fruits-radio-038-2019-08-19.html 0
requesting http
Len tracklist docs: 35
Len played docs: 741
Len sequential docs: 817
Len track docs: 17382
Len artist docs: 11348
finding links
https://www.1001tracklists.com/tracklist/1t0d5b19/sustance-skankandbass-on-reprezent-022-2019-08-20.html 0
requesting h

Len tracklist docs: 51
Len played docs: 978
Len sequential docs: 1065
Len track docs: 19625
Len artist docs: 13239
finding links
https://www.1001tracklists.com/dj/zedsdead/index.html 0
requesting http
finding links
https://www.1001tracklists.com/dj/floretloret/index.html 0
requesting http
finding links
https://www.1001tracklists.com/tracklist/195r08m9/zeds-dead-blvk-jvck-deadbeats-radio-111-2019-08-13.html 0
requesting http
Len tracklist docs: 52
Len played docs: 1008
Len sequential docs: 1094
Len track docs: 20060
Len artist docs: 13674
finding links
https://www.1001tracklists.com/dj/rogershah/index.html 0
requesting http
finding links
https://www.1001tracklists.com/tracklist/2cnf38l1/roger-shah-music-for-balearic-people-2019-08-06.html 0
requesting http
Len tracklist docs: 53
Len played docs: 1028
Len sequential docs: 1113
Len track docs: 20250
Len artist docs: 13846
finding links
https://www.1001tracklists.com/dj/djsfrommars/index.html 0
requesting http
finding links
https://www.100

finding links
https://www.1001tracklists.com/tracklist/195gppc9/hardwell-hardwell-on-air-417-2019-05-17.html 1
requesting db
