In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import random

ua = UserAgent()  # Source of randomised User-Agent header values
proxies = []  # Filled in a later cell with {'ip': ..., 'port': ...} dicts

# Retrieve the latest proxy listing page
proxies_req = Request('https://www.sslproxies.org/')
proxies_req.add_header('User-Agent', ua.random)
# The with-block closes the HTTP response (and its socket) as soon as the
# body has been read instead of leaving it to the garbage collector.
with urlopen(proxies_req) as proxies_resp:
    proxies_doc = proxies_resp.read().decode('utf8')

In [None]:
# Parse the downloaded listing page with the stdlib-backed 'html.parser' backend.
soup = BeautifulSoup(proxies_doc, 'html.parser')
# Look the proxy table up by its element id. The code below dereferences
# proxies_table.tbody directly.
# NOTE(review): presumably this is None if the site no longer uses this id -- confirm.
proxies_table = soup.find(id='proxylisttable')

# Retrieve a random index proxy (we need the index to delete it if not working)
def random_proxy():
    '''
    Pick a random position inside the module-level proxies list.

    An index (not the proxy dict itself) is returned so that the caller can
    later delete that entry from proxies.
    Raises ValueError when proxies is empty.
    '''
    proxy_count = len(proxies)
    return random.randrange(0, proxy_count)

# Save proxies in the list. Clearing first keeps this cell idempotent:
# re-running it no longer appends a second copy of every proxy.
proxies.clear()
for row in proxies_table.tbody.find_all('tr'):
    cells = row.find_all('td')
    proxies.append({
        'ip':   cells[0].string,
        'port': cells[1].string
    })

if not proxies:
    # random_proxy() would fail with an unhelpful "empty range" message.
    raise ValueError('No proxies were found in the proxy table.')

print('starting')
proxy_index = random_proxy()
proxy = proxies[proxy_index]
for n in range(1, 100):

    print(n)

    # Every 10 requests, switch to a new proxy. This happens BEFORE the
    # request is built so that proxy/proxy_index always describe the proxy
    # the request really uses; otherwise a failure would delete a proxy that
    # was never tried.
    if n % 10 == 0:
        proxy_index = random_proxy()
        proxy = proxies[proxy_index]

    req = Request('https://www.1001tracklists.com/source/v7m7k3/the-anjunadeep-edition/index.html')
    req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')

    print('making call')
    # Make the call
    try:
        # The with-block closes the response once the body has been read.
        with urlopen(req, timeout=5) as resp:
            my_ip = resp.read().decode('utf8')
        print('#' + str(n))
    except Exception as e: # If error, delete this proxy and find another one
        print(e)
        del proxies[proxy_index]
        print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.')
        if not proxies:
            # Nothing left to rotate to; stop instead of crashing in random_proxy().
            print('No working proxies left, stopping.')
            break
        proxy_index = random_proxy()
        proxy = proxies[proxy_index]

### Crawler

In [None]:
import pymongo

# Connect to the local MongoDB instance and open the '1001' database.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

mydb = myclient['1001']
# Let the server count the documents instead of pulling every document into
# a Python list just to take its length.
print(mydb['track_docs'].count_documents({}))
print(mydb['artist_docs'].count_documents({}))
print(mydb['played_docs'].count_documents({}))

### Parser Class

In [None]:
import Parser
        
# Tracklist page used for the spot check; named once so the fetch and the
# parse are guaranteed to refer to the same url.
tracklist_url = 'https://www.1001tracklists.com/tracklist/2rrcqmpk/modd-vulcan-gas-company-austin-united-states-2019-03-09.html'

prsr = Parser.Parser()
prsr.parse(url=tracklist_url, html=prsr.request(tracklist_url))

### Crawler Class

In [1]:
import Crawler
import numpy as np

# Page the crawl starts from.
seed_url = 'https://www.1001tracklists.com/source/7x3gmv/afterlife-voyage/index.html'

# NOTE(review): max_depth / batch_limit semantics are defined in the Crawler
# module, which is not part of this notebook -- confirm there.
crawler = Crawler.Crawler(
    max_depth=20,
    batch_limit=np.inf,
)
crawler.start_crawl(seed_url)

Depth: 0
https://www.1001tracklists.com/source/7x3gmv/afterlife-voyage/index.html
Depth: 1
Depth: 1
Depth: 1
Depth: 1
Depth: 1
Depth: 1
Depth: 1
Depth: 1
Depth: 1
Depth: 1
Depth: 1
https://www.1001tracklists.com/tracklist/25152ntk/vaal-afterlife-voyage-002-2017-01-19.html
Len tracklist docs: 1
Len played docs: 14
Len sequential docs: 16
Len track docs: 136
Len artist docs: 85
Depth: 2
https://www.1001tracklists.com/dj/vaal/index.html
Depth: 3
Depth: 3
https://www.1001tracklists.com/tracklist/1s0vb4jk/vaal-eb-radio-dj-mix-2017-09-03.html
Len tracklist docs: 2
Len played docs: 22
Len sequential docs: 26
Len track docs: 191
Len artist docs: 111
Depth: 4
Depth: 4
Depth: 3
https://www.1001tracklists.com/tracklist/2gcnfsw1/vaal-ants-stage-tomorrowland-weekend-2-belgium-2017-07-30.html
Len tracklist docs: 3
Len played docs: 30
Len sequential docs: 37
Len track docs: 257
Len artist docs: 137
Depth: 4
Depth: 4
Depth: 3
Depth: 2
Depth: 1


### ParserSandbox


In [9]:
import ftfy
import urllib
from urllib.request import Request, urlopen

def find_str(s, char, start_index=0):
    '''
    Find substring char in string s, searching strictly AFTER start_index.

    The search starts at position start_index + 1, so passing the index of a
    previous match returns the next match (this is what the parsing loops rely
    on). As a consequence a match located exactly at start_index -- including
    a match at position 0 with the default start_index -- is not reported.

    Returns the absolute index of the match in s, or -1 when there is none.
    An empty char is treated as "not found" and also returns -1.
    '''
    if not char:
        return -1
    # str.find does the scan natively and returns an absolute index already.
    return s.find(char, start_index + 1)

def extract_value(html, key_value):
    '''
    Extract an attribute value from an html fragment.

    If html starts with key_value that prefix is removed; the text up to the
    first '>' is then returned with surrounding double quotes removed.

    The prefix is removed by slicing. str.strip(key_value) would treat
    key_value as a SET of characters and could also eat the first characters
    of the value itself (e.g. a lowercase artist name).
    '''
    if html.startswith(key_value):
        html = html[len(key_value):]
    content_string = html.split('>')[0].strip('"')
    return content_string

def fix_decoding_errors(string):
    '''
    Clean up html entities and escaped UTF-8 byte sequences in scraped text.

    A fixed list of substitutions is applied in order and the result is then
    passed through ftfy.fix_text().
    '''
    # (text to find, replacement) pairs -- applied top to bottom.
    # NOTE(review): '\\xc3\\x9' is shorter than the other byte patterns, so the
    # character that follows it in the input is left in place -- confirm intent.
    substitutions = (
        ('&amp;', '&'),
        ('&#39;', "'"),
        ('\\xc3\\xb6', 'o'),
        ('\\xc3\\xab', 'e'),
        ('\\xc3\\x9', 'u'),
        ('\\xc3\\xb8', 'o'),
        ("\\'", "'"),
    )

    cleaned = string
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)

    return ftfy.fix_text(cleaned)

def parse_track_and_artist(track_string):
    '''
    Extract the artist, track name, and remixer (if any) from the standard formatting used by 1001.

    Expected layouts are "Artist - Track" and "Artist - Track (Someone Remix)".

    Returns a tuple (artists, track, remixer):
        artists -- list of artist names (always a list, even for one artist);
                   'vs.' and '&' separate artists, 'feat.'/'ft.' parts are dropped
        track   -- track title
        remixer -- remixer text, or 'N/A' when the string is not a remix/bootleg/edit

    Raises ValueError when track_string does not contain exactly one ' - '
    separator, or when a remix/bootleg/edit title contains no '('.
    '''
    # Unpacking into two names enforces the single ' - ' separator.
    artist, title = [part.strip(' ') for part in track_string.split(' - ')]

    # Check if Remix/Bootleg/Edit and parse accordingly
    if any(tag in track_string for tag in ('Remix', 'Bootleg', 'Edit')):

        title_parts = [part.strip(' ') for part in title.split('(')]

        if len(title_parts) > 2:
            # Several parenthesised groups: everything after the first '(' is
            # kept verbatim as the remixer text.
            track = title_parts[0]
            remixer = '('.join(title_parts[1:])
        else:
            track, remixer = title_parts
            # Drop the closing parenthesis and a trailing "Remix" word.
            # rstrip('Remix)') must not be used for this: it strips a SET of
            # characters and also eats name endings such as "Mix" or "Plex".
            remixer = remixer.rstrip(')')
            if remixer.endswith('Remix'):
                remixer = remixer[:-len('Remix')]
            remixer = remixer.strip(' ')

    # If not remix, then should follow standard layout "Artist Name - Track Name"
    # This layout is expressed explicitly in html
    else:

        track = title
        remixer = 'N/A'

    # Check for multiple artists -- Big Room sets tend to have hella mashups
    # Sometimes there is more structured formatting to exploit i.e. (Artist1 vs. Artist2 - Track1 vs. Track2)
    # 'vs.' is treated like '&', then every artist has its feature part removed.
    # We could make features a separate field but for now just removing
    artists = [
        name.split('feat.')[0].split('ft.')[0].strip(' ')
        for name in artist.replace('vs.', '&').split('&')
    ]

    return (artists, track, remixer)


def tracklist_meta_data(html):
    '''
    Extract meta data about tracklist/set.

    Reads the content attribute of three <meta> tags and returns a dict with
    the keys 'description', 'created' (dcterms.created -- probably the point
    in time to use for building prediction data) and 'modified'
    (dcterms.modified). A tag that is not present yields 'N/A'.
    '''
    def meta_content(meta_name):
        # Text that precedes the value, up to and including the opening quote.
        start_term = 'meta name="' + meta_name + '" content="'
        term_index = html.find(start_term)
        if term_index == -1:
            return 'N/A'
        # Slice the prefix off by length. lstrip(start_term) would treat it as
        # a character set and eat the beginning of the value as well.
        value_start = term_index + len(start_term)
        tag_end = html.find('>', value_start)
        value = html[value_start:] if tag_end == -1 else html[value_start:tag_end]
        return value.rstrip('"')

    meta_data = {}
    meta_data['description'] = meta_content('description')
    meta_data['created'] = meta_content('dcterms.created')
    meta_data['modified'] = meta_content('dcterms.modified')

    return meta_data
    
def tracklist_general_information(html):
    '''
    Extract general info about tracklist/set.

    Returns a dict with:
        'styles'  -- list of genre names from the "General Information" block
                     (empty list when the block or the styles cell is missing)
        'dj_url'  -- absolute url of the first linked dj page, or 'N/A'
        'dj_name' -- link text of that dj page, or 'N/A'
    '''
    info_doc = {}

    # Genres -- can use these to build genre-specific graphs
    styles = []
    section_index = html.find('General Information')
    if section_index != -1:
        info_chunk = html[section_index:].split('Most Liked Tracklists')[0]
        style_index = info_chunk.find('Tracklist Musicstyle')
        if style_index != -1:
            style_parts = info_chunk[style_index:].split('id="tl_music_styles">')
            if len(style_parts) > 1:
                style_cell = style_parts[1].split('</td>')[0]
                styles = [style.strip(' ') for style in style_cell.split(',')]
    info_doc['styles'] = styles

    # If 1001 recognizes the dj who played the set they link their dj page
    # Its my understanding dj pages are independent of artist pages -- we'll need to map these
    dj_index = html.find('a href="/dj')
    if dj_index != -1:
        anchor = html[dj_index:]

        # The href value is the text between the first two double quotes.
        # Cutting at the word 'class' first would truncate dj urls that
        # contain "class" themselves (e.g. /dj/classixx/...).
        dj_url = anchor.split('"', 2)[1]
        info_doc['dj_url'] = 'https://www.1001tracklists.com' + dj_url

        name_parts = anchor.split('</a>', 1)[0].split('>')
        info_doc['dj_name'] = name_parts[1] if len(name_parts) > 1 else 'N/A'
    else:
        info_doc['dj_url'] = 'N/A'
        info_doc['dj_name'] = 'N/A'

    return info_doc
    
def _linked_page_url(track_chunk, marker_index):
    '''
    Build the absolute url of the link whose title marker starts at marker_index.

    Returns 'N/A' when marker_index is -1 or when the expected
    'class ... href="..."' layout is not found after the marker.
    '''
    if marker_index == -1:
        return 'N/A'
    try:
        # The href value ends at its closing double quote.
        href = track_chunk[marker_index:].split('class')[1].split('href="')[1].split('"')[0]
    except IndexError:
        return 'N/A'
    return 'https://www.1001tracklists.com' + href


def tracklist_track_data(html):
    '''
    Extract track related data from set

    The html is scanned for every 'tracknumber_value">' marker; the text from
    a marker up to the next '<br>' is treated as one track entry.

    Returns a dict keyed by track number (string). Each value is a dict with
    the keys 'track_num', 'artist', 'artist_url', 'name', 'track_url',
    'remixer' and 'remixer_url'; missing urls / remixers are 'N/A'.

    Entries are skipped when no track name is found, when
    parse_track_and_artist() fails on the name, or when the entry is an
    unidentified track ('ID').
    '''
    track_docs = {}
    number_marker = 'tracknumber_value">'
    name_marker = 'meta itemprop="name" content='

    index = 0
    while True:
        # Search strictly after the previous match (one scan per iteration).
        index = html.find(number_marker, index + 1)
        if index == -1:
            break

        chunk_end = html.find('<br>', index)
        track_chunk = html[index:] if chunk_end == -1 else html[index:chunk_end]

        # Extract track number: at most 3 characters after the marker, cut at
        # the next '<'. The marker is removed by length, not with strip().
        track_num = track_chunk[:len(number_marker) + 3].split('<')[0][len(number_marker):]

        # Extract track information. The marker is sliced off by length;
        # strip(marker) would treat it as a character set and also eat leading
        # characters of lowercase artist names.
        name_index = track_chunk.find(name_marker, 1)
        if name_index == -1:
            continue
        extracted_value = track_chunk[name_index + len(name_marker):].split('>')[0].strip('"')
        clean_string = fix_decoding_errors(extracted_value)
        if len(clean_string) <= 1:
            continue

        # If track info pull failed then skip this entry (best effort).
        try:
            artist_list, track, remixer = parse_track_and_artist(clean_string)
        except Exception:
            continue

        # Avoid ID's for now
        # NOTE(review): 'ID' in track is a substring test, so any title that
        # contains "ID" is skipped as well -- confirm this is intended.
        if ('ID' in artist_list) or ('ID' in track):
            continue

        # The page links do not depend on the artist, so they are extracted
        # once per track. The remixer and track searches start after the
        # artist marker (or from the beginning when there is none).
        artist_index = track_chunk.find('title="open artist page"', 1)
        artist_url = _linked_page_url(track_chunk, artist_index)

        # Extract remixer page (if exists)
        if remixer != 'N/A':
            remixer_index = track_chunk.find('title="open remixer artist page"', artist_index + 1)
            remixer_url = _linked_page_url(track_chunk, remixer_index)
        else:
            remixer_url = 'N/A'

        # Extract track page
        track_index = track_chunk.find('title="open track page"', artist_index + 1)
        track_url = _linked_page_url(track_chunk, track_index)

        # Tends to be multiple artists so artists parsed to list even if only one
        # NOTE(review): the result is keyed by track_num only, so with several
        # artists each one overwrites the previous and only the last artist
        # is kept -- confirm whether one doc per artist was intended.
        for artist in artist_list:
            track_docs[track_num] = {
                'track_num': track_num,
                'artist': artist.strip(' '),
                'artist_url': artist_url.strip(' '),
                'name': track.strip(' '),
                'track_url': track_url.strip(' '),
                'remixer': remixer.strip(' '),
                'remixer_url': remixer_url.strip(' ')
            }

    return track_docs

def build_artist_edges(url_doc, url):
    '''
    Create one artist-to-artist edge doc per pair of tracks in the set whose
    artists differ -- order n^2.
    Each pair is visited once (later track paired with earlier track only),
    because the edges are meant to be read as non-directional.
    '''
    tracks = list(url_doc['track_docs'].values())
    return [
        {
            'artist1': earlier['artist'],
            'artist2': later['artist'],
            'url': url
        }
        for position, earlier in enumerate(tracks)
        # The slice starts at the track itself; that self-pair is dropped by
        # the inequality test below.
        for later in tracks[position:]
        if earlier['artist'] != later['artist']
    ]

def build_track_edges(track_docs, url):
    '''
    Build track set-adjacency docs -- order n^2.
    Dont iterate over full set twice since will be considered non-directional

    track_docs maps a track-number key to a dict with at least 'name',
    'artist' and 'remixer'. Returns a dict of edge docs, one per unordered
    pair of keys, keyed by an id built from url and the two keys.
    '''
    edge_docs = {}
    keys = sorted(track_docs)
    for position, key in enumerate(keys):
        first_track = track_docs[key]
        # Only pair with keys that sort after this one: each pair once, no self-pairs.
        for other_key in keys[position + 1:]:
            second_track = track_docs[other_key]

            # NOTE(review): '_'.join(key) puts '_' between the CHARACTERS of
            # the key ('01' -> '0_1'). Kept as is so existing ids stay stable.
            _id = '_'.join([url, '_'.join(key), '_'.join(other_key)])
            edge_docs[_id] = {
                #'_id': _id,
                'track1_name': first_track['name'],
                'track1_artist': first_track['artist'],
                'track1_remixer': first_track['remixer'],
                'track2_name': second_track['name'],
                # Artist and remixer of the SECOND track (these used to be
                # copied from the first track by mistake).
                'track2_artist': second_track['artist'],
                'track2_remixer': second_track['remixer'],
                'url': url
            }
    return edge_docs
                
def build_sequential_track_edges(track_docs, url):
    '''
    Allows for later "next track lookup" functionality

    Orders the tracks by their 'track_num' and returns one doc per pair of
    consecutive tracks, keyed by an id built from url and both track numbers.

    Numeric track numbers are ordered by value, so '2' comes before '10';
    plain string ordering would put '10' first. Non-numeric track numbers
    are placed after the numeric ones in string order.
    '''
    def position_key(entry):
        position = str(entry[0]).strip()
        if position.isdecimal():
            return (0, int(position), position)
        return (1, 0, position)

    enumerated_tracks = sorted(
        ((track_doc['track_num'], track_doc) for track_doc in track_docs.values()),
        key=position_key
    )

    seq_docs = {}
    # Walk over consecutive (current, next) pairs.
    for (first_num, first_track), (second_num, second_track) in zip(enumerated_tracks, enumerated_tracks[1:]):
        # NOTE(review): '_'.join(first_num) puts '_' between the CHARACTERS of
        # the track number ('10' -> '1_0'). Kept as is so existing ids stay stable.
        _id = '_'.join([url, '_'.join(first_num), '_'.join(second_num)])
        seq_docs[_id] = {
            #'_id': _id,
            'url': url,
            'track1_name': first_track['name'],
            'track1_artist': first_track['artist'],
            'track1_remixer': first_track['remixer'],
            'track2_name': second_track['name'],
            'track2_artist': second_track['artist'],
            'track2_remixer': second_track['remixer'],
            'first_position': str(first_num),
            'second_position': str(second_num),
        }
    return seq_docs

def build_played_playedby_edge(url_doc, url):
    '''
    Build "dj played track" docs: one per track in url_doc['track_docs'],
    each carrying the dj (name + url) and the track / artist / remixer that
    was played. Useful for studying directional who-plays-whom graphs.

    Returns an empty list when the set has no known dj name or dj url.
    '''
    dj_name = url_doc['dj_name']
    dj_url = url_doc['dj_url']

    # Without an identified dj there is nobody to attribute the plays to.
    if dj_name == 'N/A' or dj_url == 'N/A':
        return []

    return [
        {
            'url': url,
            'played_by': dj_name,
            'played_by_url': dj_url,
            'played': played_track['name'],
            'played_track_url': played_track['track_url'],
            'played_artist': played_track['artist'],
            'played_artist_url': played_track['artist_url'],
            'played_remixer': played_track['remixer'],
            'played_remixer_url': played_track['remixer_url']
        }
        for played_track in url_doc['track_docs'].values()
    ]
       

### Spot check parser

In [13]:
# from urllib.request import Request, urlopen

# url = 'https://www.1001tracklists.com/tracklist/2rrcqmpk/modd-vulcan-gas-company-austin-united-states-2019-03-09.html'
# req = Request(url,\
#               headers={'User-Agent': 'Mozilla/5.0'})
# html = str(urlopen(req).read())

# url_doc = {}
# url_doc['html'] = html
# url_doc.update(tracklist_meta_data(html))
# url_doc.update(tracklist_general_information(html))
# track_docs = tracklist_track_data(html)
# url_doc['track_docs'] = track_docs

# track_edges = build_track_edges(track_docs, url)
# print('Length of track connections:', len(track_edges))
# sequential_edges = build_sequential_track_edges(track_docs, url)
# print('Length of Sequential connections:', len(sequential_edges))
# played_edges = build_played_playedby_edge(url_doc, url)
# print('Length of played connections:', len(played_edges))
# artist_edges = build_artist_edges(url_doc, url)
# print('Length of artist connections:', len(artist_edges))

Length of track connections: 91
Length of Sequential connections: 13
Length of played connections: 14
Length of artist connections: 85


### Run over these and keep a depth of like 2 for POC

### Iterate over newest setlists and grab urls

In [None]:
# import time

# def find_str(s, char, start_index=0):

#     index = 0
#     s = s[start_index+1:]
#     if char in s:
#         c = char[0]
#         for ch in s:
#             if ch == c:
#                 if s[index:index+len(char)] == char:
#                     return start_index + 1 + index
#             index += 1
#     return -1   

# def request(url):

#     user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
#     req = Request(url,\
#                   headers={'User-Agent': user_agent})
#     html = str(urlopen(req).read())
#     return html

# date_pages = ["https://www.1001tracklists.com/index%d.html?order=date" %d for d in range(100)]
# urls = []
# for url in date_pages:
    
#     # Make http request
#     html = request(url)
#     index = 0
#     # Iterate over links found in html
#     count = 0
#     while find_str(html, 'a href="', index) != -1:
        
#         # Extract url
#         index = find_str(html, 'a href="', index)
#         url_chunk = html[index:].split('"')[1]

#         # Make sure it is either a referenced tracklist or 1001 page
#         if ('/tracklist/' in url_chunk) and ('http' not in url_chunk):
#             new_page = 'https://www.1001tracklists.com' + url_chunk
#             urls.append(new_page)
#         if ('www.1001tracklists.com' in url_chunk) and ('.html' in url_chunk):
#             urls.append(url_chunk)
            
#         count += 1
            
#     print('Number of URLS:', len(urls))
#     time.sleep(10)

# import pickle
# with open('1001_urls.pkl', 'wb') as f:
#      pickle.dump(urls, f)

### Pickle them

In [11]:
# import numpy as np 
# import pickle

# with open('1001_urls.pkl', 'rb') as f:
#      urls = pickle.load(f)
        
# print(len(urls))
# unique_urls = np.unique(urls)
# print(len(unique_urls))
# unique_urls = [url for url in unique_urls if ('.html' in url) and ('#tlp' not in url)]
# print(len(unique_urls))

4534
2980
2970


### Iterate over found urls and parse

In [None]:
# import time
# import pickle
# import numpy as np

# def parse(url):
    
#     req = Request(url,\
#                   headers={'User-Agent': 'Mozilla/5.0'})
#     html = str(urlopen(req).read())   
    
#     url_doc = {}
#     url_doc['url'] = url
#     url_doc['html'] = html
#     url_doc.update(tracklist_meta_data(html))
#     url_doc.update(tracklist_general_information(html))

#     track_docs = tracklist_track_data(html)
#     url_doc['track_docs'] = track_docs

#     track_edges = build_track_edges(track_docs, url)
#     print('Length of track connections:', len(track_edges))
#     sequential_edges = build_sequential_track_edges(track_docs, url)
#     print('Length of Sequential connections:', len(sequential_edges))
#     played_edges = build_played_playedby_edge(url_doc, url)
#     print('Length of played connections:', len(played_edges))
#     artist_edges = build_artist_edges(url_doc, url)
#     print('Length of artist connections:', len(artist_edges))

#     return url_doc, track_edges, sequential_edges, played_edges, artist_edges, html

    
# played_docs = []
# sequential_docs = []
# track_docs = []
# tracklist_docs = []
# artist_docs = []
# url_html_map = {}

# with open('1001_urls.pkl', 'rb') as f:
#     urls = pickle.load(f)

# # Find unique urls
# seen_urls = []
# unique_urls = np.unique(urls)
# unique_urls = [url for url in unique_urls\
#                    if ('.html' in url) and ('#tlp' not in url) and (url not in seen_urls)]

# for url in unique_urls[:]:
    
#     if ('.html' in url) and ('#tlp' not in url) and (url not in seen_urls):
    
#         try:
            
#             url_doc, track_edges, sequential_edges, played_edges, artist_edges, html = parse(url)

#             url_html_map[url] = html
            
#             played_docs.extend(played_edges)
#             sequential_docs.extend(sequential_edges)
#             track_docs.extend(track_edges)
#             tracklist_docs.append(url_doc)
#             artist_docs.extend(artist_edges)

#             print('Len played docs:', len(played_docs))
#             print('Len sequential docs:', len(sequential_docs))
#             print('Len track docs:', len(track_docs))
#             print('Len tracklist docs:', len(tracklist_docs))
#             print('Len artist docs:', len(artist_docs))
            
#             with open('played_docs3.pkl', 'wb') as f:
#                 pickle.dump(played_docs, f)
#             with open('sequential_docs3.pkl', 'wb') as f:
#                 pickle.dump(sequential_docs, f)
#             with open('track_docs3.pkl', 'wb') as f:
#                 pickle.dump(track_docs, f)
#             with open('tracklist_docs3.pkl', 'wb') as f:
#                 pickle.dump(tracklist_docs, f)
#             with open('artist_docs3.pkl', 'wb') as f:
#                 pickle.dump(artist_docs, f)
#             with open('url_html_map.pkl', 'wb') as f:
#                 pickle.dump(artist_docs, f)
                
#             seen_urls.append(url)
            
#         except Exception as e:
#             print(e)
        
#         print('waiting')
#         time.sleep(10)

### Fix sequential and track docs

In [20]:
# import ftfy
# import pymongo
# from urllib.request import Request, urlopen

# class Parser:
    
#     def __init__(self):
        
#         myclient = pymongo.MongoClient("mongodb://localhost:27017/")
#         db = myclient['1001']
        
#         self.url_html_map = db['url_html_map']
#         self.tracklist_collection = db['tracklist_docs']
#         self.played_collection = db['played_docs']
#         self.track_collection = db['track_docs_fixed']
#         self.artist_collection = db['artist_docs']
#         self.sequential_collection = db['sequential_docs_fixed']
        
#         self.track_docs = []
#         self.played_docs = []
#         self.artist_docs = []
#         self.tracklist_docs = []
#         self.sequential_docs = []

#     def request(self, url):

#         req = Request(url,\
#                       headers={'User-Agent': 'Mozilla/5.0'})
#         html = str(urlopen(req).read())
#         return html
        
#     def find_str(self, s, char, start_index=0):
#         '''
#         Find substring char in string s. Found on internet, probably not efficient.

#         '''
#         index = 0
#         s = s[start_index+1:]
#         if char in s:
#             c = char[0]
#             for ch in s:
#                 if ch == c:
#                     if s[index:index+len(char)] == char:
#                         return start_index + 1 + index
#                 index += 1
#         return -1

#     def fix_decoding_errors(self, string):
#         '''
#         Fix UTF-8 decoding issues. Probably need to find more systematic/thorough approach to this.

#         REPLACE THIS WITH ftfy.fix_text() -- python package which should be one stop shop for fixes
#         '''
#         string = string.replace('&amp;','&')
#         string = string.replace('&#39;',"'")
#         string = string.replace('\\xc3\\xb6','o')
#         string = string.replace('\\xc3\\xab','e')
#         string = string.replace('\\xc3\\x9','u')
#         string = string.replace('\\xc3\\xb8','o')
#         string = string.replace("\\'","'")

#         return ftfy.fix_text(string)

#     def parse_track_and_artist(self, track_string):
#         '''
#         Extract the artist, track name, and remixer (if any) from the standard formatting used by 1001.

#         '''
#         # Check if Remix/Bootleg/Edit and parse accordingly
#         if ('Remix' in track_string) or ('Bootleg' in track_string) or ('Edit' in track_string):

#             artist, track_remixer = [string.strip(' ') for string in track_string.split(' - ')]
#             track_remixer = [string.strip(' ') for string in track_remixer.split('(')]

#             if len(track_remixer) > 2:
#                 track = track_remixer[0]
#                 remixer = '('.join(track_remixer[1:])
#             else:
#                 track, remixer = track_remixer
#                 remixer = remixer.rstrip('Remix)').strip(' ')

#         # If not remix, then should follow standard layout "Artist Name - Track Name"
#         # This layout is expressed explicitly in html
#         else:

#             artist, track = [string.strip(' ') for string in track_string.split(' - ')]
#             remixer = 'N/A'

#         # Check for multiple artists -- Big Room sets tend to have hella mashups
#         # Sometimes there is more structured formatting to exploit i.e. (Artist1 vs. Artist2 - Track1 vs. Track2)
#         # Not worrying about that now b/c big room sux
#         if 'vs.' in artist:
#             artist = artist.replace('vs.','&')
#         if '&' in artist:
#             artist = [a.strip(' ') for a in artist.split('&')]

#         # Remove features
#         # We could make features a separate field but for now just removing
#         if isinstance(artist, str):
#             if ('feat.' in artist) or ('ft.' in artist):
#                 artist = artist.split('feat.')[0].strip(' ')
#                 artist = artist.split('ft.')[0].strip(' ')
#         if isinstance(artist, list):
#                 artist = [a.split('feat.')[0].split('ft.')[0].strip(' ') for a in artist]

#         if isinstance(artist, list):
#             return (artist, track, remixer)
#         else:
#             return ([artist], track, remixer)


#     def tracklist_meta_data(self, html):
#         '''
#         Extract meta data about tracklist/set.

#         '''
#         meta_data = {}

#         # Extract set description
#         index = 0
#         start_term = 'meta name="description" content="'
#         index = self.find_str(html, start_term, index)
#         description = html[index:].split('>')[0]
#         description = description.lstrip(start_term).rstrip('"')
#         meta_data['description'] = description

#         # Set creation date
#         # Should probably be the point in time we use for building prediction data
#         index = 0
#         start_term = 'meta name="dcterms.created" content="'
#         index = self.find_str(html, start_term, index)
#         created = html[index:].split('>')[0]
#         created = created.lstrip(start_term).rstrip('"')
#         meta_data['created'] = created

#         # Set last modified data
#         index = 0
#         start_term = 'meta name="dcterms.modified" content="'
#         index = self.find_str(html, start_term, index)
#         modified = html[index:].split('>')[0]
#         modified = modified.lstrip(start_term).rstrip('"')
#         meta_data['modified'] = modified

#         return meta_data

#     def tracklist_general_information(self, html):
#         '''
#         Extract general info about tracklist/set.

#         '''
#         info_doc = {}
#         index = 0
#         start_term = 'General Information'
#         index = self.find_str(html, start_term, index)
#         info_chunk = html[index:].split('Most Liked Tracklists')[0]

#         # Genres -- can use these to build genre-specific graphs
#         style_index = 0
#         style_index = self.find_str(info_chunk, 'Tracklist Musicstyle', style_index)
#         styles = info_chunk[style_index:].split('id="tl_music_styles">')[1].split('</td>')[0]
#         styles = [style.strip(' ') for style in styles.split(',')]
#         info_doc['styles'] = styles

#         # If 1001 recognizes the dj who played the set they link their dj page
#         # Its my understanding dj pages are independent of artist pages -- we'll need to map these
#         index = 0
#         start_term = 'a href="/dj'
#         index = self.find_str(html, start_term, index)
#         if index != -1:
#             dj_url = html[index:].split('class')[0].split('"')[1]
#             dj_url = 'https://www.1001tracklists.com' + dj_url
#             info_doc['dj_url'] = dj_url

#             dj_name = html[index:].split('</a>')[0].split('>')[1]
#             info_doc['dj_name'] = dj_name
#         else:
#             info_doc['dj_url'] = 'N/A'
#             info_doc['dj_name'] = 'N/A'

#         return info_doc
        
#     def tracklist_track_data(self, html):

#         '''
#         Extract track related data from set
#         '''
#         track_docs = {}
#         index = 0
#         while self.find_str(html, 'tracknumber_value">', index) != -1:

#             index = self.find_str(html, 'tracknumber_value">', index)
#             #print(index)
#             track_chunk = html[index:].split('<br>')[0]

#             # Extract track number
#             track_num = track_chunk[:22].split('<')[0].strip('tracknumber_value">')
#             #print('Track Number:', track_num)

#             # Extract track information
#             chunk_index = 0
#             chunk_index = self.find_str(track_chunk, 'meta itemprop="name" content=', chunk_index)
#             extracted_value = track_chunk[chunk_index:].strip('meta itemprop="name" content=').split('>')[0].strip('"')
#             clean_string = self.fix_decoding_errors(extracted_value)
#             #print(clean_string)

#             if len(clean_string) > 1:
#                 try:
#                     artist_list, track, remixer = self.parse_track_and_artist(clean_string)
#                 except:
#                     artist_list, track, remixer = None, None, None
#             else:
#                 artist_list, track, remixer = None, None, None
#             #print(artist_list, track, remixer)
                
#             # If the track info pull failed, skip this track
#             if artist_list is None:
#                 pass
#             # Avoid 'ID' entries for now
#             elif (('ID' in artist_list) or ('ID' in track)):
#                 pass
#             else:

#                 # Tends to be multiple artists so artists parsed to list even if only one
#                 for artist in artist_list:

#                     #print('Artist:',artist)
#                     #print('Track:', track)
#                     #print('Remixer:', remixer)

#                     # Extract artist page
#                     artist_index = 0
#                     artist_index = self.find_str(track_chunk, 'title="open artist page"', artist_index)
#                     if artist_index != -1:
#                         try:
#                             artist_url = track_chunk[artist_index:].split('class')[1].split('href="')[1].rstrip('" ')
#                             artist_url = 'https://www.1001tracklists.com' + artist_url
#                             #print('Aritst url:', artist_url)
#                         except:
#                             artist_url = 'N/A'
#                     else:
#                         artist_url = 'N/A'

#                     # Extract remixer page (if exists)
#                     if remixer != 'N/A':
#                         remixer_index = self.find_str(track_chunk, 'title="open remixer artist page"', artist_index)
#                         if remixer_index != -1:
#                             try:
#                                 remixer_url = track_chunk[remixer_index:].split('class')[1].split('href="')[1].rstrip('" ')
#                                 remixer_url = 'https://www.1001tracklists.com' + remixer_url
#                                 #print('Remixer url:', remixer_url)
#                             except:
#                                 remixer_url = 'N/A'
#                         else:
#                             remixer_url = 'N/A'
#                     else:
#                         remixer_url = 'N/A'
                    
#                     # Extract track page
#                     track_index = 0
#                     track_index = self.find_str(track_chunk, 'title="open track page"', artist_index)
#                     if track_index != -1:
#                         try:
#                             track_url = track_chunk[track_index:].split('class')[1].split('href="')[1].split('"')[0]
#                             track_url = 'https://www.1001tracklists.com' + track_url
#                             #print('track url:', track_url)
#                         except:
#                             track_url = 'N/A'
#                     else:
#                         track_url = 'N/A'

#                     track_doc = {\
#                                 'track_num': track_num,
#                                 'artist': artist.strip(' '),
#                                 'artist_url': artist_url.strip(' '),
#                                 'name': track.strip(' '),
#                                 'track_url': track_url.strip(' '),
#                                 'remixer': remixer.strip(' '),
#                                 'remixer_url': remixer_url.strip(' ')
#                                 }
#                     track_docs[track_num] = track_doc
#                     #print('\n\n\n')
        
#         #print(len(track_docs.keys()))
#         return track_docs

#     def build_artist_edges(self, url_doc, url):
#         '''
#         Build artist set-adjacency docs -- order n^2.
#         Don't iterate over the full set twice, since edges will be considered non-directional.
#         '''
#         all_tracks = {}
#         count = 0
#         these_tracks = list(url_doc['track_docs'].values())
#         for i in range(len(these_tracks)):
#             for j in range(i,len(these_tracks)):

#                 track = these_tracks[i]
#                 other_track = these_tracks[j]

#                 first_artist = track['artist']
#                 second_artist = other_track['artist']

#                 if first_artist != second_artist:
#                     _id = '_'.join([url,first_artist,second_artist])
#                     all_tracks[_id] = \
#                                         {
#                                         #'_id': _id,
#                                         'artist1': first_artist,
#                                         'artist2': second_artist,
#                                         'url': url
#                                         }
#         return all_tracks

#     def build_track_edges(self, track_docs, url):
#         '''
#         Build track set-adjacency docs -- order n^2.
#         Don't iterate over the full set twice, since edges will be considered non-directional.
#         '''
#         edge_docs = {}
#         keys = sorted(list(track_docs.keys()))
#         for i in range(len(keys)):
#             for j in range(i, len(keys)):

#                 key = keys[i]
#                 other_key = keys[j]

#                 if key != other_key:
#                     _id = '_'.join([url,'_'.join(key),'_'.join(other_key)])
#                     edge_docs[_id] = \
#                                         {
#                                         #'_id': _id,
#                                         'track1_name': track_docs[key]['name'],
#                                         'track1_artist': track_docs[key]['artist'],
#                                         'track1_remixer': track_docs[key]['remixer'],
#                                         'track2_name': track_docs[other_key]['name'],
#                                         'track2_artist': track_docs[other_key]['artist'],
#                                         'track2_remixer': track_docs[other_key]['remixer'],
#                                         'url': url
#                                         }
#         return edge_docs
                
#     def build_sequential_track_edges(self, track_docs, url):
#         '''
#         Allows for later "next track lookup" functionality

#         '''
#         enumerated_tracks = [(track_docs[key]['track_num'], track_docs[key])\
#                                  for key in list(track_docs.keys())]
#         enumerated_tracks = sorted(enumerated_tracks, key=lambda x: x[0])

#         seq_docs = {}
#         for track_idx in range(len(enumerated_tracks)-1):
#             _id = '_'.join(\
#                           [\
#                            url,\
#                            '_'.join(enumerated_tracks[track_idx][0]),\
#                            '_'.join(enumerated_tracks[track_idx+1][0])
#                           ]
#                         )
#             seq_docs[_id] = \
#                            {
#                            #'_id': _id,
#                            'url': url,
#                            'track1_name': enumerated_tracks[track_idx][1]['name'],
#                            'track1_artist': enumerated_tracks[track_idx][1]['artist'],
#                            'track1_remixer': enumerated_tracks[track_idx][1]['remixer'],
#                            'track2_name': enumerated_tracks[track_idx+1][1]['name'],
#                            'track2_artist': enumerated_tracks[track_idx+1][1]['artist'],
#                            'track2_remixer': enumerated_tracks[track_idx+1][1]['remixer'],
#                            'first_position': str(enumerated_tracks[track_idx][0]),
#                            'second_position': str(enumerated_tracks[track_idx+1][0]),   
#                            }
#         return seq_docs

#     def build_played_playedby_edge(self, url_doc, url):
#         '''
#         Allows you to map who plays who.
#         I think it would be interesting to study directional graphs from this.

#         '''
#         dj_name = url_doc['dj_name']
#         dj_url = url_doc['dj_url']

#         if (dj_name == 'N/A') or (dj_url == 'N/A'):
#             return {}

#         played_docs = {}
#         for track_doc in list(url_doc['track_docs'].values()):
#             _id = '_'.join([url,dj_name,track_doc['name']])
#             played_docs[_id] = \
#                               {
#                               #'_id': _id,
#                               'url': url,
#                               'played_by': dj_name,
#                               'played_by_url': dj_url,
#                               'played': track_doc['name'],
#                               'played_track_url': track_doc['track_url'],
#                               'played_artist': track_doc['artist'],
#                               'played_artist_url': track_doc['artist_url'],
#                               'played_remixer': track_doc['remixer'],
#                               'played_remixer_url': track_doc['remixer_url']
#                               }
#         return played_docs
    
    
#     def parse(self, url, html): 

#         url_doc = {}
#         url_doc['url'] = url
#         url_doc['html'] = html
#         self.url_html_map.insert_one(url_doc)
            
#         try:
            
#             url_doc.update(self.tracklist_meta_data(html))
#             url_doc.update(self.tracklist_general_information(html))

#             track_docs = self.tracklist_track_data(html)
#             url_doc['track_docs'] = track_docs

#             track_edges = self.build_track_edges(track_docs, url).values()
#             sequential_edges = self.build_sequential_track_edges(track_docs, url).values()
#             played_edges = self.build_played_playedby_edge(url_doc, url).values()
#             artist_edges = self.build_artist_edges(url_doc, url).values()

# #             self.tracklist_collection.insert_one(url_doc)
#             for doc in track_edges:
#                 self.track_collection.insert_one(doc)
# #             for doc in artist_edges:
# #                 self.artist_collection.insert_one(doc)
# #             for doc in played_edges:
# #                 self.played_collection.insert_one(doc)
#             for doc in sequential_edges:
#                 self.sequential_collection.insert_one(doc)

#             self.tracklist_docs.append(url_doc)
#             #self.played_docs.extend(played_edges)
#             self.track_docs.extend(track_edges)
#             #self.artist_docs.extend(artist_edges)        
#             self.sequential_docs.extend(sequential_edges)

#             print('Len tracklist docs:', len(self.tracklist_docs))
#             #print('Len played docs:', len(self.played_docs))
#             print('Len sequential docs:', len(self.sequential_docs))
#             print('Len track docs:', len(self.track_docs))
#             #print('Len artist docs:', len(self.artist_docs))
        
#         except Exception as e:
#             print(str(e))

# import pymongo

# myclient = pymongo.MongoClient("mongodb://localhost:27017/")

# mydb = myclient['1001']
# url_docs = list(mydb['url_html_map'].find({}))
# print(len(url_docs))

# prsr = Parser()
# for doc in url_docs[:]:
#     print(doc['url'])
#     if ('/tracklist/' in doc['url']) and ('http' in doc['url']) and ('#tlp' not in doc['url']):
#         prsr.parse(doc['url'], doc['html'])
    