In [70]:
# setup
from bs4 import BeautifulSoup
import requests
import re
import datetime
import pandas as pd
import time
import sys

# Python 2 only: force the implicit str<->unicode codec to UTF-8 so non-ASCII
# text in the scraped pages does not raise UnicodeEncodeError. Python 3 has no
# builtin reload() or sys.setdefaultencoding() and already defaults to UTF-8,
# so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

In [71]:
# initialization

# wall-clock timestamp (time.time(), in seconds) taken before scraping starts;
# a second reading taken after the scrape is compared against it
t0 = time.time()

# number of consecutive days of concert listings to scrape, counting from today
numDays = 365

# data frame that will hold the scraped concert info (one row per concert)
concerts = pd.DataFrame()

In [72]:
# function to scrape info from a given link on site
def _tag_text(soup, css_class):
    """Return the text of the first <div class=css_class>, or '' if the page has none."""
    tag = soup.find('div', css_class)
    return tag.get_text() if tag is not None else ''


def _price_range(price):
    """Return (minprice, maxprice) for a lower-cased price string.

    'free' anywhere in the string wins and yields (0.0, 0.0); a string with
    no dollar amounts yields (None, None).
    """
    if 'free' in price:
        return 0.00, 0.00
    amounts = [float(amount.strip('$'))
               for amount in re.findall(r'(\$\d+\.?\d?\d?)', price)]
    if not amounts:
        return None, None
    return min(amounts), max(amounts)


def scrape(link, timeout=30):
    """Scrape one event page and return its concert details.

    Parameters
    ----------
    link : str
        Absolute URL of the event page.
    timeout : int or float, optional
        Seconds to wait for the server before requests raises a timeout
        error, so one stalled connection cannot hang the whole scrape.

    Returns
    -------
    list
        [venue, minprice, maxprice, neighborhood, address_full, time].
        A field that cannot be found on the page is returned as None
        (or '' for neighborhood) instead of raising.
    """
    # parse webpage with BeautifulSoup
    html = requests.get(link, timeout=timeout).text
    soup = BeautifulSoup(html)

    # extract venue from <title> tag; the code expects ' | '-separated parts
    # with the venue second (genres come third but are not part of the result,
    # so a title without them is no longer an error)
    page_title = soup.title.string if soup.title is not None else None
    title_parts = page_title.split(' | ') if page_title else []
    venue = title_parts[1] if len(title_parts) > 1 else None

    # get price; .string is None when the div holds more than a single text
    # node, which is treated the same as "no price given"
    price_tag = soup.find('div', 'price')
    if price_tag is not None and price_tag.string is not None:
        price = price_tag.string.lower()
    else:
        price = ''
    minprice, maxprice = _price_range(price)

    # get time of day, e.g. '8:00 p.m.' (named show_time so the local does
    # not shadow the time module)
    time_matches = re.findall(r'(\d*:\d* .\.m\.)', _tag_text(soup, 'when'))
    show_time = time_matches[0] if time_matches else None

    # get neighborhood, stripping the leading blank line and trailing newline
    neighborhood = _tag_text(soup, 'neighborhood')
    neighborhood = re.sub(r'(\n\n *)', '', neighborhood)
    neighborhood = re.sub(r'( *\n)$', '', neighborhood)

    # get address info: element 0 is whatever precedes the first line break,
    # elements 1-3 are street, city and zipcode (a following phone line is
    # not part of the result and is ignored)
    address_parts = re.split(r'(?:\xa0)?\n +|\D+$', _tag_text(soup, 'address'))
    address_fields = address_parts[1:4]
    if len(address_fields) < 3:
        # incomplete address: keep whatever non-empty pieces are present
        address_fields = [field for field in address_fields if field]
    address_full = ', '.join(address_fields) if address_fields else None

    # return list with relevant info
    return [venue, minprice, maxprice, neighborhood, address_full, show_time]

In [73]:
# get list of concert links from today's date in defined range

# Read "today" once: the scrape runs for many minutes, and re-reading the
# clock inside the loop would shift the date window (skipping or repeating a
# day) if the run crosses midnight.
startDate = datetime.date.today()

# Rows are gathered as plain dicts and converted to a data frame once at the
# end; calling pd.concat per row re-copies every earlier row each time.
records = []

for j in range(numDays):
    # get string with date
    date = startDate + datetime.timedelta(days=j)
    dateString = date.strftime("%Y-%m-%d")

    # parse webpage with BeautifulSoup
    url = 'http://www.villagevoice.com/concerts?date=' + dateString
    text = requests.get(url).text
    text = BeautifulSoup(text)

    # get artists on page with links to more info; the first 'title' div is
    # skipped (presumably a page heading rather than an event - verify
    # against the live markup)
    artistTags = text.find_all('div', 'title')[1:]

    # iterate through event links in page and add info as new row
    for tag in artistTags:
        href = tag.a.get('href') if tag.a is not None else None
        if href is None:
            # no detail page to follow for this entry, so nothing to scrape
            continue
        info = scrape('http://www.villagevoice.com' + href)
        records.append({'date': dateString, 'artist': tag.get_text(),
                        'venue': info[0], 'minprice': info[1],
                        'maxprice': info[2], 'neighborhood': info[3],
                        'address': info[4], 'time': info[5]})

# Build the frame in one step. Assigning (rather than appending to the
# existing frame) keeps the cell idempotent on re-run, and naming the columns
# keeps them present even when no events were found.
concerts = pd.DataFrame(records, columns=['date', 'artist', 'venue', 'minprice',
                                          'maxprice', 'neighborhood', 'address',
                                          'time'])

In [74]:
# put the columns in presentation order and renumber the rows from 0
cols = [
    'date', 'artist', 'venue', 'time',
    'minprice', 'maxprice', 'neighborhood', 'address',
]
concerts = concerts.loc[:, cols].reset_index(drop=True)

In [77]:
# wall-clock timestamp (time.time(), in seconds) taken once scraping has finished
t1 = time.time()

In [78]:
# see how long it took to scrape the data: elapsed seconds between the two
# time.time() readings
t1 - t0

1611.720999956131

In [82]:
# save results to csv for analysis
# the encoding is stated explicitly so non-ASCII artist/venue names are
# written as UTF-8 regardless of the interpreter's default encoding
concerts.to_csv('concerts.csv', encoding='utf-8')