# Scraping playaevents.burningman.org

In [484]:
import os

import numpy  as np
import pandas as pd

import urllib3
urllib3.disable_warnings()

from bs4 import BeautifulSoup as bs

from tqdm import tqdm

## Scrape Event Links

In [478]:
# Walk the paginated event listing for `year` and collect one link per event.
year = 2017

http = urllib3.PoolManager()

# Maps event title -> href of the event page.
# NOTE(review): keyed by title, so two events sharing a title collapse into
# a single entry (the later one wins) -- confirm this is acceptable.
links = {}
i = 1

while True:
    links_page = "https://playaevents.burningman.org/" + str(year) + "/playa_events/" + str(i)
    r = http.request('GET', links_page)
    # BeautifulSoup is imported under the alias `bs`; the bare name
    # `BeautifulSoup` is not defined in this notebook.
    soup = bs(r.data, "html.parser")

    # Stop at the first page past the end of the listing. The body check is
    # the original stop condition; the status check also ends the loop on
    # any HTTP error response instead of requesting pages forever.
    if r.status >= 400 or soup.prettify().startswith("Bad Request"):
        break

    for link in soup.find_all('a', attrs={'class': 'gold-flame'}):
        # First child of the anchor is used as the title, with newlines and
        # surrounding whitespace removed.
        links[str(link.contents[0].replace('\n', '').strip())] = link.get('href')

    print(i)
    i += 1

print("Number of Events = " + str(len(links)))

1
2
3
4
5
6
7
8
9
Number of Events = 3496


## Scrape Events from Links

In [471]:
# Some logic to identify the presence of event features

def process_feature(feature):
    """Return the ``(name, text)`` pair held by one row element of an event page.

    Two row layouts are handled:

    * The row contains ``div.col-xs-12`` elements: the ``<p>`` of the first
      one supplies the name and the ``<p>`` of the second one the text.
    * Otherwise the name comes from ``div.col-xs-4`` and the text from
      ``div.col-xs-8``.

    In both layouts the last character of the name is dropped (presumably a
    trailing colon -- confirm against the page markup).

    Parameters
    ----------
    feature : element exposing ``find`` / ``find_all`` / ``contents``
        One row of the event display (a BeautifulSoup tag in this notebook).

    Returns
    -------
    tuple
        ``(name, text)``.
    """
    stacked_cells = feature.find_all('div', attrs={'class': 'col-xs-12'})

    if stacked_cells:
        # Stacked layout: label cell first, value cell second.
        name = stacked_cells[0].find('p').contents[0][:-1]
        text = stacked_cells[1].find('p').contents[0]
        return name, text

    # Side-by-side layout: label on the left ...
    label_cell = feature.find('div', attrs={'class': 'col-xs-4'})
    label_paragraph = label_cell.find('p')

    if label_paragraph:
        raw_label = label_paragraph.contents[0]
    else:
        # No <p> wrapper: clean up the cell's first child instead.
        raw_label = label_cell.contents[0].replace('\n', '').strip()
    name = raw_label[:-1]

    # ... value on the right. Prefer an <a>, then a <p>.
    value_cell = feature.find('div', attrs={'class': 'col-xs-8'})
    anchor = value_cell.find('a')
    paragraph = value_cell.find('p')

    for wrapper in (anchor, paragraph):
        if wrapper:
            return name, wrapper.contents[0]

    # Neither wrapper exists: stringify every child (nested tags keep their
    # markup), join with spaces, then strip newlines, tabs and double spaces.
    text = ' '.join(str(child) for child in value_cell.contents)
    for junk in ('\n', '\t', '  '):
        text = text.replace(junk, '')

    return name, text.lstrip()

In [479]:
# Fetch every event page found above and turn its labelled rows into one
# record per event.
#
# Records are accumulated in a plain list and converted to a DataFrame once
# at the end: `DataFrame.append` copies the whole frame on every call and no
# longer exists in pandas >= 2.0. Columns therefore appear in first-seen
# order, starting with 'Title'.
records = []

for links_name, links_href in tqdm(links.items()):
    links_page = "https://playaevents.burningman.org" + links_href

    r = http.request('GET', links_page)

    # BeautifulSoup is imported under the alias `bs`; the bare name
    # `BeautifulSoup` is not defined in this notebook.
    soup = bs(r.data, "html.parser")

    # Every div.row inside the event-display container is treated as one
    # labelled field of the event.
    features = soup.find('div', attrs={'class': 'event-display'}).find_all('div', attrs={'class': 'row'})

    feat_dict = {'Title': links_name}

    for feature in features:
        feature_name, feature_text = process_feature(feature)
        feat_dict[feature_name] = feature_text

    records.append(feat_dict)

# Fields missing from an event's page become NaN in that row.
data = pd.DataFrame(records)

100%|██████████████████████████████████████████████████████████████████████████████| 3496/3496 [06:07<00:00,  9.52it/s]


In [480]:
# Spot-check 10 randomly drawn rows; no random_state is given, so the rows
# shown differ from run to run.
data.sample(10)

Unnamed: 0,Date and Time(s),Description,Hosted by Camp,Title,Type,Contact Email,Location,URL,Located at Art
1086,"Tuesday, August 29th, 2017 10 a.m. – 10:30 a.m...",We all need to know how to sew on a button sin...,Feed tHE ARTists,SEWING 101 with Martha Sartori,Class/Workshop,stephaniesartori@gmail.com,,,
891,"Monday, August 28th, 2017 6 p.m. – 11:45 p.m. ...",In search of a lil' boundary-pushing? Give the...,Lip Bomb,Wheel! Of! Misfortune!,Game,camplipbomb@gmail.com,,http://lipbomb.org/,
1515,"Tuesday, August 29th, 2017 4 p.m. – 5 p.m. <br/>",Environmental Campaigner and AEZ villager Aman...,Earth Guardians,Breaking Free from Fossil Fuels!,Class/Workshop,,lorax,http://earthguardians.net/,
352,"Monday, August 28th, 2017 10 a.m. – 11 a.m. <b...",Come learn how to hula hoop!! We'll teach you ...,Naked Rainbow,Flow Workshop: Hula Hoop,Class/Workshop,,Naked Rainbow,,
1488,"Tuesday, August 29th, 2017 4 p.m. – 5 p.m. <br/>",Join us for this talk about all things CBD oil...,Red Lightning,The Amazing Benefits of CBD Oil,Ritual/Ceremony,,,http://redlightning.org,
2962,"Friday, September 1st, 2017 9 a.m. – 10 a.m. <...",Stitch & Twitch: Beginning crochet class and ...,Hookers & Makers,Stitch & Twitch,Food,,elieb@mountaindevelopment.com,,
2851,"Thursday, August 31st, 2017 7 p.m. – 8 p.m. <b...",Have you ever considered becoming a Black Rock...,Ranger Headquarters,Learn About the Black Rock Rangers,Other,,Ranger Headquarters,http://rangers.burningman.org/,
600,"Monday, August 28th, 2017 1 p.m. – 2 p.m. <br/...",Come hear smarty-pants bibliophile Burners sha...,Black Rock Public Library,Literary Salon,Other,blackrockpubliclibrary@gmail.com,,https://blackrockpubliclibrary.org/events,
2364,"Wednesday, August 30th, 2017 9 p.m. – 11:45 p....",Re-live the Paris of the 1900's. French Cancan...,FAFA Camp,Bal guinguette,Gathering/Party,,,,
2145,"Wednesday, August 30th, 2017 4 p.m. – 6 p.m. <...",Hello folks! Arlequin and Pulcinella need you!...,,Improvisation theatre games,Game,,ruggerotartaro@yahoo.it,,


In [481]:
# Print per-column dtypes and non-null counts to see how sparse each field is.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3496 entries, 0 to 3495
Data columns (total 9 columns):
Date and Time(s)    3496 non-null object
Description         3496 non-null object
Hosted by Camp      3258 non-null object
Title               3496 non-null object
Type                3496 non-null object
Contact Email       878 non-null object
Location            1342 non-null object
URL                 1144 non-null object
Located at Art      56 non-null object
dtypes: object(9)
memory usage: 245.9+ KB


In [482]:
# Summarise each column (count / unique / top / freq for object columns).
data.describe()

Unnamed: 0,Date and Time(s),Description,Hosted by Camp,Title,Type,Contact Email,Location,URL,Located at Art
count,3496,3496,3258,3496,3496,878,1342,1144,56
unique,1750,3464,585,3496,12,353,576,381,32
top,"Wednesday, August 30th, 2017 1 p.m. – 2 p.m. <...",The PlayaPops is glad to offer its fourth seas...,Naked Heart,Breakfast & Beats Under the Sheets,Class/Workshop,sleepless@burningman.org,nakedheart,http://www.hexcollective.org,Step Forward
freq,23,5,86,1,1158,46,42,35,7


In [483]:
# Persist the scraped table, one CSV per year, without the row index.
# `to_csv` does not create missing directories, so make sure the output
# folder exists first rather than failing after the long scrape.
os.makedirs('raw_data', exist_ok=True)
data.to_csv('raw_data/raw_data_' + str(year) + '.csv', index=False)