In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import utilities
from selenium import webdriver
from datetime import date, timedelta

## Scraping strategy: initial steps

1. Extract the HTML from the website via Selenium
2. Soupify (process) the HTML via BeautifulSoup
3. Determine what HTML tags are in the soup and process them into a set
4. Get the corresponding classes to the HTML tags and check with the website which sections are important
    - At this step we need the stage, artist and set times
    - These are found in the HTML in `span` elements that have either an `itemprop` or a `class` attribute
5. Process the text from the corresponding tags
6. Translate the data into a pandas dataframe where: 
    - Each row has the start time, end time, venue and artist name
    - `Note: this website is helpful as the order in which the festival sets are scraped is chronological.`
7. Output dataframe as CSV files

# Soupify the URL

In [None]:
# Partyflock timetable page for each festival day.
TIMETABLE_URLS = {
    'friday': 'https://partyflock.nl/party/429538:Paaspop',
    'saturday': 'https://partyflock.nl/party/445628:Paaspop',
    'sunday': 'https://partyflock.nl/party/445629:Paaspop',
}
# Day to scrape. Previously all three URLs were assigned to `timetable_url`
# in sequence, so only the last one (friday) was ever used; the choice is
# now explicit.
FESTIVAL_DAY = 'friday'
timetable_url = TIMETABLE_URLS[FESTIVAL_DAY]

driver = webdriver.Chrome()
try:
    driver.maximize_window()
    driver.get(timetable_url)

    # Fixed wait to give the page time to finish rendering before the
    # HTML is captured.
    time.sleep(5)
    content = driver.page_source.encode('utf-8').strip()
finally:
    # Always shut the browser down, even when loading the page fails,
    # so no orphaned Chrome/driver process is left behind.
    driver.quit()

soup = BeautifulSoup(content, "html.parser")

# Create element subset with attr `itemprop` or `class`

From the soupified results, get all the `span` elements.
Then, from those, collect the text of the elements that have an `itemprop` or `class` attribute.

In [None]:
# Text of every non-empty <span> that has an `itemprop` and/or a `class`
# attribute, in document order. A span with both attributes contributes
# two entries (tagged first, then untagged).
element_list = []

for span in soup.find_all('span'):
    span_text = span.get_text()
    if span_text == '':
        # Nothing to record for spans without text.
        continue
    if span.has_attr("itemprop"):
        # The ` category:artist` suffix marks artist-name entries; it is
        # used (and stripped) later when the artist names are extracted.
        element_list.append(span_text + ' category:artist')
    if span.has_attr("class"):
        element_list.append(span_text)

### Getting stages

The `element_list` is structured something like:
```
[
"1",
"details_",
"details_",
"details_",
"details_",
"2",
"details_",
"details_",
"details_",
"details_",
"3",
....
]
```

The entries in `element_list` that are just a number map directly to the stages from the HTML.

This is useful for separating the soupified results so that the data can be parsed per stage.

In [None]:
NO_STAGES = 17  # 16 stages + 1, so that range(1, NO_STAGES) reaches stage 16
# Position in `element_list` of the first entry equal to "1", "2", ..., "16".
# list.index raises ValueError if a stage number is missing from the list.
stage_indices = [
    element_list.index(str(stage_number))
    for stage_number in range(1, NO_STAGES)
]

In [None]:
# `stage_indices` holds the positions of the stage-number markers, not of the
# stage data itself; each stage's data starts one element after its marker.
start_stage_indices = [marker_index + 1 for marker_index in stage_indices]

# The final entries of `element_list` are junk (per the original author's
# note), so the last stage is closed at this sentinel string rather than at
# the end of the list.
end_string_identifier = '@ 12 augustus 2022'
end_string_index = element_list.index(end_string_identifier)

# A stage ends at (position of the next stage's marker + 1); the last stage
# ends at the sentinel.
# NOTE(review): slice ends are exclusive, so `next marker + 1` appears to leave
# the next stage's number marker as the final element of every slice except
# the last one - confirm that the downstream helpers expect/ignore it.
end_stage_indices = [marker_index + 1 for marker_index in stage_indices[1:]] + [end_string_index]

In [None]:
# If start and end stage lens are different, can't create start_stops
assert len(start_stage_indices) == len(end_stage_indices)
start_stops = list(zip(start_stage_indices, end_stage_indices))

In [None]:
# Cut `element_list` into one sub-list per stage, using each (start, stop)
# pair as the half-open slice bounds.
sliced_list_of_sets = [
    element_list[start_index:stop_index]
    for start_index, stop_index in start_stops
]

## Can now map stages to performer data

`sliced_list_of_sets` is a list of lists.

Each inner list holds the entries for one stage. The next step is to process each of them.

In [None]:
# Map each stage name (first entry of a stage's slice) to the remaining
# entries of that slice, i.e. the performer data for the stage.
# Note: a later cell rebinds `festival_set_dict` to the result of
# utilities.get_performer_data_per_stage(sliced_list_of_sets).
festival_set_dict = {
    stage_entries[0]: stage_entries[1:]
    for stage_entries in sliced_list_of_sets
}

In [None]:
# Calendar date passed to utilities.map_to_datetime_keys for every stage.
# NOTE(review): 7 April 2022 was a Thursday, while the output files are
# labelled "friday" - confirm the intended date/year.
festival_date = date(2022, 4, 7)
# Marker that was appended to artist entries during scraping; removed again below.
substring_to_clean = "category:artist"

stage_datetime_map = {}
festival_set_dict = utilities.get_performer_data_per_stage(sliced_list_of_sets)
for stage, stage_list in festival_set_dict.items():
    # 1. locate the timestamp entries within this stage's list
    time_stamp_indices = utilities.get_timestamp_indices(stage_list)
    # 2. pair the timestamps with the artist entries
    time_stamp_artist_map = utilities.map_timestamp_to_artist(
        time_stamp_indices, stage_list
    )
    # 3. strip the scraping marker from the artist entries
    cleaned_time_stamp_artist_map = utilities.clean_artist_substring_tags(
        time_stamp_artist_map, substring_to_clean
    )
    # 4. re-key the map using `festival_date` and store it under the stage
    stage_datetime_map[stage] = utilities.map_to_datetime_keys(
        cleaned_time_stamp_artist_map, festival_date
    )

In [None]:
# Build one DataFrame per stage, then concatenate them in a single call.
# Calling pd.concat inside the loop re-copies every accumulated row on each
# iteration; collecting the frames first avoids that.
stage_frames = [
    pd.DataFrame.from_dict(
        utilities.unpack_festival_datetime_artist_dict(stage_setlist, stage)
    )
    for stage, stage_setlist in stage_datetime_map.items()
]

# pd.concat raises ValueError when given no objects, so fall back to an
# empty frame when there are no stages (matches the previous behaviour).
# The per-stage row index is kept as-is (no ignore_index), as before.
festival_df = pd.concat(stage_frames, axis=0) if stage_frames else pd.DataFrame()

In [None]:
# Rows whose end is earlier than their start - presumably sets that run past
# midnight - get their end moved to the following day.
# Assumes "start"/"end" hold datetime-like values - TODO confirm.
entries = festival_df["end"] < festival_df["start"]
festival_df.loc[entries, "end"] += timedelta(days=1)

In [None]:
# Persist the timetable in three formats: pickle, CSV and a JSON array of
# row records.
festival_df.to_pickle('paaspop_friday.pkl')
festival_df.to_csv('paaspop_friday.csv')

out = festival_df.to_json(orient='records')
with open("paaspop_friday.json", "w") as json_file:
    json_file.write(out)