In [24]:
import os
import requests
import re
import time
from bs4 import BeautifulSoup

import pandas as pd

### Parses all compositions programmed in the Ensemble Modern season

In [26]:
# Before running, reset initial_month to the current month: the first request
# below asks for the calendar without a month suffix (presumably the current
# month - confirm against the site).

root_path = 'C:/Users/Danusia/programming/tmp/'
directory = 'EM_events/'

path = os.path.join(root_path, directory)
# exist_ok=True keeps this cell re-runnable; makedirs also creates missing parents
os.makedirs(path, exist_ok=True)

# Relative urls of the months already visited, seeded with the month we start with
initial_month = '/2022-09'
explored = [initial_month]

# Holds next month's relative url ('' requests the calendar's default page)
next_mo=''
i=0

while True:
    req = requests.get(''.join(['https://www.ensemble-modern.com/en/calendar', next_mo]),
                       timeout=30)
    # Fail loudly instead of saving an error page to disk
    req.raise_for_status()
    with open('{}event_EM_{}'.format(path, i), 'w', encoding='utf-8') as file:
        file.write(req.text)
    soup_tmp = BeautifulSoup(req.text, 'html5lib')

    # Finds a link to next month's calendar: the second link inside the month selector
    selector = soup_tmp.find(class_="w__search--selection-cell w__calendar--table")
    links = selector.find_all('a', href=True, limit=2) if selector is not None else []
    if len(links) < 2:
        # The next-month link cannot be identified on this page: stop crawling
        break
    next_mo = links[1]['href'].replace('/en/calendar', '')
    if next_mo in explored:
        break
    # Remember every visited month so that any cycle in the links ends the loop
    explored.append(next_mo)
    i+=1
    # Be polite to the server between requests
    time.sleep(3)

In [28]:
### Uncomment to run without the previous cell

# root_path = 'C:/Users/Danusia/programming/tmp/'
# directory = 'EM_events/'
# path = os.path.join(root_path, directory)


# List of dfs (one per month that lists any works) containing all works performed
works=[]
# Count only the saved calendar pages so stray files in the folder are ignored
num_files = sum(name.startswith('event_EM_') for name in os.listdir(path))
if num_files == 0:
    raise FileNotFoundError('No event_EM_* files found in {}'.format(path))

for file_no in range(num_files):
    with open('{}event_EM_{}'.format(path, file_no), 'r', encoding='utf-8') as file:
        soup_tmp = BeautifulSoup(file.read(), 'html5lib')

    # Composer/work records for every piece listed in this month
    records = []
    events = soup_tmp.find_all(class_="w__concert w__js-button")
    for event in events:
        pieces_cell = event.find(class_="w__concert--works-cell")
        if pieces_cell is None:
            # Event without a works cell: nothing to collect
            continue
        for piece in pieces_cell.find_all('strong'):
            # The title is read from the text node that directly follows the
            # <strong> tag holding the composer's name
            title = piece.next_sibling
            if not isinstance(title, str):
                # No text after the composer (end of cell or another tag): skip
                continue
            records.append({'Composer': piece.text, 'Work': title.lstrip(': ')})

    # Df containing all works in a given month (months without works are skipped)
    if records:
        works.append(pd.DataFrame(records, columns=['Composer', 'Work']))

# Df containing all works performed; pd.concat raises on an empty list,
# so fall back to an empty frame when nothing was parsed
if works:
    final_df = pd.concat(works, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=['Composer', 'Work'])

# Extracting year of the composition to a new column
# (raw strings: '\(' and '\d' are invalid escapes in ordinary string literals)
final_df['Year'] = final_df['Work'].str.extract(r'\((?P<Year>\d{4})\)', expand=False)
final_df['Work'] = final_df['Work'].str.replace(r'\(\d{4}\)', '', regex=True)

# Extracting the info about newly commissioned pieces
final_df['Premiere'] = final_df['Work'].str.contains(r'\(World premiere\)').astype('int')
final_df['Work'] = final_df['Work'].str.replace(r'\(World premiere\)', '', regex=True)

# Removing the markers leaves surrounding whitespace behind; strip it so that
# identical titles compare equal in the de-duplication below
final_df['Work'] = final_df['Work'].str.strip()

# Drops duplicates after transformations
final_df = final_df.drop_duplicates(subset=['Composer', 'Work']).reset_index(drop=True)

In [32]:
# Display the final_df table (columns: Composer, Work, Year, Premiere)
final_df

Unnamed: 0,Composer,Work,Year,Premiere
0,Flo Mounier,Solo Piece,2022,0
1,Bernhard Gander,"OOZING EARTH for voice, extreme-metal drummer...",2019,0
2,Samuel Beckett,Quad,1981,0
3,John Cage,"Radio Music - for one to eight performers, eac...",1956,0
4,Heiner Goebbels,Toccata for Teapot and Piccolo - aus : Schwarz...,,0
...,...,...,...,...
91,Guillem Palomar,Volta,2022,0
92,Justė Janulytė,Sleeping patterns,2022,0
93,Jörg Widmann,Neues Werk 2022,2022,0
94,Jörg Widmann,Sphinxensprüche und Rätselkanons,2005,0


### Let's see whose pieces are going to be performed more than once by the end of the year

In [38]:
# Number of rows per composer in final_df (value_counts sorts most frequent first)
counts = final_df['Composer'].value_counts()
# Keep only the composers that occur more than once
counts.loc[counts.gt(1)]

Jörg Widmann             9
Heiner Goebbels          7
Wolfgang Rihm            3
John Cage                3
Hannah Kendall           2
Loïc Destremau           2
Tania León               2
Diego Ramos Rodriguez    2
Rebecca Saunders         2
Kathrin A. Denner        2
Justė Janulytė           2
Jong hoon Kim            2
Yitzhak Yedid            2
Jessie Cox               2
Pablo Garretón           2
Name: Composer, dtype: int64

### Let's now move on to the Ensemble Intercontemporain website

First of all, we download the content of all the pages containing events of interest.

In [45]:
# Make sure to set desired location for saving files locally

req = requests.get('https://www.ensembleintercontemporain.com/en/calendar/', timeout=30)
# Fail loudly instead of parsing an error page
req.raise_for_status()
soup_tmp = BeautifulSoup(req.text, 'html5lib')

events_block = soup_tmp.find(class_="grid events isotope blocks")
if events_block is None:
    raise RuntimeError('Events listing not found on the calendar page; '
                       'the page layout may have changed')
# Every linked item inside the listing points to one event page
event_items = events_block.find_all(class_="item-content", href=True)
event_urls = [event['href'] for event in event_items]

# Folder that receives one saved html file per event
root_path = 'C:/Users/Danusia/programming/tmp/'
directory = 'EI_events/'
path = os.path.join(root_path, directory)

# exist_ok=True keeps this cell re-runnable; makedirs also creates missing parents
os.makedirs(path, exist_ok=True)

for i, event in enumerate(event_urls):
    # Be polite to the server between requests
    time.sleep(3)
    req = requests.get(event, timeout=30)
    req.raise_for_status()
    with open('{}event_EI_{}'.format(path, i), 'w', encoding="utf-8") as file:
        file.write(req.text)

In [3]:
### Uncomment to run without the previous cell

# root_path = 'C:/Users/Danusia/programming/tmp/'
# directory = 'EI_events/'
# path = os.path.join(root_path, directory)

# List of dfs, one per event page that lists any works
works = []
# Count only the saved event pages so stray files in the folder are ignored
num_files = sum(name.startswith('event_EI_') for name in os.listdir(path))
if num_files == 0:
    raise FileNotFoundError('No event_EI_* files found in {}'.format(path))

# Iterate through files
for i in range(num_files):
    with open('{}event_EI_{}'.format(path, i), 'r', encoding="utf-8") as file:
        soup_tmp = BeautifulSoup(file.read(), 'html5lib')
    event_panel = soup_tmp.find('aside')
    sections = event_panel.find_all('section') if event_panel is not None else []
    if len(sections) < 2:
        # The second <section> of the <aside> is read as the programme;
        # pages that lack it have nothing to collect
        continue
    program = sections[1]
    composers = program.find_all('p')

    # Composer/work records scraped from the current event
    records = []
    for p in composers:
        # Ignore paragraphs that are not relevant (ticket prices, photo credits)
        if 'Tarifs' in p.text or 'Photo' in p.text:
            continue
        # The first <strong> holds the composer's name, the following ones the
        # titles of that composer's pieces
        works_comp = p.find_all('strong')
        for work in works_comp[1:]:
            records.append({'Composer': works_comp[0].text.strip(),
                            'Work': work.text.strip()})
    if records:
        works.append(pd.DataFrame(records, columns=['Composer', 'Work']))

# pd.concat raises on an empty list, so fall back to an empty frame
if works:
    final_df_ei = (pd.concat(works)
                   .drop_duplicates()
                   .reset_index(drop=True))
else:
    final_df_ei = pd.DataFrame(columns=['Composer', 'Work'])

In [None]:
# Something could be developed to create 'cast' and 'commission' columns. "Commande" as a keyword???

# for sib in work.next_siblings:
#                     cast=''
#                     if 'pour' in sib:
#                         cast=sib
#                         break

                        

In [23]:
# Preview the first rows of final_df_ei (columns: Composer, Work)
final_df_ei.head()

Unnamed: 0,Composer,Work
0,Dieter AMMANN,BOOST
1,Dieter AMMANN,glut
2,Pierre BOULEZ,Messagesquisse
3,Rebecca SAUNDERS,Wound
4,Matthias PINTSCHER,skull


We can see some messy entries due to inconsistencies on the website. Let's clean them up manually.

In [17]:
def selection(row, counts=None):
    """Tell whether the composer of ``row`` occurs more than once in ``final_df_ei``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Composer'``, e.g. a row of ``final_df_ei``.
    counts : pandas.Series, optional
        Precomputed ``final_df_ei['Composer'].value_counts()``. When omitted
        the counts are computed on each call, which is slow when the function
        is applied to every row.

    Returns
    -------
    bool
        True when the composer appears in more than one row.
    """
    if counts is None:
        counts = final_df_ei['Composer'].value_counts()
    return (counts>1)[row['Composer']]

# This is the frame containing only composers that appear more than once.
# duplicated(keep=False) flags every row whose composer occurs at least twice,
# i.e. the rows selection() accepts, without recounting once per row.
multiple = final_df_ei.loc[final_df_ei['Composer'].duplicated(keep=False)]
multiple

Unnamed: 0,Composer,Work
0,The Outcast,Hommage à Herman Melville
1,The Outcast,Musique d’Olga Neuwirth
2,The Outcast,Livret de Barry Gifford et Olga Neuwirth
3,The Outcast,monologues pour Old Melville d’Anna Mitgutsch
4,Dieter AMMANN,BOOST
...,...,...
103,Alban BERG,Suite lyrique
104,Arnold SCHÖNBERG,Pierrot lunaire
110,Kaija SAARIAHO,Lichtbogen
111,Anton WEBERN,Cinq Pièces


In [21]:
# Index labels of the rows picked out by hand as messy.
# NOTE: the labels refer to final_df_ei as it is before this cell runs; the
# index is renumbered below, so running the cell again drops different rows.
messy_entries = [*range(4), *range(19, 25), 27, 41, 42, *range(84, 90)]

# Remove those rows and renumber the remaining ones from zero
final_df_ei = final_df_ei.drop(index=messy_entries).reset_index(drop=True)

In [22]:
# Display final_df_ei after the hand-picked rows were dropped and the index reset
final_df_ei

Unnamed: 0,Composer,Work
0,Dieter AMMANN,BOOST
1,Dieter AMMANN,glut
2,Pierre BOULEZ,Messagesquisse
3,Rebecca SAUNDERS,Wound
4,Matthias PINTSCHER,skull
...,...,...
92,Anton WEBERN,Cinq Pièces
93,Gérard GRISEY,Quatre Chants pour franchir le seuil
94,Hèctor PARRA,Orgia
95,Edgard VARÈSE,Arcana
