In [4]:
import requests
from bs4 import BeautifulSoup
from IPython.display import display, clear_output
import pandas as pd
import os
import re
import csv
import glob

In [31]:
# Dedication-scraping Funktion
def dedication_scraping(target_composer, timeout=30):
    """Scrape the "As Dedicatee" section of an IMSLP category page into a CSV.

    The page is downloaded, the ``<li>`` items inside the parent element of
    the "As Dedicatee" heading are parsed as ``<work> (<composer>)``, and one
    row per dedicating composer is written to
    ``imslp/dedications_<name>.csv`` with the columns ``source`` (composer,
    spaces replaced by underscores), ``target`` (name taken from the URL) and
    ``weight`` (number of parsed list items naming that composer).

    Parameters
    ----------
    target_composer : str
        URL containing ``Category:<name>``, e.g.
        ``https://imslp.org/wiki/Category:Brahms,_Johannes``.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout ``requests`` can
        block indefinitely on a stalled connection. Defaults to 30.

    Returns
    -------
    None
        The result is written to disk. Nothing is written when the page has
        no "As Dedicatee" heading.

    Raises
    ------
    IndexError
        If ``target_composer`` does not contain ``Category:``.
    """
    # Derive the target name and a file-name friendly variant from the URL
    url = target_composer
    target_name = target_composer.split("Category:")[1].strip()
    csv_name = target_name.replace(".", "").replace("'", "")
    directory = 'imslp'
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    # Download and parse the page
    response = requests.get(url, timeout=timeout)
    page = BeautifulSoup(response.text, 'html.parser')
    dedicatee_heading = page.find('h3', string=re.compile(r'As Dedicatee .*'))
    if dedicatee_heading is None:
        return

    # Serialise the heading's parent element and strip anchor tags,
    # keeping the link text itself
    section_html = ''.join(str(node) for node in dedicatee_heading.parent.contents)
    section_html = re.sub(r'<a.*?>', '', section_html)
    section_html = re.sub(r'</a>', '', section_html)
    list_items = re.findall(r'<li>(.*?)<\/li>', section_html)

    # Every item is expected to read "<work> (<composer>)"; only the composer
    # (the last parenthesised part, because the first group is greedy) is kept
    dedication_pattern = r'(.+)\s\((.+)\)'
    sources = []
    for item in list_items:
        match = re.match(dedication_pattern, item)
        if match:
            sources.append(match.group(2))

    # One row per composer: count the occurrences first, then collapse the
    # duplicates, then normalise spaces to underscores
    dedications_final = (
        pd.DataFrame({"source": sources})
        .assign(target=target_name)
        .assign(weight=lambda frame: frame.groupby("source")["source"].transform("count"))
        .drop_duplicates()
        .assign(source=lambda frame: frame["source"].replace(' ', '_', regex=True))
    )

    # Reuse `directory` so the created folder and the output path cannot diverge
    output_path = os.path.join(directory, "dedications_" + csv_name + ".csv")
    dedications_final.to_csv(output_path, index=False)

In [32]:
# Dedication-list Funktion
def dedication_listing(folder_path):
    """Merge all dedication CSV files of a folder into ``./dedications_list.csv``.

    Every file in ``folder_path`` whose name ends with ``.csv`` is read and the
    rows are stacked in alphabetical file-name order, so repeated runs produce
    the same row order. A ``link`` column pointing to the IMSLP category page
    of each ``source`` is added before the result is written.

    Parameters
    ----------
    folder_path : str
        Directory containing the per-composer CSV files.

    Returns
    -------
    None
        The merged table is written to ``./dedications_list.csv``. If the
        folder holds no usable CSV rows, a header-only file is written.
    """
    base_columns = ['source', 'target', 'weight', 'link']

    # Read every CSV once; os.listdir returns names in arbitrary order, so sort them
    frames = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue
        frame = pd.read_csv(os.path.join(folder_path, filename))
        # Header-only files contribute no rows and would only disturb dtype inference
        if not frame.empty:
            frames.append(frame)

    # Concatenate once instead of growing a DataFrame inside the loop
    if frames:
        dedications_list = pd.concat(frames, ignore_index=True)
    else:
        dedications_list = pd.DataFrame(columns=base_columns)

    # Keep the documented column order; any additional columns follow
    extra_columns = [col for col in dedications_list.columns if col not in base_columns]
    dedications_list = dedications_list.reindex(columns=base_columns + extra_columns)

    # Add the link column
    dedications_list['link'] = 'https://imslp.org/wiki/Category:' + dedications_list['source']
    dedications_list.to_csv("./dedications_list.csv", index=False)

In [36]:
# Zusatz-Scraping Funktion
def further_scraping(dedications):
    """Scrape the dedication page behind every distinct link of a merged list.

    Parameters
    ----------
    dedications : str
        Path to a CSV file with a ``link`` column.

    Returns
    -------
    None
        ``dedication_scraping`` is called once per distinct link, in order of
        first appearance.
    """
    unique_links = pd.read_csv(dedications)['link'].drop_duplicates()

    for composer_url in unique_links:
        dedication_scraping(composer_url)
    

In [37]:
# Nodes-List Funktion
def nodes(dedications):
    """Build the node list (``Id``, ``Label``) from a merged dedications CSV.

    Labels are the composer names with underscores replaced by spaces. Names
    from the ``source`` column come first, in order of first appearance; names
    that occur only in the ``target`` column are appended afterwards so that
    every endpoint of an edge has an Id. Ids are consecutive and start at 1.

    Parameters
    ----------
    dedications : str
        Path to a CSV file with a ``source`` column and, optionally, a
        ``target`` column.

    Returns
    -------
    None
        The node list is written to ``./nodes_list.csv``.
    """
    dedication_table = pd.read_csv(dedications)

    # Sources first keeps their Ids stable; target-only names follow
    name_columns = [dedication_table['source']]
    if 'target' in dedication_table.columns:
        name_columns.append(dedication_table['target'])

    labels = (
        pd.concat(name_columns, ignore_index=True)
        .str.replace('_', ' ', regex=False)
        .drop_duplicates()
    )

    nodes_list = pd.DataFrame({
        'Id': range(1, len(labels) + 1),
        'Label': labels.tolist(),
    })

    nodes_list.to_csv("./nodes_list.csv", index=False)

In [38]:
# Edges-List Funktion
def edges(nodes, dedications):
    """Build the edge list by replacing composer names with node Ids.

    Parameters
    ----------
    nodes : str
        Path to a CSV file with the columns ``Id`` and ``Label``.
    dedications : str
        Path to a CSV file with the columns ``source``, ``target`` and
        ``link``; any further columns (e.g. ``weight``) are carried over.

    Returns
    -------
    None
        The edge list (without the ``link`` column) is written to
        ``./edges_list.csv``. Names without a matching label are written as
        empty cells.
    """
    edges_list = pd.read_csv(dedications).drop('link', axis=1)
    nodes_list = pd.read_csv(nodes)

    # Build the label -> Id lookup once and reuse it for both endpoints
    id_by_label = nodes_list.set_index('Label')['Id']

    for endpoint in ('source', 'target'):
        edges_list[endpoint] = _labels_to_ids(edges_list[endpoint], id_by_label)

    edges_list.to_csv("./edges_list.csv", index=False)


def _labels_to_ids(names, id_by_label):
    """Map underscore-separated names to node Ids, keeping integer Ids integral."""
    ids = names.str.replace('_', ' ').map(id_by_label)
    # A single unmatched name turns the column into floats ("1.0"), which no
    # longer equals the Ids of the node list; the nullable integer dtype keeps
    # "1" and writes missing Ids as empty cells
    if pd.api.types.is_integer_dtype(id_by_label.dtype) and pd.api.types.is_float_dtype(ids.dtype):
        ids = ids.astype('Int64')
    return ids

In [30]:
# Next steps (uncomment the calls and run them in this order)

# Scrape the base dedications
    # dedication_scraping('https://imslp.org/wiki/Category:Brahms,_Johannes')
    # dedication_scraping('https://imslp.org/wiki/Category:Liszt,_Franz')

#- repeat x times -#

# Merge the dedications
    # dedication_listing('imslp')

# Scrape further dedications
    # further_scraping('dedications_list.csv')

# Merge the dedications
    # dedication_listing('imslp')

#- repeat x times -#

# Create the node and edge lists
    # nodes('dedications_list.csv')
    # edges('nodes_list.csv', 'dedications_list.csv')