## Populate an RDF database

This notebook reports the main steps to download CSV files, process them and create an RDF dataset from them according to an ontology.

To measure execution time in Jupyter notebooks: <code>pip install ipython-autotime</code>

In [None]:
import pandas as pd
import os
import ast
import unicodedata
import hashlib
import re
from pathlib import Path
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
# parameters and URLs
# Repository root: two directory levels above the notebook's working directory.
path = str(Path(os.path.abspath(os.getcwd())).parent.parent.absolute())
print(path)
grammyUrl = path + '/csv/the_grammy_awards_mapped_uppercase.csv'
print(grammyUrl)
albumsUrl = path + '/csv/musicoset_metadata/albums.csv'
print(albumsUrl)
# Songs and artists each get their own metadata file instead of re-using albums.csv.
# NOTE(review): assumes the files are named songs.csv / artists.csv — confirm against the csv folder.
songsUrl = path + '/csv/musicoset_metadata/songs.csv'
print(songsUrl)
artistsUrl = path + '/csv/musicoset_metadata/artists.csv'
print(artistsUrl)
tracksUrl = path + '/csv/musicoset_metadata/tracks.csv'
print(tracksUrl)
songInChartUrl = path + '/csv/musicoset_popularity/song_chart.csv'
print(songInChartUrl)

# saving folder
savePath =  path + '/PopulateRDFdb/PopulateGrammyCategories/'

## Grammy & candidates/winners (Songs, Artists, Albums)

In [None]:
# Read the Grammy awards table. Pandas' default NaN markers are disabled and
# only the '_' placeholder is parsed as a missing value.
grammy = pd.read_csv(
    grammyUrl,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)

In [None]:
# Album metadata (tab-separated), indexed by album_id; '_' is the only NaN marker.
album = pd.read_csv(
    albumsUrl,
    sep='\t',
    index_col='album_id',
    keep_default_na=False,
    na_values=['_'],
)
album.info()

# Accumulator of (grammy_id, album_row, isWinner) tuples: each entry ties a
# specific Grammy to an album that won it or was nominated for it.
matched_pairs_grammy_album = []

In [None]:
# Print a summary of the Grammy dataframe (columns, dtypes, non-null counts)
grammy.info()

We need to install <code>RDFLib</code>

<code>pip3 install rdflib </code> [Documentation](https://rdflib.readthedocs.io/en/stable/gettingstarted.html)

In [21]:
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD, SKOS, RDFS


In [22]:
# Namespace of the melody ontology, which is not one of RDFLib's built-in namespaces
ME = Namespace("http://www.dei.unipd.it/~gdb/ontology/melody#")

# Graph that collects the Grammy triples
g = Graph()

# Bind each namespace to a prefix for more readable serialized output
for _prefix, _namespace in (("xsd", XSD), ("mel", ME), ("skos", SKOS), ("rdfs", RDFS)):
    g.bind(_prefix, _namespace)

In [None]:
def create_grammy_id(year, category):
    """Build a deterministic identifier for a (year, category) Grammy award.

    The identifier has the form ``<year>_<abbr>_<hash>`` where ``abbr`` is made
    of the first letters of (at most) the first three words of the cleaned
    category and ``hash`` is the first 8 hex digits of the SHA-256 of
    ``"<year>_<cleaned category>"``.

    Args:
        year: Award year; converted with ``str()``.
        category: Category title; a missing value (NaN/None) is treated as ''.

    Returns:
        The identifier string.
    """
    year_text = str(year)
    category_text = '' if pd.isna(category) else str(category)

    # Normalization: drop everything except word characters, whitespace and
    # hyphens, then lowercase and trim.
    normalized = re.sub(r'[^\w\s-]', '', category_text).lower().strip()

    # Truncated SHA-256 of the year/category pair
    digest = hashlib.sha256(f"{year_text}_{normalized}".encode()).hexdigest()

    # Initials of the first three words of the normalized category
    initials = ''.join(token[0] for token in normalized.split()[:3])

    return f"{year_text}_{initials}_{digest[:8]}"


def normalize_uri(name):
    """Turn a free-text name into an ASCII token usable inside a URI.

    Steps: strip accents/non-ASCII characters (NFKD + ASCII encode), replace
    spaces with '-', drop commas and apostrophes, then for '&' and '/' in turn
    uppercase the word character that follows the separator and replace the
    separator itself ('&' becomes 'n', '/' is removed).

    Args:
        name: The string to normalize.

    Returns:
        The normalized string.
    """
    ascii_name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    token = ascii_name.replace(" ", "-").replace(",", "").replace("'", "")

    # (pattern matching the separator + next word char, separator, replacement)
    separator_rules = (
        (r'&(\w)', "&", "n"),
        (r'/(\w)', "/", ""),
    )
    for pattern, separator, replacement in separator_rules:
        token = re.sub(pattern, lambda match: separator + match.group(1).upper(), token)
        token = token.replace(separator, replacement)

    return token

def string_to_bool(s):
    """Return ``s`` unchanged if it is a bool, otherwise whether it spells "true".

    The string comparison is case-insensitive; any other text yields False.
    """
    return s if isinstance(s, bool) else s.lower() == "true"

## Add Grammy individuals - Generate list containing albums that have won or been nominated for a Grammy

In [None]:
%%time 
#measure execution time


# Dictionary to track Grammy IDs for unique year-category combinations
grammy_id_lookup = {}

# Start from an empty list so that re-running this cell does not append duplicates
matched_pairs_grammy_album = []

# Loop-invariant album lookups, computed once instead of once per Grammy row.
# The 'artists' column holds a dict literal; its first value is used as the artist name.
# ast.literal_eval only parses Python literals, so CSV content is never executed as code.
album['artist_name'] = album['artists'].apply(lambda x: list(ast.literal_eval(x).values())[0])
album_title_lower = album['billboard'].str.lower()
album_artist_lower = album['artist_name'].str.lower()

# Iterate over the grammy dataframe
for index, row in grammy.iterrows():

    grammy_id = create_grammy_id(row['year'], row['category'])

    # The node URI is the ontology namespace followed by the grammy_id
    current_grammy = URIRef(ME[grammy_id])

    # Add the Grammy individual with its category and year.
    # NOTE(review): the category is stored as a plain literal, not as a link to a
    # SKOS concept URI — confirm this matches the ontology's hasCategory definition.
    g.add((current_grammy, RDF.type, ME.Grammy))
    g.add((current_grammy, ME['hasCategory'], Literal(normalize_uri(row['category']))))
    g.add((current_grammy, ME['year'], Literal(row['year'], datatype=XSD.gYear)))

    # Only categories whose title mentions "album" are matched against albums
    if "album" not in row['category'].lower():
        continue

    isWinner = string_to_bool(row['winner'])

    # Clean row['workers']: drop parenthesised content, then split on commas
    worker_names = re.sub(r'\([^)]*\)', '', row['workers'])
    worker_names = [name.strip() for name in worker_names.split(',')]
    worker_names_lower = [name.lower() for name in worker_names]

    nominee_title = row['nominee'].lower().rstrip('.,').strip()

    # Keep the albums whose title equals the nominee and whose artist is one of the
    # listed workers; the artist check disambiguates albums that share a title.
    matched_album = album[
        (album_title_lower == nominee_title)
        & album_artist_lower.isin(worker_names_lower)
    ]

    for album_row in matched_album.itertuples(index=True):
        print(grammy_id, album_row.Index, isWinner)
        matched_pairs_grammy_album.append((grammy_id, album_row, isWinner))


 




In [None]:
%%time
# Serialize the graph to Turtle and save it as grammy.ttl in the saving folder.
# The graph is serialized before the file is opened, so a serialization failure
# cannot leave a truncated grammy.ttl behind.
print("--- saving serialization ---")
turtle_data = g.serialize(format='turtle')
with open(savePath + 'grammy.ttl', 'w', encoding='utf-8') as file:
    file.write(turtle_data)


# Referential integrity
Note that in RDF we are in an open world situation. We cannot guarantee the referential integrity between the entities. 

## Album

Let us generate the RDF data for the albums, linking each album to its charted songs, its artists and its Grammy awards or nominations.

In [None]:
# Album metadata (tab-separated), indexed by album_id; '_' is the only NaN marker
albums = pd.read_csv(
    albumsUrl,
    sep='\t',
    index_col='album_id',
    keep_default_na=False,
    na_values=['_'],
)
albums.info()

In [None]:
# Track table (tab-separated), indexed by album_id so that all rows of one album
# share the same index value; '_' is the only NaN marker
tracks = pd.read_csv(
    tracksUrl,
    sep='\t',
    index_col='album_id',
    keep_default_na=False,
    na_values=['_'],
)
tracks.info()

People are modeled with the FOAF ontology. 
Refer to [FOAF Documentation](http://xmlns.com/foaf/spec/)

In [None]:
# Chart data (tab-separated), indexed by song_id; '_' is the only NaN marker
songInChart = pd.read_csv(
    songInChartUrl,
    sep='\t',
    index_col='song_id',
    keep_default_na=False,
    na_values=['_'],
)
songInChart.info()
# Display the song ids that appear in the chart data
songInChart.index

In [29]:
# Namespace of the melody ontology
ME = Namespace("http://www.dei.unipd.it/~gdb/ontology/melody#")

# Fresh graph for the album triples (replaces the Grammy graph held in `g`)
g = Graph()

# Bind each namespace to a prefix for more readable serialized output
for _prefix, _namespace in (("xsd", XSD), ("mel", ME), ("skos", SKOS), ("rdfs", RDFS)):
    g.bind(_prefix, _namespace)

# Link song to Album

In [None]:
%%time 
#measure execution time

# Lookups built once, so the per-album loop performs no dataframe scans.

# Song ids present in the chart data; set membership is an exact match (the
# previous substring/regex scan of the whole index ran once per track).
charted_song_ids = set(songInChart.index)

# album_id -> list of its song ids, in the row order of the tracks table
songs_by_album = tracks.groupby(level=0)['song_id'].apply(list).to_dict()

# album_id -> [(grammy_id, winner), ...] from the pairs collected in the Grammy section
grammys_by_album = {}
for grammy_id, album_row, winner in matched_pairs_grammy_album:
    grammys_by_album.setdefault(album_row.Index, []).append((grammy_id, winner))

# Iterate over the album dataframe
for index, row in albums.iterrows():

    album_song_ids = songs_by_album.get(index, [])
    if not album_song_ids:
        print(f"\nNo tracks found for album {index}")
        continue

    # Only albums with at least one charted song are added to the graph
    charted_songs = [song_id for song_id in album_song_ids if song_id in charted_song_ids]
    if not charted_songs:
        # Reported once per album, and only when none of its tracks is charted
        print(f"\nAlbum {index} does not contain songs from Top 100")
        continue

    # Album-level triples are added once per album rather than once per charted track
    current_album = URIRef(ME[index])
    g.add((current_album, RDF.type, ME.Album))
    g.add((current_album, ME['name'], Literal(row['name'], datatype=XSD.string)))

    # xsd:positiveInteger only admits values >= 1, so the triple is omitted when
    # the count is missing or not a positive integer instead of emitting an
    # invalid "0" literal. is_integer also accepts numpy integer scalars.
    total_tracks = row['total_tracks']
    if pd.api.types.is_integer(total_tracks) and total_tracks >= 1:
        g.add((current_album, ME['totalTracks'], Literal(int(total_tracks), datatype=XSD.positiveInteger)))

    # Link the album to each of its charted songs
    for song_id in charted_songs:
        g.add((current_album, ME['containsSong'], URIRef(ME[song_id])))
        print(f"\nAlbum {index} contains song {song_id}\nSo we add Album {row['name']} ")

    # Link the album to its artists. The 'artists' column holds a dict literal keyed
    # by artist id; ast.literal_eval parses it without executing CSV content as code.
    for artist_id in ast.literal_eval(row['artists']):
        g.add((current_album, ME['isReleasedBy'], URIRef(ME[artist_id])))
        print(f"Artist ID: {artist_id}")

    # Link the album to the Grammys it won or was nominated for
    for grammy_id, winner in grammys_by_album.get(index, []):
        current_grammy = URIRef(ME[grammy_id])
        if winner:
            g.add((current_album, ME['winner'], current_grammy))
            print(f"{index} won {grammy_id}")
        else:
            g.add((current_album, ME['candidated'], current_grammy))
            print(f"{index} lost {grammy_id}")

 





    
    
    

In [None]:
%%time
# Serialize the graph to Turtle and save it as albums.ttl in the saving folder.
# The graph is serialized before the file is opened, so a serialization failure
# cannot leave a truncated albums.ttl behind.
print("--- saving serialization ---")
turtle_data = g.serialize(format='turtle')
with open(savePath + 'albums.ttl', 'w', encoding='utf-8') as file:
    file.write(turtle_data)