## Populate an RDF database

This notebook reports the main steps to download CSV files, process them, and create an RDF dataset from them according to an ontology.

To measure execution time in Jupyter notebooks: <code>pip install ipython-autotime</code>

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS
from rdflib.namespace import XSD, SKOS
# libraries for matching strings 
from fuzzywuzzy import fuzz, process



In [2]:
# Namespace of the MELODY ontology
MEL = Namespace("http://www.dei.unipd.it/~gdb/ontology/melody#")
print(MEL)

# Base folder: two directory levels above the current working directory
workingDir = Path(os.path.abspath(os.getcwd()))
path = str(workingDir.parent.parent.absolute())
print(path)

# Folder that receives the files produced by this notebook
savePath = f"{path}/PopulateRDFdb/PopulateArtists/"

http://www.dei.unipd.it/~gdb/ontology/melody#
c:\Users\Manuel\Documents\università\magistrale\primo-semestre\graph-databases\MELODY


In [3]:
import unicodedata
import re
import hashlib

def normalize_uri_genres(name):
    """Turn a genre label into a compact ASCII token usable as a URI fragment.

    Accented characters are decomposed (NFKD) and everything that is not
    ASCII is dropped. Spaces, commas, apostrophes and plus signs are removed
    and ``&`` becomes ``n``. A label that reduces to ``-`` is mapped to
    ``NotAvailable``.
    """
    ascii_only = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    # Single pass: '&' -> 'n'; space, comma, apostrophe and '+' are deleted
    cleaned = ascii_only.translate(str.maketrans("&", "n", " ,'+"))
    # Particular case for the genre '-'
    return 'NotAvailable' if cleaned == '-' else cleaned

def normalize_uri_grammy(name):
    """Turn a Grammy label into a hyphenated ASCII token.

    Accents and other non-ASCII characters are removed, spaces become ``-``,
    commas and apostrophes are dropped. The word character right after an
    ``&`` is upper-cased before ``&`` is rewritten as ``n``; afterwards the
    word character right after a ``/`` is upper-cased and the ``/`` removed.
    """
    def _capitalise_follower(found):
        # Keep the delimiter itself, upper-case the character that follows it
        return found.group(0)[0] + found.group(1).upper()

    # Remove accents and special characters
    token = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    token = token.replace(" ", "-").replace(",", "").replace("'", "")

    # The '&' step must run before the '/' step: the 'n' it introduces can
    # itself be the character following a '/'
    token = re.sub(r'&(\w)', _capitalise_follower, token).replace("&", "n")
    token = re.sub(r'/(\w)', _capitalise_follower, token).replace("/", "")

    return token

def normalize_artist_name(name):
    """Normalize an artist name like a genre label, then drop ``*``, ``?`` and ``!``."""
    base = normalize_uri_genres(name)
    # Delete the three punctuation marks in a single pass
    return base.translate(str.maketrans('', '', '*?!'))

  
def create_grammy_id(year, category):
    """Build a deterministic identifier for a Grammy award.

    The identifier has the form ``<year>_<abbr>_<hash>``:

    * ``abbr`` joins the first character of each of the first three words of
      the cleaned category;
    * ``hash`` is the first 8 hex digits of the SHA-256 digest of
      ``"<year>_<cleaned category>"``.

    A category for which ``pd.notna`` is false is treated as an empty string.
    """
    year_text = str(year)
    category_text = '' if not pd.notna(category) else str(category)

    # Cleaning: remove special characters (anything that is not a word
    # character, whitespace or hyphen), convert to lowercase and trim
    cleaned = re.sub(r'[^\w\s-]', '', category_text).lower().strip()

    # Truncated SHA-256 of the concatenated data
    digest = hashlib.sha256(f"{year_text}_{cleaned}".encode()).hexdigest()

    # Initials of at most three leading words
    initials = ''.join(token[0] for token in cleaned.split()[:3])

    return f"{year_text}_{initials}_{digest[:8]}"

## Artists

In [4]:
# Load the artists CSV (tab-separated) in memory, indexed by artist_id
artistsURL = path + '/csv/musicoset_metadata/artists.csv'
artists = pd.read_csv(artistsURL, sep='\t', index_col='artist_id')
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() only appended a stray "None" to the output.
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11518 entries, 66CXWjxzNUsdJxJ2JdwvnR to 45d3pteh2TnzUMMl27J4MY
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         11518 non-null  object 
 1   followers    11516 non-null  float64
 2   popularity   11518 non-null  int64  
 3   artist_type  11518 non-null  object 
 4   main_genre   11518 non-null  object 
 5   genres       11518 non-null  object 
 6   image_url    11518 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 719.9+ KB
None


In [5]:
# Start from an empty graph
g = Graph()

# Register a prefix for each namespace for more readable output
for prefix, namespace in (("mel", MEL), ("xsd", XSD), ("skos", SKOS), ("rdfs", RDFS)):
    g.bind(prefix, namespace)

In [6]:
%%time

import math

# Popularity score boundaries used to bucket artists into Low / Medium / High
POPULARITY_HIGH_MIN = 67
POPULARITY_MEDIUM_MIN = 34

# artist_id -> artist name, filled while the graph is populated
artistNames = {}

for index, row in artists.iterrows():
    # Create artist node
    Artist = URIRef(MEL[index])
    g.add((Artist, RDF.type, MEL.Artist))

    # Set artist name
    g.add((Artist, MEL['name'], Literal(row['name'], datatype=XSD.string)))
    artistNames[index] = row['name']

    # Set artist followers; the triple is skipped when the value is NaN
    if not math.isnan(row['followers']):
        g.add((Artist, MEL['followers'], Literal(int(row['followers']), datatype=XSD.nonNegativeInteger)))

    # Set artist genre
    # No need to define the genre type because they are defined inside genres.ttl
    ArtistGenre = URIRef(MEL[normalize_uri_genres(row['main_genre'])])
    g.add((Artist, MEL['hasGenre'], ArtistGenre))

    # Set artist type ('-' marks a missing value in the CSV column)
    if row['artist_type'] != '-':
        g.add((Artist, MEL['artistType'], Literal(row['artist_type'].strip().replace('\'', ''), lang="en")))

    # Compute and set artist popularity
    if row['popularity'] >= POPULARITY_HIGH_MIN:
        Popularity = URIRef(MEL['High'])
    elif row['popularity'] >= POPULARITY_MEDIUM_MIN:
        Popularity = URIRef(MEL['Medium'])
    else:
        Popularity = URIRef(MEL['Low'])
    g.add((Artist, MEL['hasPopularity'], Popularity))

# Report a summary instead of dumping the whole mapping (one entry per artist)
print(len(artistNames), "artist names collected")

CPU times: total: 219 ms
Wall time: 1.46 s


In [7]:
# Preview a handful of entries only: the mapping holds one entry per artist and
# printing it in full floods the cell output.
print(list(artistNames.items())[:5])



In [8]:
%%time
# Serialize the graph in the Turtle format and write it to artists.ttl
print("--- saving serialization ---")
with open(savePath + 'artists.ttl', 'w', encoding='utf-8') as ttlFile:
    turtleText = g.serialize(format='turtle')
    ttlFile.write(turtleText)


--- saving serialization ---
CPU times: total: 406 ms
Wall time: 1.05 s


## Artists-Grammy

In [9]:
# Load csv file about grammy awards.
# keep_default_na=False keeps empty cells as empty strings; only '_' is parsed as missing.
grammyURL = path + '/csv/the_grammy_awards_mapped_uppercase.csv'
grammy = pd.read_csv(grammyURL, sep=',', keep_default_na=False, na_values=['_'])
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() only appended a stray "None" to the output.
grammy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      6323 non-null   int64 
 1   category  6323 non-null   object
 2   nominee   6323 non-null   object
 3   workers   6323 non-null   object
 4   winner    6323 non-null   bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 203.9+ KB
None


In [10]:
# Start again from an empty graph
g = Graph()

# Register a prefix for each namespace for more readable output
for prefix, namespace in (("mel", MEL), ("xsd", XSD), ("skos", SKOS), ("rdfs", RDFS)):
    g.bind(prefix, namespace)

Idea behind the matching between Artist and Grammy Award — a Grammy row is considered when its category:
- contains the keyword 'Artist'
- contains the keyword 'Producer'
- contains the keyword 'Performance', the nominee contains the name of the artist, and workers is empty

In [11]:
# There are some grammy about the artist in which the nominee is the song and the worker is the artist


def _strip_role_tags(text):
    """Remove the '(artist)' / '(producer)' role tags and surrounding whitespace."""
    return text.replace('(artist)', '').replace('(producer)', '').strip()


# candidate name -> [(artist_id, artist_name), ...] for every artist whose name
# reaches a partial-ratio score of 100 against the candidate.
# It has an explicit name so that the built-in `dict` is not shadowed.
candidateMatches = {}

for index, row in grammy.iterrows():
    category = row['category'].lower()
    workers = row['workers']

    # Pick the text that should name the artist, depending on the category keyword
    candidate = ''
    if 'artist' in category:
        candidate = _strip_role_tags(workers if workers != '' else row['nominee'])
    elif 'performance' in category:
        # For performances the nominee is used only when no workers are listed
        if workers == '':
            candidate = _strip_role_tags(row['nominee'])
    elif 'producer' in category:
        candidate = _strip_role_tags(workers if workers != '' else row['nominee'])

    if candidate == '':
        continue

    # The scan over all artists depends only on the candidate string, so it is
    # done once per distinct candidate instead of once per Grammy row.
    if candidate not in candidateMatches:
        candidateMatches[candidate] = [
            (artistKey, artistName)
            for artistKey, artistName in artistNames.items()
            if fuzz.partial_ratio(candidate, artistName) == 100
        ]
    matches = candidateMatches[candidate]

    matchedArtistNames = [artistName for _, artistName in matches]
    bestMatch = process.extractOne(candidate, matchedArtistNames, score_cutoff=1)

    # The candidate is the subject of the grammy only if the best match is
    # exactly the candidate name
    if bestMatch is not None and bestMatch[0] == candidate:
        Grammy = URIRef(MEL[create_grammy_id(row['year'], row['category'])])

        # Id of the first matched artist carrying the best-matching name
        bestMatchKey = matches[matchedArtistNames.index(bestMatch[0])][0]

        relation = MEL['winner'] if row['winner'] else MEL['candidated']
        g.add((URIRef(MEL[bestMatchKey]), relation, Grammy))

In [12]:
%%time
# Serialize the graph in the Turtle format and write it to artists-grammy.ttl
print("--- saving serialization ---")
with open(savePath + 'artists-grammy.ttl', 'w', encoding='utf-8') as ttlFile:
    turtleText = g.serialize(format='turtle')
    ttlFile.write(turtleText)

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 4.3 ms
