In [None]:
# Imports

import os
import datetime
from pathlib import Path
import pandas as pd
import json
import requests

# Setup

In [None]:
# Get absolute path of the current working directory
absPath = str(Path(os.path.abspath(os.getcwd())).absolute())
datasetsPath = os.path.join(absPath, "datasets")

# Create the dataset directory if it does not exist yet.
# `exist_ok=True` keeps the cell idempotent on re-run and avoids the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(datasetsPath, exist_ok=True)

# Setup datasets paths
artistsPath = os.path.join(datasetsPath, "artists.csv")  # read by the loading cell
peoplePath = os.path.join(datasetsPath, "people.csv")  # written by the retrieval loop


# Setup WikiData API

In [None]:
# Query to WikiData to get a list of search results. Ideally the first result should be the one we are looking for
def wdQuery(query, timeout=30):
    """Run a full-text search on WikiData and return the decoded JSON answer.

    Parameters
    ----------
    query : str
        Text to search for. Every " & " is replaced with " and " before
        the request is sent.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error instead of blocking forever.

    Returns
    -------
    dict or None
        The decoded JSON body when the server answers with HTTP 200,
        otherwise None.
    """
    # Pass the arguments through `params` so that requests URL-encodes them:
    # a search text containing "&", "#", "+" or "=" would otherwise corrupt
    # the query string.
    queryApiRequest = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": query.replace(" & ", " and "),
            "srlimit": 20,
        },
        timeout=timeout,
    )

    if queryApiRequest.status_code == 200:
        return json.loads(queryApiRequest.text)

    return None

# Query to WikiData to get the entity information from the ID retrieved from the query results
def wdGetEntity(entityID, timeout=30):
    """Fetch one WikiData entity by its ID.

    Parameters
    ----------
    entityID : str
        Identifier of the entity to fetch (sent as the `ids` argument).
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error instead of blocking forever.

    Returns
    -------
    dict or None
        The object stored under `["entities"][entityID]` in the response,
        or None when the status code is not 200 or the response does not
        contain that entity.
    """
    queryApiRequest = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "ids": entityID,
            "format": "json",
        },
        timeout=timeout,
    )

    if queryApiRequest.status_code != 200:
        return None

    # A 200 answer is not guaranteed to carry the requested entity (for
    # instance when the body only reports an error), so look it up
    # defensively instead of raising KeyError.
    responseBody = json.loads(queryApiRequest.text)
    return responseBody.get("entities", {}).get(entityID)

def wdGetProperty(entityID, propertyID, timeout=30):
    """Fetch the claims of a single property of a WikiData entity.

    Parameters
    ----------
    entityID : str
        Identifier of the entity whose claims are requested.
    propertyID : str
        Identifier of the property to read.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error instead of blocking forever.

    Returns
    -------
    list or None
        The value stored under `["claims"][propertyID]` in the response,
        or None when the status code is not 200, the body is not valid
        JSON, or the entity has no claim for that property.
    """
    queryApiRequest = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetclaims",
            "entity": entityID,
            "property": propertyID,
            "format": "json",
        },
        timeout=timeout,
    )

    if queryApiRequest.status_code != 200:
        return None

    try:
        return json.loads(queryApiRequest.text)["claims"][propertyID]
    except (KeyError, TypeError, ValueError):
        # Missing property / unexpected body shape / undecodable JSON.
        # Caught explicitly so KeyboardInterrupt is no longer swallowed.
        return None


# Data Retrieval

In [None]:
# Setup DataFrames

# NOTE(review): in the cells visible here both names are reassigned further
# down (peopleCols with a different column list) before their first use, so
# these initial values look stale — confirm and consider removing this cell.
peopleDF = None
peopleCols = ["name", "surname", "birthdate", "deathdate", "nationality", "spotify_artist"]

In [None]:
# Read the artists CSV into memory, using its "id" column as the index
artists = pd.read_csv(
    artistsPath,
    index_col="id",
    sep=",",
)

In [None]:
def getDateFromInformation(dateTime, dataFormat="%Y-%m-%dT%H:%M:%SZ"):
    """Re-format a timestamp string as a plain `YYYY-MM-DD` date string.

    `dateTime` is parsed with `datetime.strptime` using `dataFormat`; a
    string that does not match the format makes `strptime` raise
    ValueError, which is propagated to the caller.
    """
    parsedMoment = datetime.datetime.strptime(dateTime, dataFormat)
    return parsedMoment.strftime('%Y-%m-%d')


In [None]:
# Maps the readable names used in this notebook to WikiData identifiers
# ("P..." values are used as claim/property keys, "Q..." values are compared
# against entity ids).
# NOTE(review): "nativeName" and "musicalGroup" are not referenced by the
# cells visible here — confirm whether they are still needed.
wdMap = {
    "instanceOf": "P31",
    "name": "P735",
    "birthName": "P1477",
    "surname": "P734",
    "birthDate": "P569",
    "deathDate": "P570",
    "country": "P27",
    "countryISOCode": "P297",
    "hasPart": "P527",
    "human": "Q5",
    "nativeName": "P1705",
    "musicalGroup": "Q215380"
}

# Tells getArtistInformation how to turn the raw claim value of each key into
# the final value:
#   - "externalString": the value references another entity; its label is fetched
#   - "inValue":        the value is a dict; the "targetProperty" key is read from it
#   - "date":           the value's "time" string is parsed with "format"
#   - "country":        the referenced entity's countryISOCode property is fetched
wdDataType = {
    "name": {"type": "externalString"},
    "birthName": {"type": "inValue", "targetProperty": "text"},
    "surname": {"type": "externalString"},
    "birthDate": {"type": "date", "format": "+%Y-%m-%dT%H:%M:%SZ"},
    "deathDate": {"type": "date", "format": "+%Y-%m-%dT%H:%M:%SZ"},
    "country": {"type": "country"},
}

# wdMap keys extracted for an entity treated as a single person
humanKeys = ["name", "surname", "birthName", "birthDate", "deathDate", "country"]
# wdMap keys extracted for an entity treated as a group
groupKeys = ["hasPart"]


In [None]:
def getDataValue(dictObject, keyProperty=None, expectArray=False):
    """Extract the `mainsnak.datavalue.value` of a list of claim statements.

    Parameters
    ----------
    dictObject : dict or list or None
        Either a mapping that holds the statement list under `keyProperty`,
        or (when `keyProperty` is None) the statement list itself.
    keyProperty : hashable, optional
        Key to look up in `dictObject`; None means `dictObject` already is
        the statement list.
    expectArray : bool, optional
        When True return every extracted value, otherwise only the first.

    Returns
    -------
    object or list or None
        None when the statement list is None. With `expectArray=True` a
        (possibly empty) list of values. Otherwise the first value, or
        None when no statement carries a value.
    """
    statements = dictObject if keyProperty is None else dictObject[keyProperty]

    if statements is None:
        return None

    arrValues = []
    for statement in statements:
        # Statements whose main snak has no "datavalue" (e.g. "unknown value"
        # / "no value" snaks) are skipped instead of raising KeyError.
        dataValue = statement.get("mainsnak", {}).get("datavalue")
        if dataValue is None:
            continue
        arrValues.append(dataValue["value"])

    if expectArray:
        return arrValues

    # An empty list used to raise IndexError here; report "no value" instead.
    return arrValues[0] if arrValues else None

def getSubsetOfKeys(dictObject, keys):
    """Select entries of `dictObject` by readable key name.

    Each name in `keys` is translated to its identifier through the
    module-level `wdMap`; the result maps the readable name to the entry
    stored under that identifier, or to None when `dictObject` has no such
    entry.
    """
    return {
        readableKey: (
            dictObject[wdMap[readableKey]]
            if wdMap[readableKey] in dictObject.keys()
            else None
        )
        for readableKey in keys
    }

def getEntityTitle(entity):
    """Return a display label for a WikiData entity.

    The English label is preferred; when the entity has none, the first
    label listed under `entity["labels"]` is used.

    Parameters
    ----------
    entity : dict
        Entity object holding a `"labels"` mapping of
        language code -> `{"value": <label>}`.

    Returns
    -------
    str or None
        The chosen label, or None when the entity has no labels at all
        (this case used to raise IndexError).

    Raises
    ------
    KeyError
        If `entity` has no `"labels"` key.
    """
    labels = entity["labels"]

    if "en" in labels:
        return labels["en"]["value"]

    # No English label: fall back to whichever label comes first.
    for label in labels.values():
        return label["value"]

    return None


In [None]:
def getArtistInformation(entityClaims):
    """Extract the person-related fields listed in `humanKeys` from claims.

    For every key the first claim value is read with `getDataValue` and
    converted according to the key's entry in `wdDataType`:

    - "externalString": the referenced entity is fetched and its label used
    - "country": the `countryISOCode` property of the referenced entity is fetched
    - "inValue": the configured `targetProperty` is read from the value
    - "date": the value's "time" string is parsed with the configured format

    Parameters
    ----------
    entityClaims : dict
        Claims mapping of an entity (property id -> list of statements).

    Returns
    -------
    dict
        One entry per key of `humanKeys`; the value is None when the claim
        is absent or could not be resolved.
    """
    artistInformation = getSubsetOfKeys(entityClaims, humanKeys)
    artistExtractedInformation = {}

    for keyValue in artistInformation:
        currentInformation = getDataValue(artistInformation, keyValue)

        realValue = None
        if currentInformation is not None:
            wdDataInformation = wdDataType[keyValue]
            dataType = wdDataInformation["type"]

            if dataType == "externalString":
                # wdGetEntity returns None on a failed lookup; keep the
                # field empty in that case instead of crashing.
                linkedEntity = wdGetEntity(currentInformation["id"])
                if linkedEntity is not None:
                    realValue = getEntityTitle(linkedEntity)

            elif dataType == "country":
                realValue = getDataValue(wdGetProperty(
                    entityID=currentInformation["id"],
                    propertyID=wdMap["countryISOCode"]
                ))

            elif dataType == "inValue":
                realValue = currentInformation[wdDataInformation["targetProperty"]]

            elif dataType == "date":
                try:
                    realValue = getDateFromInformation(currentInformation["time"], wdDataInformation["format"])
                except (KeyError, TypeError, ValueError):
                    # Time strings that do not match the configured format
                    # are stored as None. Caught explicitly so that
                    # KeyboardInterrupt is no longer swallowed.
                    realValue = None

        artistExtractedInformation[keyValue] = realValue

    return artistExtractedInformation


In [None]:
# Placeholder; the actual (empty) people DataFrame is created in the next cell
peopleDF = None
# Columns of the people table; the names match the keys of the dict returned
# by generatePeopleObject
peopleCols = ["id", "name", "surname", "birthdate", "deathdate", "nationality", "artist", "complete_name", "entity_name"]


In [None]:
# Start from an empty people table that already has the final columns
peopleDF = pd.DataFrame(
    data=[],
    columns=peopleCols,
)

In [None]:
def generatePeopleObject(peopleInfo):
    """Build the row describing one person from the extracted information.

    When the given name and/or the surname is missing, it is derived from
    the complete name (the birth name, or the entity name when there is no
    birth name):

    - only the surname known: the name is the rest of the complete name
      (the part before the surname, or the part after it when the complete
      name starts with the surname);
    - only the name known: the surname is whatever follows the name;
    - neither known: the complete name is split at its first space.

    If the known part does not occur in the complete name, the missing
    part is left as None rather than guessed. Empty results are normalised
    to None.

    Parameters
    ----------
    peopleInfo : dict
        Must provide the keys "birthName", "entityName", "name", "surname",
        "birthDate", "deathDate", "country", "artistID" and "wdID".

    Returns
    -------
    dict
        Keys, in order: "id", "name", "surname", "birthdate", "deathdate",
        "nationality", "artist", "complete_name", "entity_name".
    """
    birthName = peopleInfo["birthName"]
    completeName = birthName if birthName is not None else peopleInfo["entityName"]
    name = peopleInfo["name"]
    surname = peopleInfo["surname"]

    if completeName is not None:
        if name is None and surname is not None:
            surnamePosition = completeName.find(surname)
            if surnamePosition > 0:
                # Drop the separator right before the surname as well.
                name = completeName[:surnamePosition - 1].strip()
            elif surnamePosition == 0:
                # Surname comes first: the name is what follows it.
                name = completeName[len(surname):].strip()
            # surnamePosition == -1: the surname is not part of the complete
            # name; slicing with it would return a truncated, wrong name.
        elif name is not None and surname is None:
            namePosition = completeName.find(name)
            if namePosition >= 0:
                surname = completeName[namePosition + len(name):].strip()
        elif name is None and surname is None:
            firstPart, separator, remainder = completeName.partition(" ")
            if separator:
                name = firstPart.strip()
                surname = remainder.strip()
            else:
                name = completeName

    # A derived part may come out empty (e.g. the complete name equals the
    # known part); store it as missing, like the single-word case does.
    if name == "":
        name = None
    if surname == "":
        surname = None

    # Get and setup the artist information needed
    peopleObject = {
        "id": peopleInfo["wdID"],
        "name": name,
        "surname": surname,
        "birthdate": peopleInfo["birthDate"],
        "deathdate": peopleInfo["deathDate"],
        "nationality": peopleInfo["country"],
        "artist": peopleInfo["artistID"],
        "complete_name": peopleInfo["birthName"],
        "entity_name": peopleInfo["entityName"],
    }

    return peopleObject


In [None]:
# Iterate over the artists DataFrame: search every artist on WikiData, pick
# the most plausible search result and collect one row per person (the artist
# itself, or every member when the result is a group).
index = 0

# Snippet keywords that suggest a result describes a musical artist, listed
# in priority order (a lower position is a stronger match).
keyWords = ["dj", "rapper", "singer", "songwriter", "duo",
            "trio", "group", "band", "orchestra", "producer",
            "music", "artist"]
# Snippet keywords that disqualify a result even when a keyword above matches.
bannedKeyWords = ["album", "discography", "single", "song "]

for artistID, row in artists.iterrows():
    artistName = row["name"]

    print("\n\n[[ " + artistName + " ]]")

    wdQueryResults = wdQuery(artistName)

    if wdQueryResults is not None and "query" in wdQueryResults:
        wdQueryResults = wdQueryResults["query"]["search"]
    else:
        print("[NO RESULTS]")
        continue

    if len(wdQueryResults) <= 0:
        print("[NO RESULTS]")
        continue

    artistResults = []
    print("Search Results")
    for qPos, queryResult in enumerate(wdQueryResults):
        querySnippet = queryResult["snippet"].lower()

        # Position of the highest-priority keyword found in the snippet
        matchIndex = next(
            (i for i, keyWord in enumerate(keyWords) if keyWord in querySnippet),
            None)
        isBanned = any(keyWord in querySnippet for keyWord in bannedKeyWords)
        hasMatch = matchIndex is not None and not isBanned

        hasMatchStr = "--> " if hasMatch else "- "
        print(hasMatchStr + querySnippet)

        if hasMatch:
            artistResults.append(
                {"res": queryResult, "matchIndex": matchIndex, "queryPosition": qPos})

    # Stable sort: among equally strong matches the search order is kept
    artistResults = sorted(artistResults,
                           key=lambda artRes: artRes["matchIndex"])

    # Without any keyword match fall back to the first search result
    if len(artistResults) <= 0:
        artistResult = wdQueryResults[0]
    else:
        artistResult = artistResults[0]["res"]

    print("\n# FINAL CHOOSE: " + artistResult["snippet"])

    entityID = artistResult["title"]

    entityObject = wdGetEntity(entityID)
    if entityObject is None or "claims" not in entityObject:
        # The lookup failed; skip this artist instead of crashing the run.
        print("[NO ENTITY]")
        continue

    entityClaims = entityObject["claims"]
    entityName = getEntityTitle(entityObject)

    artistMembers = []

    # An entity may have several "instance of" values: treat it as a person
    # when any of them is "human", not only the first one.
    instanceOfValues = []
    if wdMap["instanceOf"] in entityClaims:
        instanceOfValues = getDataValue(
            entityClaims, wdMap["instanceOf"], expectArray=True) or []
    isHuman = any(
        isinstance(instanceOfValue, dict) and instanceOfValue.get("id") == wdMap["human"]
        for instanceOfValue in instanceOfValues)

    if isHuman:
        print("\n[ single artist ]")
        artistInformation = getArtistInformation(entityClaims)
        artistInformation["artistID"] = artistID
        artistInformation["artistName"] = artistName
        artistInformation["entityName"] = entityName
        artistInformation["wdID"] = entityID

        artistMembers.append(artistInformation)

    elif wdMap["hasPart"] in entityClaims:
        print("\n[ members ]")
        groupInformation = getSubsetOfKeys(entityClaims, groupKeys)
        groupMembers = getDataValue(groupInformation, "hasPart", expectArray=True)

        for groupMember in groupMembers:
            memberEntity = wdGetEntity(groupMember["id"])
            if memberEntity is None or "claims" not in memberEntity:
                # Skip members whose entity could not be retrieved
                continue

            memberEntityClaims = memberEntity["claims"]
            memberEntityName = getEntityTitle(memberEntity)

            artistInformation = getArtistInformation(memberEntityClaims)
            artistInformation["entityName"] = memberEntityName
            artistInformation["artistID"] = artistID
            artistInformation["wdID"] = groupMember["id"]

            artistMembers.append(artistInformation)

    peopleObjList = []
    for peopleInfo in artistMembers:
        peopleObject = generatePeopleObject(peopleInfo)

        # str()/default=str keep the log line from failing on a missing label
        # or on ids that are not JSON-native (e.g. numpy integers).
        print(str(peopleObject["entity_name"]) + ": " + json.dumps(peopleObject, default=str))

        peopleObjList.append(peopleObject)

    # Create rows DataFrame for the people; columns are selected by name, so
    # the result does not depend on the key order of the row dicts.
    peopleObjDF = pd.DataFrame(peopleObjList, columns=peopleCols)

    # Add the people info to the DataFrame (nothing to add when no person
    # was found for this artist)
    if len(peopleObjList) > 0:
        peopleDF = pd.concat([peopleDF, peopleObjDF], ignore_index=True)

    # Print stats every 5000 processed artists.
    # DataFrame.info() prints by itself and returns None, so it is not
    # wrapped in print().
    if index % 5000 == 0:
        print("\n\n👨 [STATUS INFO #{row}]".format(row=index))
        peopleDF.info()

    # Save DataFrame to file every 100 processed artists
    if index % 100 == 0:
        print("\n\n💾 [STATUS INFO #{row}] Dataset saved\n".format(row=index))
        peopleDF.to_csv(peoplePath)

    # Only artists that reached this point are counted: the `continue`
    # branches above skip the increment.
    index += 1

# Print info about the DataFrames
peopleDF.info()


In [None]:
# Write the final people table to its CSV file
peopleDF.to_csv(path_or_buf=peoplePath)
