# URL GENERATION

### IMPORTS 

In [None]:
import pandas as pd
import re
import numpy as np
import json

### READING IN THE DATA

In [180]:
# Load the raw spreadsheet into a DataFrame.
data = pd.read_excel(io='fede_data.xlsx')

In [181]:
# Discard the rows whose year cell holds the text "rik", then keep only
# the rows whose year is 2000 or later.
data = (
    data
    .loc[lambda frame: frame["year"] != "rik"]
    .loc[lambda frame: frame["year"].astype(int) >= 2000]
)
data

Unnamed: 0,year,category,nominee,workers,winner,url
1,2019,"Best Latin Rock, Urban or Alternative Album",Aztlán,"Zoé (artist/producer), Phil Vinall (producer),...",True,
3,2010,Best Musical Album for Children,Family Time,"Ziggy Marley (producer/artist), Don Was (produ...",True,
5,2007,Best Reggae Album,Love Is My Religion,"Ziggy Marley (artist), Marc Moreau (engineer/m...",True,
7,2019,Best Reggae Album,Rebellion Rises,Ziggy Marley (artist),False,
8,2017,Best Reggae Album,Ziggy Marley,Ziggy Marley,True,
...,...,...,...,...,...,...
6318,2019,"Producer of the Year, Classical",Dirk Sobotka (producer),,False,
6319,2019,"Producer of the Year, Classical",Judith Sherman (producer),,False,
6320,2019,"Producer of the Year, Non-Classical",Pharrell Williams,,True,
6321,2019,"Producer of the Year, Non-Classical",Kanye West (producer),,False,


### DATA CLEANING

In [182]:
# Mark rows that have no URL with the sentinel 0 (the URL-generation cell tests `url == 0`).
# `assign` builds a new frame instead of writing a column into the filtered slice produced
# by the previous cell, which avoids pandas' SettingWithCopyWarning.
data = data.assign(url=data["url"].fillna(0))

In [184]:
# Keep only the categories whose name contains "Album", then remove the ones
# that mention "Classical"; the index is renumbered after each filter.
data = (
    data
    .loc[lambda frame: [re.match(r".*Album.*", label) is not None for label in frame["category"]]]
    .reset_index(drop=True)
    .loc[lambda frame: ~frame["category"].str.contains("Classical")]
    .reset_index(drop=True)
)
# Store the workers column as strings so the regex-based cleaning can run on every row.
data = data.assign(workers=data["workers"].astype(str))
data.head(10)

Unnamed: 0,year,category,nominee,workers,winner,url
0,2019,"Best Latin Rock, Urban or Alternative Album",Aztlán,"Zoé (artist/producer), Phil Vinall (producer),...",True,0
1,2010,Best Musical Album for Children,Family Time,"Ziggy Marley (producer/artist), Don Was (produ...",True,0
2,2007,Best Reggae Album,Love Is My Religion,"Ziggy Marley (artist), Marc Moreau (engineer/m...",True,0
3,2019,Best Reggae Album,Rebellion Rises,Ziggy Marley (artist),False,0
4,2017,Best Reggae Album,Ziggy Marley,Ziggy Marley,True,0
5,2007,Best Electronic/Dance Album,The Garden,Zero 7,False,0
6,2018,Best Contemporary Christian Music Album,Chain Breaker,Zach Williams (artist),True,0
7,2019,Best Contemporary Christian Music Album,Survivor: Live from Harding Prison,Zach Williams,False,0
8,2005,Best Compilation Soundtrack Album for a Motion...,Garden State,Zach Braff (compilation producer),True,0
9,2010,Best Country Album,The Foundation.,Zac Brown Band,False,0


In [185]:
# Display the (rows, columns) size of the filtered table.
data.shape

(1500, 6)

In [187]:
# Flag the "strange cases": workers strings that contain a comma but no closing
# parenthesis. The URLs of these rows are then generated by hand.
# Afterwards order the table by year and renumber the index from 0.
data = (
    data
    .assign(
        strange_cases=lambda frame: frame["workers"].map(
            lambda text: bool(re.search(r",", text)) and not re.search(r"\)", text)
        )
    )
    .sort_values("year")
    .reset_index(drop=True)
)

Unnamed: 0,year,category,nominee,workers,winner,url,strange_cases
0,2019,"Best Latin Rock, Urban or Alternative Album",Aztlán,"Zoé (artist/producer), Phil Vinall (producer),...",True,0,False
1,2010,Best Musical Album for Children,Family Time,"Ziggy Marley (producer/artist), Don Was (produ...",True,0,False
2,2007,Best Reggae Album,Love Is My Religion,"Ziggy Marley (artist), Marc Moreau (engineer/m...",True,0,False
3,2019,Best Reggae Album,Rebellion Rises,Ziggy Marley (artist),False,0,False
4,2017,Best Reggae Album,Ziggy Marley,Ziggy Marley,True,0,False
...,...,...,...,...,...,...,...
1495,2012,Best Spoken Word Album,Fab Fan Memories - The Beatles Bond (Various).,,False,0,False
1496,2013,Best Pop Instrumental Album,Rumbadoodle (Artist: Arun Shenoy),,False,0,False
1497,2013,Best Spoken Word Album,American Grown (Michelle Obama).,,False,0,False
1498,2015,Best Large Jazz Ensemble Album,The L.A. Treasures Project.,,False,0,False


Function that performs some basic cleaning of the artist name and of the album name.

In [189]:
# Ordered (pattern, replacement) rules applied by clean() before lower-casing.
_CLEAN_SUBSTITUTIONS = (
    (r"^\s+|\s+$", ""),  # strip ALL leading/trailing whitespace, not just one character
    (r"\s", "-"),  # remaining whitespace becomes dashes
    (r"\.", ""),  # remove dots
    (r"&", "and"),  # replace & with "and"
    (r"[áàâäāãåăąÀÁÂÄĀÃÅĂĄ]", "a"),  # replace accented letters
    (r"[éèêëēėęĚĒÈÉÊËĖĘ]", "e"),
    (r"[íìîïīįīĨÌÍÎÏĪĮ]", "i"),
    (r"[óòôöōõøőÓÒÔÖŌÕØŐ]", "o"),
    (r"[úùûüūųÚÙÛÜŪŲ]", "u"),
    (r"[çĉćċčÇĈĆĊČ]", "c"),
    (r"[ñńňņŉŋÑŃŇŅŉŊ]", "n"),
)

# Slugs that must be rewritten by hand; matched after lower-casing and
# before the final punctuation removal.
_SPECIAL_SLUGS = {
    "ti": "t_i",
    "nati-cano's-mariachi-los-camperos": "mariachi-los-camperos-de-nati-cano",
    "george-carlin-it's-bad-for-ya!": "george-carlin",
    "ferdinand-'jelly-roll'-morton": "jelly-roll-morton",
}


def clean(s: str) -> str:
    """
    Turn an artist or album name into the slug used inside the generated URL.

    Parameters: s --> the name to clean (a string)
    Output: the name with surrounding whitespace removed, inner whitespace replaced
            by dashes, dots removed, "&" spelled out as "and", the listed accented
            letters replaced by their plain letter, everything lower-cased, the
            hand-written special cases applied, and the characters ' ! : removed.
    """
    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        s = re.sub(pattern, replacement, s)
    s = s.lower()

    # Hand-written exceptions; names that are not listed pass through unchanged.
    s = _SPECIAL_SLUGS.get(s, s)

    s = re.sub(r"['!:]", "", s)  # remove extra punctuation

    # This one slug keeps its apostrophe, so it is restored after the removal above.
    if s == "kalani-pea":
        s = "kalani-pe'a"

    return s

In [190]:
def clean_total(l):

    """
    Parameters: l --> iterable of credit strings (artist names, optionally followed by
                      role tags in parentheses) to be cleaned
    Output: a list with one cleaned name per entry. If the entry contains a
            parenthesised tag mentioning "artist", the name in front of that tag is
            used; otherwise, if it contains any "(", the text before the first "(" is
            used; otherwise the whole entry is used. The chosen text is always passed
            through the clean() function.
    """

    new_artist = []

    for entry in l:
        # Convert once so every branch works on a string; previously only the first
        # test used str(), so a non-string entry raised TypeError in the later ones.
        text = str(entry)

        if re.search(r".*\(.*artist.*\).*", text):
            # Everything in front of the "(...artist...)" tag.
            name = " ".join(re.findall(r".*(?=\s\(.*artist.*\))", text))
            if re.search(r"\),", name):
                # The artist is not the first credit in the string:
                # keep only what follows the last comma.
                name = " ".join(re.findall(r"(?<=,)[^,]+$", name))
        elif re.search(r"\(", text):
            # No artist tag: take the text up to the first "(" and drop the "(" itself.
            name = " ".join(re.findall(r"^.*?(?<=\()", text))
            name = re.sub(r"\(", "", name)
        else:
            name = text

        new_artist.append(clean(name))

    return new_artist

['duke-ellington']

Cleaning the artist name (`workers` column) and the album name (`nominee` column)

In [192]:
# Add the two slug columns: one built from the workers column and one built
# from the nominee column.
data = data.assign(
    workers_clean=clean_total(data["workers"]),
    nominee_semiclean=data["nominee"].astype(str).map(clean).tolist(),
)


### URL generation and additional cleaning

In [198]:
# Build the final URL of every row: rows whose url is the sentinel 0 get a generated
# address made of the two cleaned slugs, all other rows keep the URL already stored.
# The columns are walked in parallel with zip instead of `data.col[i]`, because that
# label lookup is only correct while the index is exactly 0..n-1 (and is much slower).
data = data.assign(
    url_final=[
        f"https://rateyourmusic.com/release/album/{artist}/{album}/" if url == 0 else url
        for artist, album, url in zip(
            data["workers_clean"], data["nominee_semiclean"], data["url"]
        )
    ]
)
# Rows whose url was set to "delete" are removed.
data = data[data["url"] != "delete"]

In [205]:
# Among the rows that share the same (year, nominee) pair, remove every row whose
# winner flag is False.
# NOTE(review): when none of the rows of a duplicated pair is a winner, all of them are
# removed and the album no longer appears in the URL list - confirm this is intended.
data = data.drop(
    index=data.loc[
        data.duplicated(subset=["year", "nominee"], keep=False) & data["winner"].eq(False)
    ].index
)

Saving the files

In [207]:
# Write all final URLs to disk as one JSON array.
list_urls = data["url_final"].tolist()
with open("list_urls3.json", "w") as url_file:
    json.dump(list_urls, url_file)