In [40]:
import pandas as pd


In [41]:
# Load dataset
song_df = pd.read_csv('../dataset/songs_normalize.csv')


In [42]:
# Get list of all genres
genre_set = set()
for genres in song_df['genre'].values:
    genre_list = genres.split(',')
    for genre in genre_list:
        genre_set.add(genre.strip())
print(genre_set)

{'pop', 'country', 'metal', 'R&B', 'World/Traditional', 'Dance/Electronic', 'jazz', 'blues', 'set()', 'Folk/Acoustic', 'hip hop', 'latin', 'classical', 'easy listening', 'rock'}


In [43]:
# Create a copy of the set to add a prefix 'is' to columns
# but make sure to ignore set() as this means there is no
# genre associated with it
is_genre_set = set()
for genre in genre_set:
    if genre == 'set()':
        continue
    is_genre_set.add('is' + genre)

In [44]:
# Populate a dictionary with the genres as keys and a list of True/False as values
# depending on whether the song is of that genre or not
# Make sure to ignore 'set()'
genre_dict = dict.fromkeys(is_genre_set, 0)
for genre in genre_set:
    if genre == 'set()':
        continue
    is_genre = []
    for genres in song_df['genre'].values:
        if genre in genres:
            is_genre.append(True)
        else:
            is_genre.append(False)
    genre_dict['is' + genre] = is_genre

In [45]:
# Check output of genre_dict and convert to a data frame
for genre, is_genre in genre_dict.items():
    print(f'{genre}: {is_genre[:5]}')

genres_df = pd.DataFrame(genre_dict)
genres_df.head()

islatin: [False, False, False, False, False]
iseasy listening: [False, False, False, False, False]
iscountry: [False, False, True, False, False]
isrock: [False, True, False, True, False]
isblues: [False, False, False, False, False]
isjazz: [False, False, False, False, False]
ismetal: [False, False, False, True, False]
iship hop: [False, False, False, False, False]
isclassical: [False, False, False, False, False]
isWorld/Traditional: [False, False, False, False, False]
ispop: [True, True, True, False, True]
isDance/Electronic: [False, False, False, False, False]
isFolk/Acoustic: [False, False, False, False, False]
isR&B: [False, False, False, False, False]


Unnamed: 0,islatin,iseasy listening,iscountry,isrock,isblues,isjazz,ismetal,iship hop,isclassical,isWorld/Traditional,ispop,isDance/Electronic,isFolk/Acoustic,isR&B
0,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,True,False,False,False,False,False,False,True,False,False,False
2,False,False,True,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,True,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [46]:
# Append the data frame to the existing one
song_updated_df = song_df.copy()
song_updated_df = pd.concat([song_updated_df, genres_df], axis=1)
song_updated_df.head()


Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,...,isblues,isjazz,ismetal,iship hop,isclassical,isWorld/Traditional,ispop,isDance/Electronic,isFolk/Acoustic,isR&B
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,...,False,False,False,False,False,False,True,False,False,False
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,...,False,False,False,False,False,False,True,False,False,False
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,...,False,False,False,False,False,False,True,False,False,False
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,...,False,False,True,False,False,False,False,False,False,False
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,...,False,False,False,False,False,False,True,False,False,False


In [47]:
# Add a hasFeature column
# Cannot use just 'with' as this returns all song titles with 'with' in the actual
# song title. So instead use '(with' which seems to be more accurate
feature_keywords = {'feature', 'feat', 'ft', 'featuring', '(with', 'vs', 'vs.'}

has_feature = []
for song_title in song_df['song']:
    has_feature.append(any(keyword in song_title for keyword in feature_keywords))

In [48]:
# Append new data to existing data frame
song_updated_df['hasFeature'] = has_feature

In [49]:
# Write updated data frame to CSV
song_updated_df.to_csv('../dataset/songs_updated.csv', index=False)