# In this file I add metadata to the lyrics dataset

In [None]:
import pandas as pd
import re
import numpy as np

### Read in the lyrics dataset and the metadata

In [None]:
# Lyrics dataset provided by the Wasabi Research Group.
songs_old = pd.read_csv("./csv-data/lyrics.csv", sep=",", engine="python", escapechar='\\', encoding='utf-8')

# Metadata from the public Wasabi GitHub page: https://github.com/micbuffa/WasabiDataset
# The albums file is read comma-separated, the songs file tab-separated; from the
# songs file only the three columns listed in `usecols` are loaded.
meta_genre = pd.read_csv('./csv-data/wasabi_albums.csv', sep=",", engine="python", encoding="utf-8")
meta_year = pd.read_csv("./csv-data/wasabi_songs.csv", sep="\t", engine="python", 
                        encoding='utf-8', usecols=['title', 'publicationDate', 'publicationDateAlbum'])

# Metadata from the Million Song Dataset
# Website: http://millionsongdataset.com/sites/default/files/AdditionalFiles/tracks_per_year.txt
# The file is read without a header row, so the column names are supplied here.
# Its delimiter "<SEP>" is longer than one character; pandas treats such
# separators as regular expressions, which only the Python parser supports.
# Naming the engine explicitly avoids the ParserWarning raised when pandas
# falls back from the C engine on its own.
df = pd.read_csv('./csv-data/data.txt', sep="<SEP>", engine="python", header=None,
                 names=["year", "id", "a_name", "a_song"])

## Genre annotations

### Merge columns via album titles to get genre

In [None]:
# Album metadata: keep only the merge key and the genre, with the key as a string.
# Here 'title' is the album title.
meta_sub = (
    meta_genre
    .loc[:, ['title', 'genre']]
    .assign(title=lambda frame: frame['title'].astype(str))
)

# Lyrics data: keep the needed columns and rename the album column to 'title'
# so that both frames share the same merge key, again stored as a string.
songs_sub = (
    songs_old
    .loc[:, ['a_name', 'a_song', 'a_album', 'a_lyrics']]
    .rename(columns={'a_album': 'title'})
    .assign(title=lambda frame: frame['title'].astype(str))
)

# Normalise a string for merging: lower-case it and delete special characters and whitespace.
def alter(x):
    """Return *x* in lower case with every character outside A-Z, a-z, 0-9 removed.

    The result is used as a merge key, so two titles that differ only in
    case, punctuation or spacing map to the same value.

    Args:
        x: The string to normalise. Must be a ``str``; ``re.sub`` raises
            ``TypeError`` for other types.

    Returns:
        The normalised string (possibly empty).
    """
    # Lower-casing was announced in the comment but missing from the code; it
    # also matches the definition of `alter` used for the publication-date merge.
    return re.sub(r'[^A-Za-z0-9]+', '', x).lower()

# Normalise the album titles on both sides so that they can serve as the merge key.
meta_sub['title'] = meta_sub['title'].apply(alter)
songs_sub['title'] = songs_sub['title'].apply(alter)
# Keep a single metadata row per normalised album title; with several rows per
# title the left merge below would duplicate the matching song rows.
meta_sub.drop_duplicates(subset='title', inplace=True)

# Left merge: every song row is kept, 'genre' is filled in where the album title matches.
songs = pd.merge(songs_sub, meta_sub, on='title', how='left')

# Preview the merged result. The original previewed `a`, a name that is never
# defined, which raised a NameError.
songs.head(10)

## Publication date annotations

### Add album publication dates via 'wasabi_songs.csv'
### Add song publication dates (years) via the Million Song Dataset

In [None]:
# Delete a leading row-number column, but only if one is actually present.
# pandas labels such a column 'Unnamed: <n>' when a CSV that was saved with its
# index is read back. Dropping column 0 unconditionally would remove 'a_name'
# from the frame produced by the genre merge, and the artist name is needed
# further down as a merge key.
if str(songs.columns[0]).startswith('Unnamed'):
    songs.drop(songs.columns[[0]], axis=1, inplace=True)


# Make sure both song-title columns hold strings before they are normalised.
meta_year['title'] = meta_year['title'].astype(str) # here title refers to song title
songs['a_song'] = songs['a_song'].astype(str)

# Normalise a string for merging: delete special characters and whitespace, then lower-case it.
def alter(x):
    """Return *x* lower-cased with all characters outside A-Z, a-z, 0-9 removed.

    Args:
        x: The string to normalise (must be a ``str``).

    Returns:
        The normalised string, which may be empty.
    """
    # Splitting on the unwanted runs and gluing the pieces back together
    # leaves exactly the alphanumeric characters, in their original order.
    kept_parts = re.split('[^A-Za-z0-9]+', x)
    return ''.join(kept_parts).lower()

# Normalise the song titles on both sides (special characters and whitespace
# removed, lower-cased) so that they can be used as the merge key.
songs['a_song'] = songs['a_song'].map(alter)
meta_year['title'] = meta_year['title'].map(alter)

# Give the metadata key the same name as in `songs`, then keep one row per song title.
meta_year.rename(columns={'title': 'a_song'}, inplace=True)
meta_year.drop_duplicates(subset='a_song', inplace=True)

# Left merge on the song title: adds the publication-date columns to every song row.
m = songs.merge(meta_year, how='left', on='a_song')

# Normalise song and artist names on both sides (special characters and
# whitespace removed, lower-cased) so that they can be used as merge keys.
# The values are cast to str first: `alter` relies on the `re` module, which
# raises a TypeError for non-string values such as NaN. A missing value
# therefore becomes the key 'nan' instead of aborting the cell.
m['a_song'] = m['a_song'].astype(str).apply(alter)
m['a_name'] = m['a_name'].astype(str).apply(alter)
df['a_song'] = df['a_song'].astype(str).apply(alter)
df['a_name'] = df['a_name'].astype(str).apply(alter)

# Left merge on song name + artist name to get the song publication year.
# Only the first row per (song, artist) key is taken from `df`, as is done for
# the other metadata tables; several rows per key would multiply the matching
# song rows in the result.
songs = pd.merge(m, df.drop_duplicates(subset=['a_song', 'a_name']),
                 on=['a_song', 'a_name'], how='left')

# Where no song publication year was found, fall back to the album publication
# date, so that (approximately) every song ends up with a 'year'.
songs.loc[songs['year'].isna(), 'year'] = songs['publicationDateAlbum']

# The album publication dates are no longer needed once the fallback is applied.
songs.drop(columns='publicationDateAlbum', inplace=True)

# One entry carries the placeholder '????'; treat it as missing.
songs.loc[songs['year'].eq("????"), 'year'] = np.nan

songs.head(5)