In [3]:
import pandas as pd

## Read data

FMA repository is [here](https://github.com/mdeff/fma)

- Download the metadata file [here](https://os.unil.cloud.switch.ch/fma/fma_metadata.zip) and unzip to `data-FMA` folder
- `tracks.csv` contains album, name, genre, and tags
- only a portion have complete tag information

## Read FMA tracks subset

In [4]:
# Processed subset of the FMA tracks table
filePath = "data-processed/tracks-tags.csv"
# header=[0,1] reads the first two rows as a two-level (MultiIndex) column header;
# index_col=0 uses the first column (track_id) as the row index.
df = pd.read_csv(filePath, header=[0,1], index_col=0)
df.head()

Unnamed: 0_level_0,artist,album,track,track,track,track
Unnamed: 0_level_1,name,title,title,genres,genres_all,tags
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
137,Airway,Live at LACE,Side A,"[1, 32]","[32, 1, 38]",['lafms']
138,Airway,Live at LACE,Side B,"[1, 32]","[32, 1, 38]",['lafms']
850,Human Host,Exploding Demon,Tomb Of Science,[12],[12],['baltimore']
851,Human Host,Exploding Demon,Six Realms,[12],[12],['baltimore']
852,Human Host,Exploding Demon,Escape From the Organ Chamber,[12],[12],['baltimore']


## Note: this is a multiindex dataframe

Some examples of how to select columns:

In [5]:
df.columns # See what the column names are

MultiIndex([('artist',       'name'),
            ( 'album',      'title'),
            ( 'track',      'title'),
            ( 'track',     'genres'),
            ( 'track', 'genres_all'),
            ( 'track',       'tags')],
           )

The column names are [tuples](https://docs.python.org/3/tutorial/datastructures.html#tuples-and-sequences) (like lists that cannot be modified).

To access a column:

In [6]:
df[("album", "title")]

track_id
137                    Live at LACE
138                    Live at LACE
850                 Exploding Demon
851                 Exploding Demon
852                 Exploding Demon
                    ...            
155269                     Volatile
155275                     Dog Wave
155276                     Dog Wave
155277                     Dog Wave
155320    What I Tell Myself Vol. 2
Name: (album, title), Length: 23496, dtype: object

To access multiple columns:

In [7]:
df[[("album", "title"), ("track", "tags")]]

Unnamed: 0_level_0,album,track
Unnamed: 0_level_1,title,tags
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2
137,Live at LACE,['lafms']
138,Live at LACE,['lafms']
850,Exploding Demon,['baltimore']
851,Exploding Demon,['baltimore']
852,Exploding Demon,['baltimore']
...,...,...
155269,Volatile,"['dark ambient', 'dark', 'ambient', 'noise', '..."
155275,Dog Wave,"['noise', 'stretching is magic', 'free music',..."
155276,Dog Wave,"['noise', 'stretching is magic', 'free music',..."
155277,Dog Wave,"['noise', 'stretching is magic', 'free music',..."


## TODO: replace or map genre codes with names

ref: pandas [replace](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html#pandas.DataFrame.replace) or [map](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.map.html#pandas.DataFrame.map)

In [8]:
# Read in genres.csv: one row per genre, holding its numeric genre_id and its title
# (the preview below also shows the #tracks, parent and top_level columns)

df_genres = pd.read_csv("data-FMA/genres.csv")
df_genres.head()

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5


In [9]:
# Build the genre_id -> title lookup in one pass over the two columns.
# Iterating a Series yields plain Python scalars, so the keys are ordinary ints,
# and no per-row Series has to be constructed as with iterrows().
dict_map = dict(zip(df_genres["genre_id"], df_genres["title"]))

dict_map

{1: 'Avant-Garde',
 2: 'International',
 3: 'Blues',
 4: 'Jazz',
 5: 'Classical',
 6: 'Novelty',
 7: 'Comedy',
 8: 'Old-Time / Historic',
 9: 'Country',
 10: 'Pop',
 11: 'Disco',
 12: 'Rock',
 13: 'Easy Listening',
 14: 'Soul-RnB',
 15: 'Electronic',
 16: 'Sound Effects',
 17: 'Folk',
 18: 'Soundtrack',
 19: 'Funk',
 20: 'Spoken',
 21: 'Hip-Hop',
 22: 'Audio Collage',
 25: 'Punk',
 26: 'Post-Rock',
 27: 'Lo-Fi',
 30: 'Field Recordings',
 31: 'Metal',
 32: 'Noise',
 33: 'Psych-Folk',
 36: 'Krautrock',
 37: 'Jazz: Vocal',
 38: 'Experimental',
 41: 'Electroacoustic',
 42: 'Ambient Electronic',
 43: 'Radio Art',
 45: 'Loud-Rock',
 46: 'Latin America',
 47: 'Drone',
 49: 'Free-Folk',
 53: 'Noise-Rock',
 58: 'Psych-Rock',
 63: 'Bluegrass',
 64: 'Electro-Punk',
 65: 'Radio',
 66: 'Indie-Rock',
 70: 'Industrial',
 71: 'No Wave',
 74: 'Free-Jazz',
 76: 'Experimental Pop',
 77: 'French',
 79: 'Reggae - Dub',
 81: 'Afrobeat',
 83: 'Nerdcore',
 85: 'Garage',
 86: 'Indian',
 88: 'New Wave',
 89: 'P

## TODO: parse the cells in the track-tags column

ref: python [strings](https://docs.python.org/3/library/string.html#module-string) and the [re](https://docs.python.org/3/library/re.html#module-re) package

- practice re [here](https://regex101.com)

In [10]:
def myFunction(aString):
    """Split ``aString`` on runs of whitespace and return the pieces as a list."""
    pieces = aString.split()
    return pieces

x = "[1, 15, 40]"

myFunction(x)

['[1,', '15,', '40]']

Experiment with functions for replacing numbers with words:

In [11]:
def replace_numbers(input_string, replacement_dict):
    """Replace each decimal digit of ``input_string`` with its mapped word.

    The replacement works one character at a time, so a multi-digit number such
    as ``"123"`` becomes the concatenation of the words for 1, 2 and 3.

    Args:
        input_string: Text to scan.
        replacement_dict: Mapping from int digit values to replacement strings.
            Digits without an entry are left unchanged.

    Returns:
        A new string in which every mapped digit has been replaced.
    """
    pieces = []
    for char in input_string:
        # isdecimal() rather than isdigit(): characters such as '²' report
        # isdigit() == True but make int() raise ValueError.
        if char.isdecimal():
            pieces.append(replacement_dict.get(int(char), char))
        else:
            pieces.append(char)
    # Join once at the end instead of rebuilding the string for every character.
    return ''.join(pieces)

# Example usage:
replacement_dict = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five'}
input_string = "I have 123 apples and 45 oranges."

output_string = replace_numbers(input_string, replacement_dict)
print(output_string)

I have OneTwoThree apples and FourFive oranges.


Create a dataframe with only track id and genre information:

In [12]:
df[("track", "genres")]

track_id
137              [1, 32]
138              [1, 32]
850                 [12]
851                 [12]
852                 [12]
               ...      
155269    [42, 107, 183]
155275      [15, 32, 38]
155276      [15, 32, 38]
155277      [15, 32, 38]
155320     [10, 12, 169]
Name: (track, genres), Length: 23496, dtype: object

In [13]:
# Take an explicit copy of the two genre columns: later cells assign into
# df_track_genres, and writing to a bare selection of df triggers pandas'
# SettingWithCopyWarning and leaves it unclear whether df itself is affected.
df_track_genres = df[[
    ("track", "genres"),
    ("track", "genres_all")
]].copy()
df_track_genres.head()

Unnamed: 0_level_0,track,track
Unnamed: 0_level_1,genres,genres_all
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2
137,"[1, 32]","[32, 1, 38]"
138,"[1, 32]","[32, 1, 38]"
850,[12],[12]
851,[12],[12]
852,[12],[12]


Change the data type from object to string so the column can be used with string functions:

In [14]:
df_track_genres.dtypes

track  genres        object
       genres_all    object
dtype: object

In [15]:
# Cast the genres column to the pandas "string" dtype. Selecting with a list of
# column keys gives a one-column DataFrame, which is assigned back onto the column.
df_track_genres[("track", "genres")] = df_track_genres.loc[:,[("track", "genres")]].astype("string")

In [16]:
# Same cast as the previous cell, written with .loc and a single column key (a Series).
# NOTE(review): this repeats the conversion already done in the previous cell —
# one of the two cells is probably redundant.
df_track_genres.loc[:, ("track", "genres")] = df_track_genres[("track", "genres")].astype("string")

In [17]:
df_track_genres.dtypes

track  genres        string[python]
       genres_all            object
dtype: object

Create a function that parses the number string in the genre column into individual genre codes and maps each code to its genre name:

In [18]:
import ast

def split_numbers(input_string, mapping=None):
    """Parse a string holding a list of genre ids and return the genre names.

    Args:
        input_string: Text of a Python list or tuple literal of genre ids,
            e.g. ``"[1, 32]"`` or ``"(1, 5, 9)"``.
        mapping: Optional dict from int genre id to genre name. When omitted,
            the notebook-level ``dict_map`` is used.

    Returns:
        A list with the genre name for each id, in the same order as the input.

    Raises:
        KeyError: If an id has no entry in the mapping.
        ValueError, SyntaxError: If ``input_string`` is not a valid literal.
    """
    if mapping is None:
        mapping = dict_map

    # ast.literal_eval only evaluates Python literals, so it is safe to use on
    # text read from a file (unlike eval).
    genre_ids = ast.literal_eval(input_string)

    # int() keeps ids written as numeric strings (e.g. '12') working; ids that
    # are already ints pass through unchanged.
    return [mapping[int(genre_id)] for genre_id in genre_ids]

# Input string
input_string = "(1, 5, 9)"

# Call the split_numbers function
result = split_numbers(input_string)

# Print the result
print(result)


['Avant-Garde', 'Classical', 'Country']


In [19]:
# Small experiment: map a column of genre codes through dict_map
data = {'column': ['9', '10', '11']}
df_test = pd.DataFrame(data)
df_test

# The codes here are strings ('9') while dict_map is keyed by ints (9), so
# Series.map finds no matching key and every value comes back missing (NaN).
# NOTE(review): casting first, e.g. .astype(int).map(dict_map), should give the names.
df_test["column"] = df_test["column"].map(dict_map)

df_test

Unnamed: 0,column
0,
1,
2,


Apply the `split_numbers` function to the genre column of the track id and genre dataframe:

In [20]:
df_track_genres[("track", "genres")].map(split_numbers)

track_id
137                        [Avant-Garde, Noise]
138                        [Avant-Garde, Noise]
850                                      [Rock]
851                                      [Rock]
852                                      [Rock]
                          ...                  
155269    [Ambient Electronic, Ambient, Glitch]
155275        [Electronic, Noise, Experimental]
155276        [Electronic, Noise, Experimental]
155277        [Electronic, Noise, Experimental]
155320                  [Pop, Rock, Rockabilly]
Name: (track, genres), Length: 23496, dtype: object

Create new dataframe with genre_word and track_id columns:

In [55]:
# Series of genre-name lists, indexed by track_id
df_genre_words = df_track_genres[("track", "genres")].map(split_numbers)
df_genre_words  # not the last expression of the cell, so nothing is displayed for it
# NOTE(review): to_frame() returns a new DataFrame that is only displayed here;
# it is not assigned, so df_genre_words itself remains a Series.
df_genre_words.to_frame()

Unnamed: 0_level_0,track
Unnamed: 0_level_1,genres
track_id,Unnamed: 1_level_2
137,"[Avant-Garde, Noise]"
138,"[Avant-Garde, Noise]"
850,[Rock]
851,[Rock]
852,[Rock]
...,...
155269,"[Ambient Electronic, Ambient, Glitch]"
155275,"[Electronic, Noise, Experimental]"
155276,"[Electronic, Noise, Experimental]"
155277,"[Electronic, Noise, Experimental]"


Group by genre words:

In [30]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
df_genre_words.dtypes

dtype('O')

In [59]:
# NOTE(review): this cell fails on its first line. df_genre_words holds a list of
# genre names per track, while word_tokenize expects a single string — hence the
# TypeError shown below. The second line is never reached.
df_genre_words.apply(word_tokenize)
# NOTE(review): df_genre_words is a Series (the to_frame() result was not assigned),
# so the ("track", "genres") lookup would presumably fail as well — confirm.
df_genre_words['genrewords'] = df_genre_words[("track", "genres")].apply(word_tokenize)

TypeError: expected string or bytes-like object, got 'list'

In [58]:
# Tokenise the genre-code strings and store the tokens in a new column.
# A single apply() is enough: the result is only needed for the assignment, so
# the tokenizer does not have to run over the column twice.
# NOTE(review): the input is the text of a list such as "[1, 32]", so the
# brackets and commas come out as tokens too — see the output below.
df_track_genres['genrewords'] = df_track_genres[("track", "genres")].apply(word_tokenize)

df_track_genres

Unnamed: 0_level_0,track,track,genrewords
Unnamed: 0_level_1,genres,genres_all,Unnamed: 3_level_1
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
137,"[1, 32]","[32, 1, 38]","[[, 1, ,, 32, ]]"
138,"[1, 32]","[32, 1, 38]","[[, 1, ,, 32, ]]"
850,[12],[12],"[[, 12, ]]"
851,[12],[12],"[[, 12, ]]"
852,[12],[12],"[[, 12, ]]"
...,...,...,...
155269,"[42, 107, 183]","[42, 107, 15, 1235, 183]","[[, 42, ,, 107, ,, 183, ]]"
155275,"[15, 32, 38]","[32, 38, 15]","[[, 15, ,, 32, ,, 38, ]]"
155276,"[15, 32, 38]","[32, 38, 15]","[[, 15, ,, 32, ,, 38, ]]"
155277,"[15, 32, 38]","[32, 38, 15]","[[, 15, ,, 32, ,, 38, ]]"
