<a href="https://colab.research.google.com/github/menduv/mtg-Jamendo/blob/main/MTG_importing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/MTG/mtg-jamendo-dataset.git

#Reading genre dictionary


In [1]:
import pandas as pd

# Specify the file path
file_path = '/content/mtg-jamendo-dataset/data/autotagging_genre.tsv'

# Initialize an empty list to store the data
data_list = []

# Open the file and read it line by line
with open(file_path, 'r') as file:
    for line in file:
        # Split the line based on tabs
        parts = line.strip().split('\t')

        # Combine the values in the last part if there are more than 6 parts
        if len(parts) > 6:
            parts[5] = ' '.join(parts[5:])
            del parts[6:]

        # Append the line to the data list
        data_list.append(parts)

# Create a DataFrame from the list
genre_df = pd.DataFrame(data_list)

genre_df.columns = genre_df.iloc[0]
genre_df = genre_df[1:]

# Display the DataFrame
# print(genre_df)


## Manually checking if the head, tail and multi tag line items have been imported

In [2]:
genre_df.head()

Unnamed: 0,TRACK_ID,ARTIST_ID,ALBUM_ID,PATH,DURATION,TAGS
1,track_0000214,artist_000014,album_000031,14/214.mp3,124.6,genre---punkrock
2,track_0000215,artist_000014,album_000031,15/215.mp3,151.4,genre---metal
3,track_0000216,artist_000014,album_000031,16/216.mp3,234.9,genre---metal
4,track_0000217,artist_000014,album_000031,17/217.mp3,127.9,genre---punkrock
5,track_0000218,artist_000014,album_000031,18/218.mp3,180.7,genre---punkrock


In [3]:
genre_df.tail()

Unnamed: 0,TRACK_ID,ARTIST_ID,ALBUM_ID,PATH,DURATION,TAGS
55211,track_1422056,artist_496314,album_165847,56/1422056.mp3,516.0,genre---soundtrack
55212,track_1422057,artist_496314,album_165847,57/1422057.mp3,374.9,genre---soundtrack
55213,track_1422058,artist_496314,album_165847,58/1422058.mp3,315.8,genre---soundtrack
55214,track_1422059,artist_496314,album_165847,59/1422059.mp3,201.3,genre---soundtrack
55215,track_1422060,artist_496314,album_165847,60/1422060.mp3,336.0,genre---soundtrack


In [4]:
genre_df[37045:37055]

Unnamed: 0,TRACK_ID,ARTIST_ID,ALBUM_ID,PATH,DURATION,TAGS
37046,track_1187363,artist_434945,album_142925,63/1187363.mp3,118.0,genre---classical genre---orchestral genre---s...
37047,track_1187420,artist_434945,album_142925,20/1187420.mp3,109.8,genre---classical genre---orchestral genre---s...
37048,track_1187421,artist_434945,album_142925,21/1187421.mp3,117.4,genre---classical genre---orchestral genre---s...
37049,track_1187479,artist_434945,album_142925,79/1187479.mp3,90.5,genre---classical genre---orchestral genre---s...
37050,track_1187638,artist_005365,album_142937,38/1187638.mp3,235.5,genre---pop genre---popfolk genre---singersong...
37051,track_1187639,artist_005365,album_142937,39/1187639.mp3,194.5,genre---pop genre---popfolk genre---singersong...
37052,track_1187640,artist_005365,album_142937,40/1187640.mp3,212.3,genre---folk genre---popfolk genre---singerson...
37053,track_1187641,artist_005365,album_142937,41/1187641.mp3,187.9,genre---pop genre---popfolk genre---singersong...
37054,track_1187642,artist_005365,album_142937,42/1187642.mp3,214.6,genre---pop genre---popfolk genre---singersong...
37055,track_1187705,artist_459855,album_142908,05/1187705.mp3,136.1,genre---ambient genre---classical genre---newwave


## Converting the column names to lower case

In [5]:
genre_df.columns = list(map(str.lower, genre_df.columns))

## Striping the values in the dataframe and removing whitespace

In [6]:
genre_df = genre_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

## Changing the dtype of the columns

In [7]:
genre_df.dtypes

track_id     object
artist_id    object
album_id     object
path         object
duration     object
tags         object
dtype: object

### Getting rid of the prefixes of track_id, artist_id and album_id

In [8]:
def remove_prefix(df, column_name, prefix):
  all_values_start_with_str = df[column_name].str.startswith(prefix).all()

  if all_values_start_with_str:
    df[column_name] = df[column_name].str.replace(prefix, '')

  return df

In [9]:
genre_df = remove_prefix(genre_df, 'track_id', 'track_')

In [10]:
genre_df = remove_prefix(genre_df, 'artist_id', 'artist_')

In [11]:
genre_df = remove_prefix(genre_df, 'album_id', 'album_')

In [12]:
genre_df

Unnamed: 0,track_id,artist_id,album_id,path,duration,tags
1,0000214,000014,000031,14/214.mp3,124.6,genre---punkrock
2,0000215,000014,000031,15/215.mp3,151.4,genre---metal
3,0000216,000014,000031,16/216.mp3,234.9,genre---metal
4,0000217,000014,000031,17/217.mp3,127.9,genre---punkrock
5,0000218,000014,000031,18/218.mp3,180.7,genre---punkrock
...,...,...,...,...,...,...
55211,1422056,496314,165847,56/1422056.mp3,516.0,genre---soundtrack
55212,1422057,496314,165847,57/1422057.mp3,374.9,genre---soundtrack
55213,1422058,496314,165847,58/1422058.mp3,315.8,genre---soundtrack
55214,1422059,496314,165847,59/1422059.mp3,201.3,genre---soundtrack


In [13]:
genre_df.dtypes

track_id     object
artist_id    object
album_id     object
path         object
duration     object
tags         object
dtype: object

In [14]:
num_cols = ['track_id', 'artist_id', 'album_id']

genre_df[num_cols] = genre_df[num_cols].apply(pd.to_numeric)

In [15]:
genre_df.dtypes

track_id      int64
artist_id     int64
album_id      int64
path         object
duration     object
tags         object
dtype: object

In [16]:
genre_df

Unnamed: 0,track_id,artist_id,album_id,path,duration,tags
1,214,14,31,14/214.mp3,124.6,genre---punkrock
2,215,14,31,15/215.mp3,151.4,genre---metal
3,216,14,31,16/216.mp3,234.9,genre---metal
4,217,14,31,17/217.mp3,127.9,genre---punkrock
5,218,14,31,18/218.mp3,180.7,genre---punkrock
...,...,...,...,...,...,...
55211,1422056,496314,165847,56/1422056.mp3,516.0,genre---soundtrack
55212,1422057,496314,165847,57/1422057.mp3,374.9,genre---soundtrack
55213,1422058,496314,165847,58/1422058.mp3,315.8,genre---soundtrack
55214,1422059,496314,165847,59/1422059.mp3,201.3,genre---soundtrack


In [17]:
from google.colab import  drive
drive.mount('/drive')
# genre_df.to_csv('/drive/My Drive/colab_data/genre_df.csv')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


# Importing raw_30s_cleantags_50artists.tsv

In [18]:
# Specify the file path
file_path = '/content/mtg-jamendo-dataset/data/raw_30s_cleantags_50artists.tsv'

# Initialize an empty list to store the data
data_list = []

# Open the file and read it line by line
with open(file_path, 'r') as file:
    for line in file:
        # Split the line based on tabs
        parts = line.strip().split('\t')

        # Combine the values in the last part if there are more than 6 parts
        if len(parts) > 6:
            parts[5] = ' '.join(parts[5:])
            del parts[6:]

        # Append the line to the data list
        data_list.append(parts)

# Create a DataFrame from the list
df = pd.DataFrame(data_list)

df.columns = df.iloc[0]
df = df[1:]

In [19]:
df.head()

Unnamed: 0,TRACK_ID,ARTIST_ID,ALBUM_ID,PATH,DURATION,TAGS
1,track_0000214,artist_000014,album_000031,14/214.mp3,124.6,genre---punkrock
2,track_0000215,artist_000014,album_000031,15/215.mp3,151.4,genre---metal
3,track_0000216,artist_000014,album_000031,16/216.mp3,234.9,genre---metal
4,track_0000217,artist_000014,album_000031,17/217.mp3,127.9,genre---punkrock
5,track_0000218,artist_000014,album_000031,18/218.mp3,180.7,genre---punkrock


In [20]:
df.tail()

Unnamed: 0,TRACK_ID,ARTIST_ID,ALBUM_ID,PATH,DURATION,TAGS
55605,track_1422056,artist_496314,album_165847,56/1422056.mp3,516.0,genre---soundtrack instrument---computer mood/...
55606,track_1422057,artist_496314,album_165847,57/1422057.mp3,374.9,genre---soundtrack instrument---computer mood/...
55607,track_1422058,artist_496314,album_165847,58/1422058.mp3,315.8,genre---soundtrack instrument---computer mood/...
55608,track_1422059,artist_496314,album_165847,59/1422059.mp3,201.3,genre---soundtrack instrument---computer mood/...
55609,track_1422060,artist_496314,album_165847,60/1422060.mp3,336.0,genre---soundtrack instrument---computer mood/...


In [21]:
df[37045:37055]

Unnamed: 0,TRACK_ID,ARTIST_ID,ALBUM_ID,PATH,DURATION,TAGS
37046,track_1180386,artist_457845,album_142044,86/1180386.mp3,292.2,genre---dance genre---electronic genre---elect...
37047,track_1180388,artist_457845,album_142044,88/1180388.mp3,216.8,genre---dance genre---electronic genre---elect...
37048,track_1180401,artist_457845,album_142044,01/1180401.mp3,246.9,genre---dance genre---electronic genre---elect...
37049,track_1180402,artist_457845,album_142044,02/1180402.mp3,210.1,genre---dance genre---electronic genre---elect...
37050,track_1180588,artist_007349,album_142065,88/1180588.mp3,330.4,genre---dance genre---hiphop genre---rap mood/...
37051,track_1180593,artist_007349,album_142065,93/1180593.mp3,267.9,genre---hiphop genre---rap genre---rnb
37052,track_1180599,artist_007349,album_142065,99/1180599.mp3,284.5,genre---hiphop genre---rap genre---rnb
37053,track_1180601,artist_007349,album_142065,01/1180601.mp3,271.1,genre---hiphop genre---rap genre---rnb
37054,track_1180612,artist_007349,album_142065,12/1180612.mp3,343.8,genre---hiphop genre---pop genre---rnb
37055,track_1180613,artist_007349,album_142065,13/1180613.mp3,236.6,genre---club genre---dance genre---pop mood/th...


In [22]:
df.columns = list(map(str.lower, df.columns))

In [23]:
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [24]:
df = remove_prefix(df, 'track_id', 'track_')
df = remove_prefix(df, 'artist_id', 'artist_')
df = remove_prefix(df, 'album_id', 'album_')

In [25]:
df

Unnamed: 0,track_id,artist_id,album_id,path,duration,tags
1,0000214,000014,000031,14/214.mp3,124.6,genre---punkrock
2,0000215,000014,000031,15/215.mp3,151.4,genre---metal
3,0000216,000014,000031,16/216.mp3,234.9,genre---metal
4,0000217,000014,000031,17/217.mp3,127.9,genre---punkrock
5,0000218,000014,000031,18/218.mp3,180.7,genre---punkrock
...,...,...,...,...,...,...
55605,1422056,496314,165847,56/1422056.mp3,516.0,genre---soundtrack instrument---computer mood/...
55606,1422057,496314,165847,57/1422057.mp3,374.9,genre---soundtrack instrument---computer mood/...
55607,1422058,496314,165847,58/1422058.mp3,315.8,genre---soundtrack instrument---computer mood/...
55608,1422059,496314,165847,59/1422059.mp3,201.3,genre---soundtrack instrument---computer mood/...


In [26]:
df.iloc[-1].at['tags']

'genre---soundtrack instrument---computer mood/theme---advertising mood/theme---dramatic mood/theme---epic mood/theme---movie'

In [18]:
# df.to_csv('/drive/My Drive/colab_data/raw_df.csv')