### Deep Music Classification

### Introduction

In [110]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

In [111]:
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [112]:
df["genre"].unique()

array(['pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'],
      dtype=object)

In [113]:
genres = {
    "pop": 0,
    "country" : 1, 
    "blues": 2,
    "jazz": 3,
    "reggae": 4,
    "rock": 5,
    "hip hop": 6
}
df["genre"] = df["genre"].apply(genres.get)
df.head(5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,0,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,0,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,0,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,0,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,0,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


What would be the baseline accuracy for our model?

In [114]:
# baseline accuracy
df.groupby("genre").size() / len(df)

genre
0    0.248202
1    0.191915
2    0.162273
3    0.135521
4    0.088045
5    0.142182
6    0.031862
dtype: float64

### Neural Network 1: Lyrics

If our model always predicts a song to be pop, it would achieve 25% accuracy. Let's see if we can beat this using neural networks.

In [115]:
from torch.utils.data import Dataset, DataLoader

class TextDataFromDF(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __getitem__(self, index):
        return self.df.iloc[index, 5], self.df.iloc[index, 4]

    def __len__(self):
        return len(self.df)                

In [116]:
df_train, df_val = train_test_split(df,shuffle = True, test_size = 0.2)
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

In [117]:
train_data[3]

('place remember life change forever better go remain place moments lovers friends recall dead live life love friends lovers compare memories lose mean think know lose affection people things go know stop think life know lose affection people things go know stop think life life',
 1)

In [161]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

tokenized = tokenizer(train_data[194][0])
tokenized[0:10]

['girls',
 'kind',
 'look',
 'listen',
 'tell',
 'hand',
 'come',
 'tell',
 'equal',
 'opportunity']

In [119]:
def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

First couple elements of the vocabulary

In [120]:
vocab.get_itos()[0:10]

['<unk>',
 'know',
 'like',
 'time',
 'come',
 'go',
 'heart',
 'yeah',
 'feel',
 'away']

Tokenized vocabulary

In [160]:
vocab(tokenized)[0:10]

[293, 215, 27, 145, 17, 33, 4, 17, 1490, 2693]

In [122]:
max_len = 30
num_tokens = len(vocab.get_itos())
def text_pipeline(x):
    tokens = vocab(tokenizer(x))
    y = torch.zeros(max_len, dtype=torch.int64) + num_tokens
    if len(tokens) > max_len:
        tokens = tokens[0:max_len]
    y[0:len(tokens)] = torch.tensor(tokens,dtype=torch.int64)
    return y

label_pipeline = lambda x: int(x)

In [123]:
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:

        # add label to list
         label_list.append(label_pipeline(_label))

         # add text (as sequence of integers) to list
         processed_text = text_pipeline(_text)
         text_list.append(processed_text)

    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list)
    return text_list, label_list

In [124]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x2969625b0>

In [125]:
from torch import nn

class TextClassificationModel(nn.Module):
    
    def __init__(self,vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.fc   = nn.Linear(max_len*embedding_dim, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return(x)

Learn and Train a model!

In [126]:
vocab_size = len(vocab)
embedding_dim = 3

model = TextClassificationModel(vocab_size, embedding_dim, max_len, num_class = 7)

In [127]:
import time

optimizer = torch.optim.Adam(model.parameters(), lr=.1)
loss_fn = torch.nn.CrossEntropyLoss()

def train(dataloader):
    epoch_start_time = time.time()
    # keep track of some counts for measuring accuracy
    total_acc, total_count = 0, 0
    log_interval = 300
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        # zero gradients
        optimizer.zero_grad()
        # form prediction on batch
        predicted_label = model(text)
        # evaluate loss on prediction
        loss = loss_fn(predicted_label, label)
        # compute gradient
        loss.backward()
        # take an optimization step
        optimizer.step()

        # for printing accuracy
        total_acc   += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')
    
def evaluate(dataloader):

    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [129]:
'''
EPOCHS = 50
for epoch in range(1, EPOCHS + 1):
    train(train_loader)
'''

'\nEPOCHS = 50\nfor epoch in range(1, EPOCHS + 1):\n    train(train_loader)\n'

In [130]:
evaluate(val_loader)

0.19541850220264317

### Neural Network 2: Engineered Features

In [151]:
engineered_features = ['genre', 'dating', 'violence', 'world/life', 'night/time','shake the audience','family/gospel', 'romantic', 'communication','obscene', 'music', 'movement/places', 'light/visual perceptions','family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability','loudness', 'acousticness', 'instrumentalness', 'valence', 'energy']      
len(engineered_features)

23

In [152]:
df2 = df[engineered_features]
df2.head()

Unnamed: 0,genre,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,...,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy
0,0,0.000598,0.063746,0.000598,0.000598,0.000598,0.048857,0.017104,0.263751,0.000598,...,0.000598,0.000598,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711
1,0,0.035537,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,0.001284,0.001284,...,0.051124,0.001284,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324
2,0,0.00277,0.00277,0.00277,0.00277,0.00277,0.00277,0.158564,0.250668,0.00277,...,0.00277,0.00277,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112
3,0,0.048249,0.001548,0.001548,0.001548,0.0215,0.001548,0.411536,0.001548,0.001548,...,0.001548,0.081132,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736
4,0,0.00135,0.00135,0.417772,0.00135,0.00135,0.00135,0.46343,0.00135,0.00135,...,0.029755,0.00135,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375


In [153]:
from sklearn.preprocessing import StandardScaler
# split into features and target
X = df2.drop("genre", axis = 1).values
y = df2['genre'].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

# train test split
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle = True, test_size = 0.2)

In [157]:
import tensorflow

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define neural network architecture
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation = 'relu'),
    Dense(32, activation= 'relu'),
    Dense(16, activation= 'relu'),
    Dense(1, activation= 'softmax')  # Adjust output layer units based on your genre data
])

# compile model
model.compile(optimizer = 'adam', loss = 'mse', metrics = ['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

print("accuracy" )
model.evaluate(X_val,y_val)[1]

Epoch 1/50




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 888us/step - accuracy: 0.1916 - loss: 4.8652 - val_accuracy: 0.1852 - val_loss: 4.8317
Epoch 2/50
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step - accuracy: 0.1993 - loss: 4.7518 - val_accuracy: 0.1852 - val_loss: 4.8317
Epoch 3/50
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 678us/step - accuracy: 0.1919 - loss: 4.9187 - val_accuracy: 0.1852 - val_loss: 4.8317
Epoch 4/50
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 672us/step - accuracy: 0.1936 - loss: 4.8834 - val_accuracy: 0.1852 - val_loss: 4.8317
Epoch 5/50
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 674us/step - accuracy: 0.1919 - loss: 4.8719 - val_accuracy: 0.1852 - val_loss: 4.8317
Epoch 6/50
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step - accuracy: 0.1949 - loss: 4.8677 - val_accuracy: 0.1852 - val_loss: 4.8317
Epoch 7/50
[1m568/568[0m 

0.19066078960895538