# Deep Music Classification

### Introduction

In [141]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [142]:
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [143]:
df["genre"].unique()

array(['pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'],
      dtype=object)

In [144]:
genres = {
    "pop": 0,
    "country" : 1, 
    "blues": 2,
    "jazz": 3,
    "reggae": 4,
    "rock": 5,
    "hip hop": 6
}
df["genre"] = df["genre"].apply(genres.get)
df.head(5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,0,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,0,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,0,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,0,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,0,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


What would be the baseline accuracy for our model?

In [145]:
# baseline accuracy
df.groupby("genre").size() / len(df)

genre
0    0.248202
1    0.191915
2    0.162273
3    0.135521
4    0.088045
5    0.142182
6    0.031862
dtype: float64

### Neural Network 1: Lyrics

If our model always predicts a song to be pop, it would achieve 25% accuracy. Let's see if we can beat this using neural networks.

In [146]:
from torch.utils.data import Dataset, DataLoader

class TextDataFromDF(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __getitem__(self, index):
        return self.df.iloc[index, 5], self.df.iloc[index, 4], self.df.iloc[index, 6:28] #add a third item for features

    def __len__(self):
        return len(self.df)                

In [147]:
df_train, df_val = train_test_split(df,shuffle = True, test_size = 0.2)
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

In [148]:
train_data[3]

('head bald head bald business buyin head bald head bald business buyin head bald head bald business buyin',
 2,
 len                               18
 dating                      0.005263
 violence                    0.605263
 world/life                  0.005263
 night/time                  0.005263
 shake the audience          0.005263
 family/gospel               0.305263
 romantic                    0.005263
 communication               0.005263
 obscene                     0.005263
 music                       0.005263
 movement/places             0.005263
 light/visual perceptions    0.005263
 family/spiritual            0.005263
 like/girls                  0.005263
 sadness                     0.005263
 feelings                    0.005263
 danceability                 0.36532
 loudness                    0.671256
 acousticness                0.000117
 instrumentalness             0.16498
 valence                     0.341509
 Name: 14464, dtype: object)

In [149]:
tokenizer = get_tokenizer('basic_english')
tokenized = tokenizer(train_data[194][0])
tokenized[0:10]

['strange',
 'shape',
 'light',
 'night',
 'see',
 'real',
 'black',
 'lips',
 'seal',
 'fantasy']

In [150]:
def yield_tokens(data_iter):
    for text, features, _ in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

First couple elements of the vocabulary

In [151]:
vocab.get_itos()[0:10]

['<unk>',
 'know',
 'like',
 'time',
 'come',
 'go',
 'heart',
 'away',
 'yeah',
 'feel']

Tokenized vocabulary

In [152]:
vocab(tokenized)[0:10]

[493, 859, 66, 18, 136, 81, 71, 265, 1191, 917]

In [153]:
max_len = 30
num_tokens = len(vocab.get_itos())
def text_pipeline(x):
    tokens = vocab(tokenizer(x))
    y = torch.zeros(max_len, dtype=torch.int64) + num_tokens
    if len(tokens) > max_len:
        tokens = tokens[0:max_len]
    y[0:len(tokens)] = torch.tensor(tokens,dtype=torch.int64)
    return y

label_pipeline = lambda x: int(x)

In [154]:
def collate_batch(batch):
    label_list, text_list, feature_list = [], [], []
    for (_text, _label, _features) in batch:
         
         # featire pipeline
         feature_list.append(torch.tensor(_features))

        # add label to list
         label_list.append(label_pipeline(_label))

         # add text (as sequence of integers) to list
         processed_text = text_pipeline(_text)
         text_list.append(processed_text)

    feature_list = torch.stack(feature_list)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list)
    return text_list, label_list, feature_list

In [155]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [156]:
from torch import nn

class TextClassificationModel(nn.Module):
    
    def __init__(self,vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.fc   = nn.Linear(max_len*embedding_dim, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return(x)

Learn and Train a model!

In [157]:
vocab_size = len(vocab)
embedding_dim = 3
max_len = 100
num_class = 7

lyrics_model = TextClassificationModel(vocab_size, embedding_dim, max_len, num_class)

optimizer = torch.optim.Adam(lyrics_model.parameters(), lr=.1)
loss_fn = torch.nn.CrossEntropyLoss()

In [158]:
import time

def train(model, dataloader, lyrics, engineering):
    epoch_start_time = time.time()
    # keep track of some counts for measuring accuracy
    total_acc, total_count = 0, 0
    log_interval = 300
    start_time = time.time()

    for idx, (text, label, features) in enumerate(dataloader):
        # zero gradients
        optimizer.zero_grad()

        # form prediction on batch
        predicted_label = ''
        if(engineering and not lyrics):
            predicted_label = model(features)
        elif(lyrics and not engineering):
            predicted_label = model(text)
        elif(lyrics and engineering):
            predicted_label = model(text, features)

        # evaluate loss on prediction
        loss = loss_fn(predicted_label, label)

        # compute gradient
        loss.backward()

        # take an optimization step
        optimizer.step()

        # for printing accuracy
        total_acc   += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')
    
def evaluate(model, dataloader, lyrics=True, engineering=True):
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label, features) in enumerate(dataloader):
            # form prediction on batch
            if(engineering and not lyrics):
                predicted_label = model(features)
            elif(lyrics and not engineering):
                predicted_label = model(text)
            elif(lyrics and engineering):
                predicted_label = model(text, features)

            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [159]:
EPOCHS = 1
for epoch in range(1, EPOCHS + 1):
    train(lyrics_model, train_loader, lyrics = True, engineering = False)

  feature_list.append(torch.tensor(_features))


| epoch   1 | train accuracy    0.177 | time: 11.13s


In [160]:
#evaluate(lyrics_model, val_loader, lyrics = True, engineering = True)

### Neural Network 2: Engineered Features

In [161]:
engineered_features = ['genre', 'dating', 'violence', 'world/life', 'night/time','shake the audience','family/gospel', 'romantic', 'communication','obscene', 'music', 'movement/places', 'light/visual perceptions','family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability','loudness', 'acousticness', 'instrumentalness', 'valence', 'energy']      
len(engineered_features)

23

In [162]:
df_train, df_val = train_test_split(df,shuffle = True, test_size = 0.2)
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

In [163]:
from torch import nn

class EngineeringClassificationModel(nn.Module):
    def __init__(self, input_size, num_class):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, num_class),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        x = x.float()
        x = torch.flatten(x, 1)
        x = self.model(x)
        return x

In [164]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [165]:
input_size = 22
num_classes = 7
engineer_model = EngineeringClassificationModel(input_size, num_classes)

optimizer = torch.optim.Adam(engineer_model.parameters(), lr=.1)
loss_fn = torch.nn.CrossEntropyLoss()

In [166]:
EPOCHS = 1
for epoch in range(1, EPOCHS + 1):
    train(engineer_model, train_loader, lyrics=False, engineering=True)

  feature_list.append(torch.tensor(_features))


| epoch   1 | train accuracy    0.248 | time: 14.95s
