# Deep Music Classification

[introductory paragraph]

### Introduction

### Data Preparation

In [1]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [3]:
df["genre"].unique()

array(['pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'],
      dtype=object)

In [4]:
genres = {
    "pop": 0,
    "country" : 1, 
    "blues": 2,
    "jazz": 3,
    "reggae": 4,
    "rock": 5,
    "hip hop": 6
}
df["genre"] = df["genre"].apply(genres.get)
df.head(5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,0,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,0,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,0,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,0,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,0,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


What would be the baseline accuracy for our model?

In [5]:
# baseline accuracy
df.groupby("genre").size() / len(df)

genre
0    0.248202
1    0.191915
2    0.162273
3    0.135521
4    0.088045
5    0.142182
6    0.031862
dtype: float64

### Neural Network 1: Lyrics

If our model always predicts a song to be pop, it would achieve 25% accuracy. Let's see if we can beat this using neural networks.

In [6]:
from torch.utils.data import Dataset, DataLoader

class TextDataFromDF(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __getitem__(self, index):
        return self.df.iloc[index, 5], self.df.iloc[index, 4], self.df.iloc[index, 6:28] #add a third item for features

    def __len__(self):
        return len(self.df)                

In [7]:
df_train, df_val = train_test_split(df,shuffle = True, test_size = 0.2)
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

In [8]:
train_data[3]

('deep heart lie song dream live memory beneath star alamo enchantment strange blue moonlit path know hear break song moon splendor know heart lips sweet tender like petals fallin apart speak break song word know live heart moonlit pass alamo',
 1,
 len                               40
 dating                      0.001698
 violence                    0.001698
 world/life                  0.089684
 night/time                  0.001698
 shake the audience          0.001698
 family/gospel               0.001698
 romantic                    0.065207
 communication               0.086887
 obscene                     0.001698
 music                       0.217831
 movement/places             0.001698
 light/visual perceptions    0.181755
 family/spiritual            0.001698
 like/girls                  0.001698
 sadness                     0.336565
 feelings                    0.001698
 danceability                0.736814
 loudness                    0.734559
 acousticness                

In [9]:
tokenizer = get_tokenizer('basic_english')
tokenized = tokenizer(train_data[194][0])
tokenized[0:10]

['love',
 'love',
 'little',
 'girl',
 'blue',
 'fellow',
 'go',
 'leave',
 'come',
 'best']

In [10]:
def yield_tokens(data_iter):
    for text, features, _ in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

First couple elements of the vocabulary

In [11]:
vocab.get_itos()[0:10]

['<unk>',
 'know',
 'like',
 'time',
 'come',
 'go',
 'heart',
 'away',
 'yeah',
 'feel']

Tokenized vocabulary

In [12]:
vocab(tokenized)[0:10]

[62, 62, 37, 47, 55, 2087, 5, 15, 4, 148]

In [13]:
max_len = 30
num_tokens = len(vocab.get_itos())
def text_pipeline(x):
    tokens = vocab(tokenizer(x))
    y = torch.zeros(max_len, dtype=torch.int64) + num_tokens
    if len(tokens) > max_len:
        tokens = tokens[0:max_len]
    y[0:len(tokens)] = torch.tensor(tokens,dtype=torch.int64)
    return y

label_pipeline = lambda x: int(x)

In [14]:
def collate_batch(batch):
    label_list, text_list, feature_list = [], [], []
    for (_text, _label, _features) in batch:
         
         # featire pipeline
         feature_list.append(torch.tensor(_features))

        # add label to list
         label_list.append(label_pipeline(_label))

         # add text (as sequence of integers) to list
         processed_text = text_pipeline(_text)
         text_list.append(processed_text)

    feature_list = torch.stack(feature_list)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list)
    return text_list, label_list, feature_list

In [15]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [16]:
from torch import nn
import torch.nn.functional as F

class TextClassificationModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.dropout = nn.Dropout(p=0.2)
        self.fc   = nn.Linear(embedding_dim, num_class)  

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        x = x.mean(axis=1)  
        x = self.fc(x)
        return x

Learn and Train a model!

In [17]:
vocab_size = len(vocab)
embedding_dim = 3
max_len = 100
num_class = 7

lyrics_model = TextClassificationModel(vocab_size, embedding_dim, max_len, num_class)

optimizer = torch.optim.Adam(lyrics_model.parameters(), lr=.1)
loss_fn = torch.nn.CrossEntropyLoss()

In [18]:
import time

def train(model, dataloader, lyrics, engineering):
    # keep track of time for each epoch
    epoch_start_time = time.time()
    log_interval = 300
    start_time = time.time()

    # for measuring accuracy
    total_acc, total_count = 0, 0

    for idx, (text, label, features) in enumerate(dataloader):
        # zero gradients
        optimizer.zero_grad()

        # prediction on batch, based on specified features 
        predicted_label = ''
        if(engineering and not lyrics):
            predicted_label = model(features)
        elif(lyrics and not engineering):
            predicted_label = model(text)
        elif(lyrics and engineering):
            predicted_label = model(text, features)

        # evaluate loss on prediction
        loss = loss_fn(predicted_label, label)

        # compute gradient
        loss.backward()

        # take an optimization step
        optimizer.step()

        # for printing accuracy
        total_acc   += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')
    
def evaluate(model, dataloader, lyrics=True, engineering=True):

    # for determining accuracy
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label, features) in enumerate(dataloader):

            # form prediction on batch
            if(engineering and not lyrics):
                predicted_label = model(features)
            elif(lyrics and not engineering):
                predicted_label = model(text)
            elif(lyrics and engineering):
                predicted_label = model(text, features)

            # compute accuracy
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            
    return total_acc/total_count

In [19]:
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    train(lyrics_model, train_loader, lyrics = True, engineering = False)

  feature_list.append(torch.tensor(_features))


| epoch   1 | train accuracy    0.289 | time: 10.95s
| epoch   2 | train accuracy    0.396 | time: 15.47s
| epoch   3 | train accuracy    0.460 | time: 16.23s
| epoch   4 | train accuracy    0.516 | time: 16.93s
| epoch   5 | train accuracy    0.562 | time: 14.68s


In [20]:
evaluate(lyrics_model, val_loader, lyrics = True, engineering = False)

  feature_list.append(torch.tensor(_features))


0.32405286343612333

### Neural Network 2: Engineered Features

In [21]:
engineered_features = ['genre', 'dating', 'violence', 'world/life', 'night/time','shake the audience','family/gospel', 'romantic', 'communication','obscene', 'music', 'movement/places', 'light/visual perceptions','family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability','loudness', 'acousticness', 'instrumentalness', 'valence', 'energy']      
len(engineered_features)

23

In [22]:
df_train, df_val = train_test_split(df,shuffle = True, test_size = 0.2)
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

In [33]:
from torch import nn

class EngineeringClassificationModel(nn.Module):
    def __init__(self, input_size, num_class):
        super().__init__()
        self.model = nn.Sequential( 
            nn.Linear(input_size, 128), 
            nn.ReLU(), 
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(), 
            nn.Dropout(0.2),
            nn.Linear(64, 32), 
            nn.ReLU(), 
            nn.Dropout(0.2),
            nn.Linear(32, 16), 
            nn.ReLU(), 
            nn.Linear(16, num_class), 
            nn.Softmax(dim=1) )

    def forward(self, x):
        x = x.float()
        x = torch.flatten(x, 1)
        x = self.model(x)
        return x

In [34]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [35]:
input_size = 22
num_classes = 7
engineer_model = EngineeringClassificationModel(input_size, num_classes)

optimizer = torch.optim.Adam(engineer_model.parameters(), lr=.0001)
loss_fn = torch.nn.CrossEntropyLoss()

In [36]:
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    train(engineer_model, train_loader, lyrics=False, engineering=True)

  feature_list.append(torch.tensor(_features))


| epoch   1 | train accuracy    0.239 | time: 11.25s
| epoch   2 | train accuracy    0.247 | time: 17.06s
| epoch   3 | train accuracy    0.247 | time: 16.51s


In [27]:
evaluate(engineer_model, val_loader, lyrics = False, engineering = True)

  feature_list.append(torch.tensor(_features))


0.24986784140969162

### Neural Network 3: Engineered Featuers 

In [28]:
class CombinedModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, num_features, num_classes):
        super().__init__()

        # separate data into text features and engineered features

        # Text Pipeline
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.text_fc = nn.Linear(embedding_dim, 128)

        # Engineered Features Pipeline
        self.engineered_fc = nn.Linear(num_features, 128)

        # Combined Layers
        self.combine_fc = nn.Linear(12928, 64)
        self.output_fc = nn.Linear(64, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text, engineer):
        # separate x into x_1 (text features) and x_2 (engineered features)

        # text pipeline: try embedding! 
        text_embed = self.embedding(text)
        x_1 = self.text_fc(text_embed)
        x_1 = torch.flatten(x_1, 1)

        # engineered features: fully-connected Linear layers are fine
        engineer = engineer.float()
        x_2 = self.engineered_fc(engineer)

        # ensure that both x_1 and x_2 are 2-d tensors, flattening if necessary
        combined = torch.cat((x_1, x_2), dim=1)

        # pass x through a couple more fully-connected layers and return output
        combined = self.combine_fc(combined)
        output = self.output_fc(combined)
        output = self.softmax(output)

        return output

In [29]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [31]:
vocab_size = len(vocab)
embedding_dim = 3
num_features = 22
num_classes = 7

combined_model = CombinedModel(vocab_size, embedding_dim, num_features, num_classes)

optimizer = torch.optim.Adam(combined_model.parameters(), lr=0.0001)
loss_fn = torch.nn.CrossEntropyLoss()

In [32]:
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    train(combined_model, train_loader, lyrics=True, engineering=True)

  feature_list.append(torch.tensor(_features))


| epoch   1 | train accuracy    0.247 | time: 25.58s


KeyboardInterrupt: 

In [None]:
evaluate(combined_model, val_loader, lyrics = True, engineering = True)

  feature_list.append(torch.tensor(_features))


0.24334801762114538

### Visualize Word Embedding

In [None]:
# for embedding visualization later
import plotly.express as px 
import plotly.io as pio
import numpy as np


In [None]:
embedding_matrix = combined_model.embedding.cpu().weight.data.numpy()
tokens = vocab.get_itos()

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
weights = pca.fit_transform(embedding_matrix)

In [None]:
tokens = vocab.get_itos()
tokens.append(" ")
embedding_df = pd.DataFrame({
    'word' : tokens, 
    'x0'   : weights[:,0],
    'x1'   : weights[:,1]
})

embedding_df

Unnamed: 0,word,x0,x1
0,<unk>,2.462491,0.251168
1,know,3.076616,2.376554
2,like,-0.348808,8.657647
3,time,-2.000949,-1.609269
4,come,0.752938,-3.041938
...,...,...,...
45839,트램펄린,-1.292385,-1.170023
45840,한번쯤은,-0.071745,0.306504
45841,함께라는,-1.114518,-1.150573
45842,ﬁnished,0.339692,0.091119


In [None]:
fig = px.scatter(embedding_df, 
                 x = "x0", 
                 y = "x1", 
                 size = list(np.ones(len(embedding_df))),
                 size_max = 10,
                 hover_name = "word")

fig.show()

In [None]:
feminine = ["she", "her", "woman"]
masculine = ["he", "him", "man"]

highlight_1 = ["strong", "powerful", "smart",     "thinking", "brave", "muscle"]
highlight_2 = ["hot",    "sexy",     "beautiful", "shopping", "children", "thin"]

def gender_mapper(x):
    if x in feminine:
        return 1
    elif x in masculine:
        return 4
    elif x in highlight_1:
        return 3
    elif x in highlight_2:
        return 2
    else:
        return 0

embedding_df["highlight"] = embedding_df["word"].apply(gender_mapper)
embedding_df["size"]      = np.array(1.0 + 50*(embedding_df["highlight"] > 0))

# 
sub_df = embedding_df[embedding_df["highlight"] > 0]

In [None]:
import plotly.express as px 

fig = px.scatter(sub_df, 
                 x = "x0", 
                 y = "x1", 
                 color = "highlight",
                 size = list(sub_df["size"]),
                 size_max = 10,
                 hover_name = "word", 
                 text = "word")

fig.update_traces(textposition='top center')

fig.show()

### Conclusion