# Simple Classifiers and preprocessing of the data

In [1]:
import sys
sys.path.append('..')

import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from src.plot_utils import hist, count, scatter
from dotenv import load_dotenv

SEED = 42

## Importing of the data and spliting

next, we will import the dataset:

In [2]:
CSV_PATH = '../data/chartex_clean.csv'

df = pd.read_csv(CSV_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4447 entries, 0 to 4446
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   track_name                    4446 non-null   object 
 1   track_pop                     4447 non-null   int64  
 2   artist                        4446 non-null   object 
 3   artist_pop                    4447 non-null   int64  
 4   album                         4446 non-null   object 
 5   length                        4447 non-null   int64  
 6   danceability                  4443 non-null   float64
 7   energy                        4443 non-null   float64
 8   key                           4443 non-null   float64
 9   loudness                      4443 non-null   float64
 10  mode                          4443 non-null   float64
 11  speechiness                   4443 non-null   float64
 12  acousticness                  4443 non-null   float64
 13  ins

Let's fill all missing data with the expectation:

In [3]:
df = df.drop(['track_name', 'artist', 'album', 'id', 'song_name', 'artist_name'], axis = 1)
means = df.mean()
df.fillna(value=means, inplace=True)

next, we will split the target feature (track_pop) from the rest of the features and split to train and test sets. In addition, we will replace track_pop with a new binary feature that indicate if a track is popular according to track_pop and threshold of our choice and it will be out target feature for **classification**:

In [4]:
from sklearn.model_selection import train_test_split 

X, y = df.drop(['track_pop', 'length'], axis = 1).values , df['track_pop'].values
#for classification:
# 50 is the threshold.
y = (y > 50).astype('int32')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = SEED, test_size=0.25)

Let's investigate a bit the train and test sets:

In [5]:
print("Train:")
print("#samples in train:", y_train.shape[0])
print("#popular samples in train:", y_train.sum())

print("\nTest:")
print("#samples in test:", y_test.shape[0])
print("#popular samples in test:", y_test.sum())

Train:
#samples in train: 3335
#popular samples in train: 1461

Test:
#samples in test: 1112
#popular samples in test: 484


## Simple classifiers:

In [6]:
from sklearn.model_selection import cross_validate

def print_accuracy(model, X_train, X_test, y_train, y_test):
    y_pred = model.predict(X_train)
    print("train accuracy =",(y_train == y_pred).mean())

    y_pred = model.predict(X_test)
    print("test accuracy =",(y_test == y_pred).mean())
"""
def cross_validation(X, y, cv, modelConstructor, list_of_config, const_config = None):
    accuracy_per_value = []

    for config in list_of_config:
        if const_config is None:
            model = modelConstructor(**config)
        else:
            model = modelConstructor(**const_config , **config)
        cv_result = cross_validate(model, X, y, cv=cv, scoring= ('accuracy'))
        accuracy_per_value.append(cv_result['test_score'].mean())
        
    return accuracy_per_value
"""
splitted_data = (X_train, X_test, y_train, y_test)

### Logistic regression:

Now we are ready to train the models.

We will start with Logistic regression with l2 regularization:

In [61]:
from sklearn import linear_model

logit_lin_l2_model = linear_model.LogisticRegression(C = 5)
logit_lin_l2_model.fit(X_train,y_train)

In [63]:
print_accuracy(logit_lin_l2_model, *splitted_data)

train accuracy = 0.5631184407796102
test accuracy = 0.5584532374100719


We can see that the model is just better than random classifier.

Considering that we used the model on the raw data it is not suprising that the model did bad job in predicting the popularity.

Let's try decision trees!

### Decision Trees:

Let's put regularization by forcing all leaves in the tree to have at least 10 samples from the training set:

In [9]:
from sklearn import tree

tree_model = tree.DecisionTreeClassifier(criterion = 'entropy', min_samples_split = 10)

tree_model.fit(X_train, y_train)

In [10]:
print_accuracy(tree_model, *splitted_data)

train accuracy = 0.9472263868065967
test accuracy = 0.6223021582733813


Has we can see from the result, the model suffers from overfitting. So, let's do cross validation on min_samples_split:

In [11]:
from sklearn import tree

list_of_mins = [10,50,100,300,500,1000]
accuracy_per_value = []

for min_leaf in list_of_mins:
    tree_model = tree.DecisionTreeClassifier(criterion = 'entropy', min_samples_split = min_leaf)
    cv_result = cross_validate(tree_model, X_train, y_train, cv=3, scoring= ('accuracy'))
    accuracy_per_value.append(cv_result['test_score'].mean())

optim_min = list_of_mins[np.argmax(accuracy_per_value)]
tree_model = tree.DecisionTreeClassifier(criterion = 'entropy', min_samples_split = optim_min)
tree_model.fit(X_train, y_train)

And then, the accuracies of the model with the optimal min_samples_split is:

In [12]:
print_accuracy(tree_model, *splitted_data)

train accuracy = 0.6962518740629685
test accuracy = 0.6276978417266187


Still not a good result. We need to remember that we are still working on the row data and didn't do any feature mapping

### AdaBoost classifier:

Let's play with AdaBoost a bit:

In [7]:
from sklearn.ensemble import AdaBoostClassifier

#the base model is stump - decision tree with maximal depth of 1.
adaboost_model = AdaBoostClassifier(n_estimators=50, learning_rate=20)
adaboost_model.fit(X_train, y_train)

In [8]:
print_accuracy(adaboost_model, *splitted_data)

train accuracy = 0.6251874062968515
test accuracy = 0.6276978417266187


Let's see if Cross validation can help it:

In [18]:
estimators = [10, 25, 50]
lr = [0.01, 0.1, 1]
accuracy_per_value = []

for n_estimator in estimators:
    for rate in lr:
        adaboost_model = AdaBoostClassifier(n_estimators=n_estimator, learning_rate= rate)
        cv_result = cross_validate(adaboost_model, X_train, y_train, cv=3, scoring= ('accuracy'))
        accuracy_per_value.append(cv_result['test_score'].mean())

optim_idx = np.argmax(accuracy_per_value)
adaboost_model = AdaBoostClassifier(n_estimators= estimators[optim_idx//len(lr)], learning_rate= lr[optim_idx%len(lr)])
adaboost_model.fit(X_train, y_train)

In [20]:
print_accuracy(adaboost_model, *splitted_data)

train accuracy = 0.672263868065967
test accuracy = 0.6447841726618705


### MLP model:

Let's check how many features we have:

In [8]:
input_dim = X_train.shape[1]
output_dim = 2
train_samples = X_train.shape[0]

print("input dim:", input_dim)
print("number of train samples:", train_samples)
print("number of test samples:", X_test.shape[0])

input dim: 17
number of train samples: 3335
number of test samples: 1112


Now, let's create the net!

In [22]:
import torch
import torch.nn as nn

mlp_model = nn.Sequential(
    nn.Linear(input_dim, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, output_dim),
    nn.Softmax(dim=1)
)

print(mlp_model)

Sequential(
  (0): Linear(in_features=17, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=2, bias=True)
  (5): Softmax(dim=1)
)


In [20]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(mlp_model.parameters(), lr=1e-4)

In [11]:
X_train = torch.tensor(X_train, dtype = torch.float32)
y_train = torch.tensor(y_train, dtype = torch.long)

X_test = torch.tensor(X_test, dtype = torch.float32)
y_test = torch.tensor(y_test, dtype = torch.long)


In [12]:
batch_size = 128

num_epochs = 30
num_batches = train_samples//batch_size

samples_in_epoch = batch_size * num_batches

In [23]:
#The training loop
for epoch in range(num_epochs):
    tot_loss = 0.
    tot_correct = 0.
    perm = torch.randperm(train_samples)
    X_train = X_train[perm,:]
    y_train = y_train[perm]
    for batch_idx in range(num_batches):
        X_batch = X_train[batch_idx*batch_size:(batch_idx+1)*batch_size,:]
        y_batch = y_train[batch_idx*batch_size:(batch_idx+1)*batch_size]
        y_pred = mlp_model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        tot_loss += loss.item()
        tot_correct += (torch.argmax(y_pred,dim=1) == y_batch).sum().item()
    print("epoch number #", epoch, ":")
    print("loss:", tot_loss/num_batches)
    print("accuracy:", tot_correct/samples_in_epoch,"\n") 

epoch number # 0 :
loss: 0.8736584392877725
accuracy: 0.43960336538461536 

epoch number # 1 :
loss: 0.8730574823342837
accuracy: 0.4402043269230769 

epoch number # 2 :
loss: 0.8736584324103135
accuracy: 0.43960336538461536 

epoch number # 3 :
loss: 0.8730574731643383
accuracy: 0.4402043269230769 

epoch number # 4 :
loss: 0.8733579539335691
accuracy: 0.43990384615384615 

epoch number # 5 :
loss: 0.8733579470561101
accuracy: 0.43990384615384615 

epoch number # 6 :
loss: 0.8730574823342837
accuracy: 0.4402043269230769 

epoch number # 7 :
loss: 0.8733579608110281
accuracy: 0.43990384615384615 

epoch number # 8 :
loss: 0.8724564955784724
accuracy: 0.44080528846153844 

epoch number # 9 :
loss: 0.8733579516410828
accuracy: 0.43990384615384615 

epoch number # 10 :
loss: 0.8733579241312467
accuracy: 0.43990384615384615 

epoch number # 11 :
loss: 0.8736584415802588
accuracy: 0.43960336538461536 

epoch number # 12 :
loss: 0.8736584347027999
accuracy: 0.43960336538461536 

epoch number

In [None]:
#Save the entire model:
#torch.save(model, "my_model.pickle")
#model = torch.load("my_model.pickle")

#Save only the weights: (recommended)
#torch.save(model.state_dict(), "my_model.pickle")
#model = nn.Sequential(...)
#model.load_state_dict(torch.load("my_model.pickle"))