# Simple Classifiers and preprocessing of the data

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np


SEED = 42

## Importing of the data and spliting

next, we will import the dataset:

In [None]:
CSV_PATH = '../data/chartex_final.csv'

df = pd.read_csv(CSV_PATH)
df.info()

Let's fill all missing data with the expectation:

In [None]:
df = df.drop(['track_name', 'artist', 'album', 'id', 'song_name', 'artist_name'], axis = 1, errors = 'ignore')
means = df.mean()
df.fillna(value=means, inplace=True)

next, we will split the target feature (track_pop) from the rest of the features and split to train and test sets. In addition, we will replace track_pop with a new binary feature that indicate if a track is popular according to track_pop and threshold of our choice and it will be out target feature for **classification**:

In [None]:
from sklearn.model_selection import train_test_split

df = df.drop(['artist_pop', 'number_of_videos', 'number_of_videos_last_14days', 'total_likes_count', 'key',], axis = 1, errors = 'ignore')
columns = df.columns

X, y = df.drop(['track_pop'], axis=1, errors = 'ignore').values,  df['track_pop'].values

#for classification:
# 50 is the threshold.
y = (y > 50).astype('int32')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = SEED, test_size=0.25)

In [None]:
for i, column in enumerate(columns):
    if i == 0:
        print(f'Target: {column}')
    else:
        print(f'#{i-1} : {column}')

Let's investigate a bit the train and test sets:

In [None]:
print("Train:")
print("#samples in train:", y_train.shape[0])
print("#popular samples in train:", y_train.sum().item()/y_train.shape[0])

print("\nTest:")
print("#samples in test:", y_test.shape[0])
print("#popular samples in test:", y_test.sum().item()/y_test.shape[0])

## Simple classifiers:

In [None]:
from sklearn.model_selection import cross_validate

def print_accuracy(model, X_train, X_test, y_train, y_test):
    y_pred = model.predict(X_train)
    print("train accuracy =",(y_train == y_pred).mean())

    y_pred = model.predict(X_test)
    print("test accuracy =",(y_test == y_pred).mean())

splitted_data = (X_train, X_test, y_train, y_test)

### Logistic regression:

Now we are ready to train the models.

We will start with Logistic regression with l2 regularization:

In [None]:
from sklearn import linear_model

logit_lin_l2_model = linear_model.LogisticRegression(C = 5)
logit_lin_l2_model.fit(X_train,y_train)

In [None]:
print_accuracy(logit_lin_l2_model, *splitted_data)

We can see that the model is just better than random classifier.

Considering that we used the model on the raw data it is not suprising that the model did bad job in predicting the popularity.

Let's try decision trees!

### Decision Trees:

Let's put regularization by forcing all leaves in the tree to have at least 10 samples from the training set:

In [None]:
from sklearn import tree

tree_model = tree.DecisionTreeClassifier(criterion = 'entropy', min_samples_split = 10)

tree_model.fit(X_train, y_train)

In [None]:
print(tree.export_text(tree_model))

In [None]:
print_accuracy(tree_model, *splitted_data)

Has we can see from the result, the model suffers from overfitting. So, let's do cross validation on min_samples_split:

In [None]:
from sklearn import tree

list_of_mins = [10,50,100,300,500,1000]
accuracy_per_value = []

for min_leaf in list_of_mins:
    tree_model = tree.DecisionTreeClassifier(criterion = 'entropy', min_samples_split = min_leaf)
    cv_result = cross_validate(tree_model, X_train, y_train, cv=3, scoring= ('accuracy'))
    accuracy_per_value.append(cv_result['test_score'].mean())

optim_min = list_of_mins[np.argmax(accuracy_per_value)]
tree_model = tree.DecisionTreeClassifier(criterion = 'entropy', min_samples_split = optim_min)
tree_model.fit(X_train, y_train)

And then, the accuracies of the model with the optimal min_samples_split is:

In [None]:
print_accuracy(tree_model, *splitted_data)

Still not a good result. We need to remember that we are still working on the row data and didn't do any feature mapping

### AdaBoost classifier:

Let's play with AdaBoost a bit:

In [None]:
from sklearn.ensemble import AdaBoostClassifier

#the base model is stump - decision tree with maximal depth of 1.
adaboost_model = AdaBoostClassifier(n_estimators=50, learning_rate=20)
adaboost_model.fit(X_train, y_train)

In [None]:
print_accuracy(adaboost_model, *splitted_data)

Let's see if Cross validation can help it:

In [None]:
estimators = [10, 25, 50]
lr = [0.01, 0.1, 1]
accuracy_per_value = []

for n_estimator in estimators:
    for rate in lr:
        adaboost_model = AdaBoostClassifier(n_estimators=n_estimator, learning_rate= rate)
        cv_result = cross_validate(adaboost_model, X_train, y_train, cv=3, scoring= ('accuracy'))
        accuracy_per_value.append(cv_result['test_score'].mean())

optim_idx = np.argmax(accuracy_per_value)
adaboost_model = AdaBoostClassifier(n_estimators= estimators[optim_idx//len(lr)], learning_rate= lr[optim_idx%len(lr)])
adaboost_model.fit(X_train, y_train)

In [None]:
print_accuracy(adaboost_model, *splitted_data)

### MLP model:

Let's check how many features we have:

In [None]:
input_dim = X_train.shape[1]
output_dim = 2
train_samples = X_train.shape[0]

print("input dim:", input_dim)
print("number of train samples:", train_samples)
print("number of test samples:", X_test.shape[0])

In [None]:
import torch
import torch.nn as nn

torch.manual_seed(SEED)

X_train = torch.tensor(X_train, dtype = torch.float32)
y_train = torch.tensor(y_train, dtype = torch.long)

X_test = torch.tensor(X_test, dtype = torch.float32)
y_test = torch.tensor(y_test, dtype = torch.long)


In [None]:
batch_size = 64

num_epochs = 100
num_batches = train_samples//batch_size

#### Simple MLP:

We will start by checking if it is possible to overfit the MLP on small subset of the training set: (in order to see if the implementation of trainer and dataIterator module are good):

In [None]:
mlp_model = nn.Sequential(
    nn.BatchNorm1d(input_dim),
    nn.Linear(input_dim, 64),
    nn.LeakyReLU(),
    nn.Linear(64, 32),
    nn.LeakyReLU(),
    nn.Linear(32, output_dim),
    nn.Softmax(dim=1)
)

print(mlp_model)

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer,start_factor=1.0,end_factor=1e-4,total_iters=5)

In [None]:
from src.trainer import trainer

mlp_trainer = trainer(mlp_model,optimizer,loss_fn,num_epochs,batch_size,True)

loss_list, accuracy_list = mlp_trainer.fit(X_train=X_train,y_train=y_train)

In [None]:
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1,2, figsize=(15,5))

axs[0].plot(range(num_epochs), loss_list)
axs[0].set_xlabel('epoch')
axs[0].set_ylabel('loss')

axs[1].plot(range(num_epochs), accuracy_list)
axs[1].set_xlabel('epoch')
axs[1].set_ylabel('accuracy')


In [None]:
y_pred = mlp_model(X_test)
test_accuracy = (torch.argmax(y_pred,dim=1) == y_test).sum().item() / X_test.shape[0]
print(f'Test accuracy: {test_accuracy}')

We can notice a bit of overfitting because while the train accuracy is above above 74%, the test accuracy is just above 60%.

In addition, it seems that the learning rate is not correct because the decrease in the loss is very small, preventing the model to learn well.

We will next do cross validation on few hyperparameters in order to get the better model.

#### Cross validation for MLP:

In [None]:
import torch
import torch.nn as nn
from src.cross_val import setConfigure, crossValidate

In [None]:
from sklearn.model_selection import KFold
from src.trainer import trainer

num_epochs = 100

kf = KFold(2,shuffle=True, random_state=SEED)

configures = setConfigure({'lr':[1e-3,1e-4]})

results = []
for config in configures:
    mlp_model = nn.Sequential(
        nn.BatchNorm1d(input_dim),
        nn.Linear(input_dim, 64),
        nn.LeakyReLU(),
        nn.Linear(64, 32),
        nn.LeakyReLU(),
        nn.Linear(32, output_dim),
        nn.Softmax(dim=1)
    )
    print(f'The configuration is {config}:')
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp_model.parameters(), config['lr'])
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, total_iters=10)

    mlp_trainer = trainer(mlp_model,optimizer,loss_fn,num_epochs,batch_size,True)
    results.append(crossValidate(mlp_trainer, X_train, y_train, kf))

In [None]:
from src.utils.plot_utils import plotCV

plotCV(results, configures)

### Others:

In [None]:
#Save the entire model:
#torch.save(model, "my_model.pickle")
#model = torch.load("my_model.pickle")

#Save only the weights: (recommended)
#torch.save(model.state_dict(), "my_model.pickle")
#model = nn.Sequential(...)
#model.load_state_dict(torch.load("my_model.pickle"))