# Regressions

- Predict feature (including dissimilarity) of track $i+1$ given track $i$;
- Predict feature (including dissimilarity) of track $i+1$ given all tracks before $i+1$;


- Predict position of track $i$ given its features;
- Predict position of a sequence of tracks; 


- Predict first/last songs;


- Given all features of all tracks from album, predict feature of next track

# Algorithms

Regressions:
- Linear, splines, lstm, random forest, nnet

Classification:
- Lstm, random forest, nnet

# Data

z_score

In [3]:
setwd("/home/pasoneto/Documents/github/doc_suomi/code")
source("utils.R")
source("data_cook.R")
cat(howto_data)

`summarise()` regrouping output by 'album_id' (override with `.groups` argument)



How to use datasets
 
call    base()             for real values
call    z_scored()         for normalized
call    min_maxed()        for normalized2
call    upsampled_album()  for binded upsamplped albums
call    list_upsampled()   for list of upsampled albums
call    low_z()            for normalized low level
call    low_raw()          for raw  low level

# Train test split

In [2]:
low = low_raw() %>%
        group_by(album_id) %>%
        mutate(track_number = minmax(track_number))

low = split(low, low$album_id)

ltreino = floor(length(low)* 0.8)

treino = low[1:ltreino]
teste = low[(ltreino+1):length(low)]

paste("Split done correctly:", length(treino)+length(teste) == length(low))

treino = dplyr::bind_rows(treino)
teste  = dplyr::bind_rows(teste)

ERROR: Error: Problem with `mutate()` input `track_number`.
[31m✖[39m argument "min" is missing, with no default
[34mℹ[39m Input `track_number` is `minmax(track_number)`.
[34mℹ[39m The error occurred in group 1: album_id = "007bD7YMU5GUUNzNWKGfoV".


## Predict track_number based on features

In [None]:
model.linear = 
        lm(track_number~danceability   +  energy              +  loudness_overall       +  as.factor(key)     +
                        speechiness    +  acousticness        +  instrumentalness       +  as.factor(mode)    +
                        liveness       +  valence             +  tempo_overall          +  duration_ms        +
                        time_signature +  tempo_continuous    +  tempo_confidence       +  key_confidence     +
                        mode_confidence+  loudness_continuous +  time_signature_confidence, 
           data = treino)

In [None]:
pred = predict(model.linear, teste)
true = teste$track_number

paste("Model: "   , Metrics::rmse(pred, true), sep = "")
paste("Zerorule: ", Metrics::rmse(pred, mean(true)), sep = "")

plot(true[2000:2200], type='line', col = 'blue', xlim = c(0,  200), ylim = c(0, 1), ylab = "")
par(new = TRUE)
plot(pred[2000:2200], type='line', col = 'red',  xlim = c(0,  200), ylim = c(0, 1), ylab = "")
par(new = TRUE)
plot(rep(mean(true), length(true)), type='ls', col = 'black',  xlim = c(0,  200), ylim = c(0, 1), ylab = "")

## Random forest

In [None]:
model.rf =
    randomForest(formula = track_number~
                            danceability   +  energy              +  loudness_overall       +  #as.factor(key)     +
                            speechiness    +  acousticness        +  instrumentalness       +  #as.factor(mode)    +
                            liveness       +  valence             +  tempo_overall          +  duration_ms        +
                            time_signature +  tempo_continuous    +  tempo_confidence       +  key_confidence     +
                            mode_confidence+  loudness_continuous +  time_signature_confidence, 
                 data=treino, ntree = 1,
                 importance=TRUE)

In [None]:
pred = predict(model.rf, teste)
true = teste$track_number

paste("Model: "   , Metrics::rmse(pred, true), sep = "")
paste("Zerorule: ", Metrics::rmse(pred, mean(true)), sep = "")

plot(true[2000:2200], type='line', col = 'blue', xlim = c(0,  200), ylim = c(0, 16), ylab = "")
par(new = TRUE)
plot(round(pred[2000:2200]), type='line', col = 'red',  xlim = c(0,  200), ylim = c(0, 16), ylab = "")
par(new = TRUE)
plot(rep(mean(true), length(true)), type='ls', col = 'black',  xlim = c(0,  200), ylim = c(0, 16), ylab = "")

## Neural net

In [None]:
require(nnet)
treino %<>% 
    select(album_id, track_number, danceability, energy, valence, tempo_continuous, tempo_confidence,  key_confidence, loudness_continuous,  time_signature_confidence)

model.nnet = 
    nnet(track_number~., data = treino, size = 2, rang = 0.1, decay = 5e-4, maxit = 100)

In [None]:
teste %<>% 
    select(album_id, track_number, danceability, energy, valence, tempo_continuous, tempo_confidence,  key_confidence, loudness_continuous,  time_signature_confidence)

In [None]:
pred = predict(model.nnet, teste)
true = teste$track_number
paste("Model: "   , Metrics::rmse(pred, true), sep = "")
paste("Zerorule: ", Metrics::rmse(pred, mean(true)), sep = "")

plot(true[0:1000], type='line', col = 'blue', ylim = c(0, 16), ylab = "")
par(new = TRUE)
plot(round(pred[0:1000]), type='line', col = 'red', ylim = c(0, 16), ylab = "")
par(new = TRUE)
plot(rep(mean(true), length(true)), type='ls', col = 'black', ylim = c(0, 16), ylab = "")