# Dataset Spotify

Se importa el dataset con la información de las canciones

In [6]:
import pandas as pd
import numpy as np

# Load the songs dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="data.csv", index_col=0)
# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1,Mask Off,Future
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,1,Redbone,Childish Gambino
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,1,Xanny Family,Future
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,1,Master Of None,Beach House
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,1,Parallel Lines,Junior Boys


## Selección de los datos
Del dataset original, se seleccionan los datos numéricos para analizarlos con la red neuronal.

In [7]:
# X holds the per-song feature rows used as predictors: every column except
# the two text columns and the label, converted to a list of lists.
feature_frame = df.drop(columns=['song_title', 'artist', 'target'])
X = feature_frame.to_numpy().tolist()

# y holds the value to predict (the 'target' column), as a plain list
# aligned row-by-row with X.
y = df['target'].to_numpy().tolist()

## División de datos
A continuación se presenta la función encargada de dividir los datos para entrenamiento (70%) y pruebas (30%).

In [16]:
import random

def train_test_split(X, y, test_size=0.3, random_state=None):
    """Randomly split paired samples and labels into train and test subsets.

    Rows are assigned to the training subset by drawing a random sample of
    indices without replacement; all remaining rows go to the test subset.
    Within each subset the original relative order of the rows is kept, and
    every sample stays paired with its own label.

    Args:
        X: Sequence of samples (supports ``len`` and iteration).
        y: Sequence of labels, same length as ``X``.
        test_size: Fraction of the rows reserved for testing, in ``[0, 1]``.
            The training size is ``round(len(X) * (1 - test_size))``.
        random_state: Optional seed. When given, a private
            ``random.Random(random_state)`` generator is used so the split is
            reproducible; when ``None`` the module-level ``random`` generator
            is used.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of lists.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``test_size`` is
            outside ``[0, 1]``.
    """
    total = len(X)
    if total != len(y):
        raise ValueError(
            f"X and y must have the same length (got {total} and {len(y)})"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1 (got {test_size})")

    # Number of rows assigned to the training subset.
    size_train = round(total * (1 - test_size))

    rng = random if random_state is None else random.Random(random_state)
    # A set gives O(1) membership tests; a list here would make the loop
    # below quadratic in the number of rows.
    train_indices = set(rng.sample(range(total), size_train))

    X_train, X_test, y_train, y_test = [], [], [], []
    for index, (sample, label) in enumerate(zip(X, y)):
        if index in train_indices:
            X_train.append(sample)
            y_train.append(label)
        else:
            X_test.append(sample)
            y_test.append(label)

    return (X_train, X_test, y_train, y_test)

### Formato de salida
Se realiza una prueba para comprobar que los datos estén siendo asignados correctamente.

In [25]:
# Split the data: 70% for training, 30% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Print the first five label/feature pairs of each subset as a sanity check
# that samples and labels stay aligned after the split.
for title, labels, rows in (
    ('Datos de prueba', y_test, X_test),
    ('Datos de entrenamiento', y_train, X_train),
):
    print(title)
    for i in range(5):
        print(f'y: {labels[i]}\t X: {rows[i]}')

Datos de prueba
y: 1	 X: [0.0344, 0.838, 185707.0, 0.412, 0.000234, 2.0, 0.159, -7.148, 1.0, 0.289, 75.044, 4.0, 0.173]
y: 1	 X: [0.00208, 0.836, 226840.0, 0.603, 0.0, 7.0, 0.571, -7.792, 1.0, 0.237, 99.994, 4.0, 0.386]
y: 1	 X: [0.0572, 0.525, 358187.0, 0.855, 0.0143, 5.0, 0.649, -7.372, 0.0, 0.0548, 111.951, 3.0, 0.524]
y: 1	 X: [0.0239, 0.603, 270827.0, 0.955, 0.0451, 1.0, 0.119, -4.111, 1.0, 0.0458, 123.922, 4.0, 0.773]
y: 1	 X: [0.314, 0.713, 195429.0, 0.611, 0.0, 1.0, 0.117, -6.702, 0.0, 0.241, 140.061, 4.0, 0.783]
Datos de entrenamiento
y: 1	 X: [0.0102, 0.833, 204600.0, 0.434, 0.0219, 2.0, 0.165, -8.795, 1.0, 0.431, 150.062, 4.0, 0.286]
y: 1	 X: [0.199, 0.743, 326933.0, 0.359, 0.00611, 1.0, 0.137, -10.401, 1.0, 0.0794, 160.083, 4.0, 0.588]
y: 1	 X: [0.604, 0.494, 199413.0, 0.338, 0.51, 5.0, 0.0922, -15.236, 1.0, 0.0261, 86.468, 4.0, 0.23]
y: 1	 X: [0.18, 0.678, 392893.0, 0.561, 0.512, 5.0, 0.439, -11.648, 0.0, 0.0694, 174.004, 4.0, 0.904]
y: 1	 X: [0.00479, 0.804, 251333.0, 0.5