In [1]:
import pandas as pd
import numpy as np
import datetime
from category_encoders import TargetEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

### Load Data

In [2]:
# Load the cleaned Spotify dataset from CSV; the trailing expression displays
# the frame's (rows, columns) so the reader can sanity-check the load.
df = pd.read_csv(
    filepath_or_buffer='../spotify-datasets/final_spotify_data_cleaned.csv',
)
df.shape

(676345, 18)

In [3]:
# Per-column count of missing values (isna is the canonical name for isnull).
df.isna().sum()

popularity                      0
acousticness                    0
danceability                    0
duration_ms                     0
energy                          0
instrumentalness                0
liveness                        0
loudness                        0
speechiness                     0
tempo                           0
valence                         0
total_available_markets         0
release_date                    0
topartist_id                    0
key                             0
mode                            0
time_signature                  0
artist_genre               164869
dtype: int64

### Splitting Data

In [4]:
# Separate the prediction target (popularity) from the predictor columns.
X = df.drop(columns='popularity')
y = df['popularity']

# 60/20/20 train/dev/test split: hold out 20% as test, then carve 25% of the
# remaining 80% off as dev (0.25 * 0.80 = 0.20 of the full data).
# random_state is pinned so that Restart Kernel -> Run All reproduces exactly
# the same partitions; without it every run reshuffles the rows, which also
# changes everything fitted on X_train further down the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

### Target Encoding

In [5]:
# Columns handed to the target encoder via its `cols` argument.
features_te = [
    'topartist_id',
    'artist_genre',
]

In [6]:
# Fit the target encoder on the training split only, so the dev and test
# splits never contribute to the learned encodings.
# handle_missing='return_nan' leaves missing categories as NaN in the output
# instead of substituting a value.
te = TargetEncoder(cols=features_te, handle_missing='return_nan')
te.fit(X_train, y_train)

# Apply the same fitted encoder to all three splits.
X_train, X_dev, X_test = (
    te.transform(split) for split in (X_train, X_dev, X_test)
)
X_train.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,total_available_markets,release_date,topartist_id,key,mode,time_signature,artist_genre
93445,0.00298,0.478,222840.0,0.963,0.0,0.165,-3.299,0.0643,164.641,0.899,98.0,6208.0,31.25985,11.0,1.0,4.0,29.704464
431267,0.122,0.696,152198.0,0.372,0.0,0.172,-10.236,0.956,77.558,0.413,0.0,673.0,0.029091,1.0,1.0,4.0,
514919,0.897,0.341,180506.0,0.212,0.00124,0.147,-12.131,0.0312,171.267,0.268,181.0,2962.0,26.211667,7.0,1.0,4.0,30.705394
36543,0.309,0.183,144639.0,0.93,0.886,0.236,-13.634,0.0498,73.562,1e-05,184.0,2054.0,30.431532,1.0,1.0,1.0,27.816
637427,0.476,0.584,256720.0,0.498,0.0989,0.121,-8.16,0.0397,81.979,0.257,0.0,3133.0,27.407863,6.0,0.0,4.0,35.419975


In [7]:
# Fit the KNN imputer on a 10% sample of the training split (presumably to
# keep the neighbour search affordable on the full data -- TODO confirm).
# random_state pins the sample so repeated runs impute identical values.
# NOTE(review): the features are not scaled before imputation, so columns with
# large magnitudes (e.g. duration_ms) will dominate the neighbour distances --
# consider standardising before imputing.
imputer = KNNImputer(n_neighbors=5).fit(
    X_train.sample(frac=0.1, random_state=42)
)

# imputer.transform() returns a plain array, so each frame is rebuilt here.
# The ORIGINAL row index is passed through explicitly: without it the frames
# get a fresh 0..n-1 index while y_train / y_dev / y_test keep the shuffled
# index from the split, and any index-aligned operation (concat, column
# assignment, .loc) would silently pair features with the wrong targets.
X_train = pd.DataFrame(
    imputer.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_dev = pd.DataFrame(
    imputer.transform(X_dev), columns=X_train.columns, index=X_dev.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=X_train.columns, index=X_test.index
)

In [8]:
# Confirm that imputation left no missing values in the training features.
X_train.isna().sum()

acousticness               0
danceability               0
duration_ms                0
energy                     0
instrumentalness           0
liveness                   0
loudness                   0
speechiness                0
tempo                      0
valence                    0
total_available_markets    0
release_date               0
topartist_id               0
key                        0
mode                       0
time_signature             0
artist_genre               0
dtype: int64

In [9]:
# Show the (rows, columns) of the train, dev and test feature frames on one line.
print(*(split.shape for split in (X_train, X_dev, X_test)))

(405807, 17) (135269, 17) (135269, 17)
