In [123]:
import pandas as pd
import numpy as np
import datetime
from category_encoders import TargetEncoder

# Load Data

In [116]:
# Read the merged Spotify track dataset (path is relative to the notebook's directory).
df = pd.read_csv('../spotify-datasets/final_spotify_data.csv')
# Take a real copy: plain assignment would only alias `df`, so the in-place
# re-encoding of `release_date` further down would also overwrite the raw frame.
df_raw = df.copy()

In [5]:
# Show the column labels of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'track_id', 'track_name', 'popularity', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness',
       'loudness', 'speechiness', 'tempo', 'valence',
       'total_available_markets', 'release_date', 'topartist_id',
       'topartist_name', 'artist_names', 'key', 'mode', 'time_signature',
       'artist_genre', 'artist_genre_list'],
      dtype='object')

In [122]:
# Count distinct artist_genre values; dropna=False keeps a missing value in the
# tally, matching what len(Series.unique()) reports.
df['artist_genre'].nunique(dropna=False)

5067

In [54]:
# Count missing release_date entries before the column is encoded.
df['release_date'].isna().sum()

0

### Non-numerical columns and their preprocessing

| Name | Description | Preprocessing |
| :---:| :---------- | :------------ |
| Unnamed: 0 | index of each row | dropped |
| track_id | unique encoding of track | dropped |
| track_name | name of the track | dropped (sentiment analysis?) |
| release_date | date track was released | encode as number of days before 2022-12-31. If only the year is available, impute the date as year-7-2; if only the year and month are available, impute it as year-month-15 |
| topartist_id | unique encoding of artist | target encoding |
| topartist_name | name of top artist of track | drop (overlap with topartist_id)|
| artist_names | name of all artists of track | drop (overlap with topartist_id)|
| artist_genre | genre of the artist, with 5067 unique values | target encoding |
| artist_genre_list | list of artist's genre | drop | 

In [111]:
# Columns removed outright: identifiers, free-text names, and fields that
# overlap with topartist_id / artist_genre (each label listed once).
features_drop = [
    'Unnamed: 0',
    'track_id',
    'track_name',
    'topartist_name',
    'artist_names',
    'artist_genre_list',
]
# High-cardinality categorical columns that are target-encoded later.
features_te = ['topartist_id', 'artist_genre']

### Encode release date

In [114]:
def date2days(date, reference=datetime.datetime(2022, 12, 31)):
    """Convert a release-date string into the number of days before ``reference``.

    Accepted formats are ``YYYY``, ``YYYY-MM`` and ``YYYY-MM-DD``. Partial
    dates are completed with a mid-period value: a bare year becomes
    ``YYYY-7-2`` and a year-month becomes ``YYYY-MM-15``.

    Parameters
    ----------
    date : str or None
        Release date. ``None``, float ``NaN``, an empty string and the
        placeholder ``'0000'`` are all treated as "unknown".
    reference : datetime.datetime, optional
        Date the offset is measured from. Defaults to 2022-12-31.

    Returns
    -------
    int or None
        ``(reference - date).days`` (negative when ``date`` lies after
        ``reference``), or ``None`` when the date is unknown.

    Raises
    ------
    ValueError
        If the string does not split into at most three ``-``-separated
        integer fields forming a valid calendar date.
    """
    # Missing values show up as None or float NaN; NaN is the only float
    # that is not equal to itself.
    if date is None or (isinstance(date, float) and date != date):
        return None

    date = str(date).strip()
    if date == '' or date == '0000':
        return None

    # Split once and pad the missing fields instead of re-splitting the string.
    parts = date.split('-')
    if len(parts) == 1:
        parts += ['7', '2']   # year only -> middle of the year
    elif len(parts) == 2:
        parts += ['15']       # year-month -> middle of the month

    year, month, day = parts
    released = datetime.datetime(int(year), int(month), int(day))
    return (reference - released).days

In [117]:
# Run every raw release_date value through date2days, then write the result
# back into the same column of df.
release_date_days = df['release_date'].apply(date2days)
df['release_date'] = release_date_days

### Drop features_drop

In [118]:
# Remove the columns listed in features_drop; `columns=` is the keyword
# spelling of drop(..., axis=1).
df = df.drop(columns=features_drop)

In [133]:
# Check which columns remain after dropping features_drop.
df.columns

Index(['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'total_available_markets', 'release_date', 'topartist_id',
       'key', 'mode', 'time_signature', 'artist_genre'],
      dtype='object')

### Target Encoding

In [135]:
# Fit the target encoder on the columns named in features_te, using popularity
# as the target; all other columns are passed through unchanged.
# NOTE(review): the encoder is fitted on the full frame, so the encoded values
# carry target information for every row - confirm this is acceptable if a
# train/test split happens downstream.
te = TargetEncoder(cols=features_te).fit(df, df['popularity'])
# Keep the whole transformed frame. Chaining .head() onto this assignment would
# cut df down to 5 rows, and only those 5 rows would reach the CSV export.
df = te.transform(df)
# Preview only; df itself is left intact.
df.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,total_available_markets,release_date,topartist_id,key,mode,time_signature,artist_genre
0,100,0.721,0.585,242014.0,0.436,1.3e-05,0.105,-8.761,0.0601,143.874,0.132,1.0,722.0,48.611201,10.0,1.0,4.0,54.474876
1,96,0.0212,0.68,215627.0,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,0.644,184.0,1016.0,59.817724,0.0,1.0,4.0,46.6783
2,96,0.00146,0.514,200040.0,0.73,9.5e-05,0.0897,-5.934,0.0598,171.005,0.334,184.0,1016.0,59.817724,1.0,1.0,4.0,46.6783
3,95,0.221,0.7,140526.0,0.722,0.0,0.272,-3.558,0.0369,90.989,0.756,0.0,890.0,47.025026,7.0,0.0,4.0,35.031905
4,94,0.213,0.662,161385.0,0.413,0.0,0.134,-7.357,0.0299,93.005,0.467,0.0,785.0,58.978145,0.0,1.0,4.0,50.576518


In [136]:
# Write the processed frame to disk (path is relative to the notebook's directory).
# NOTE(review): `index` is not passed, so pandas' default applies and the row
# index is written as an extra leading column; consider index=False once
# downstream readers of this file have been checked.
df.to_csv('../spotify-datasets/final_spotify_data_cleaned.csv')