In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from typing import Dict, List, Tuple

In [36]:
# Locations of the crawled CSV exports (these are file paths, despite the *_DIR names).
_DATA_ROOT = "./data"
TRACK_DIR = f"{_DATA_ROOT}/tracks.csv"
ARTIST_DIR = f"{_DATA_ROOT}/artists.csv"
AUDIO_DIR = f"{_DATA_ROOT}/features.csv"

Đọc dữ liệu từ các file csv đã crawl

In [37]:
# Raw tables; the first CSV column is used as the row index in each file.
df_track_id = pd.read_csv(TRACK_DIR, index_col=0)
df_feature_id = pd.read_csv(AUDIO_DIR, index_col=0)

# Inner-join track rows with their audio-feature rows on the shared "id" column.
df_track = pd.merge(df_track_id, df_feature_id, on="id")

df_artist = pd.read_csv(ARTIST_DIR, index_col=0)

In [38]:
# Keep only tracks with a non-null "available_markets" value; the market
# indicator features built later scan this column as text.
df_track = df_track[df_track["available_markets"].notna()]

Chia dữ liệu thành ba tập train, dev và test (tỉ lệ 80% / 10% / 10%)

In [39]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the tracks, then halve the hold-out into test and dev.
df_train, df_holdout = train_test_split(df_track, test_size=0.2, random_state=0)
df_test, df_dev = train_test_split(df_holdout, test_size=0.5, random_state=0)

In [40]:
# Report the (rows, columns) shape of every split.
# The bare tuple expression that used to open this cell was never displayed
# (only a cell's last expression is echoed), so it has been dropped.
print("Train:", df_train.shape)
print("Dev:  ", df_dev.shape)
print("Test: ", df_test.shape)

Train: (42368, 23)
Dev:   (5296, 23)
Test:  (5296, 23)


### Linear Regression

In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [144]:
def preprocess_baseline(df, add_area=True, areas=("ME", "RS", "XK"), select_numeric=True):
    """Build the baseline feature frame for the popularity models.

    Parameters
    ----------
    df : pandas.DataFrame
        Track table. Must contain an ``available_markets`` column of strings
        when ``add_area`` is true.
    add_area : bool, default True
        When true, add one boolean column ``in_<code>`` per entry of ``areas``
        telling whether that code occurs in the track's ``available_markets``.
    areas : iterable of str, default ("ME", "RS", "XK")
        Market codes to turn into indicator columns.
    select_numeric : bool, default True
        When true, return only the ``int64``, ``float64`` and ``bool`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df``; the input is not modified.

    Notes
    -----
    The indicator columns are attached directly to a copy of ``df`` instead of
    being merged back on ``id``. A self-merge on ``id`` multiplies rows whenever
    an id occurs more than once and leaves ``available_markets_x`` /
    ``available_markets_y`` columns behind.
    """
    features = df

    if add_area:
        # Work on a copy so the caller's frame never gains the extra columns.
        features = df.copy()
        markets = features["available_markets"]
        for area in areas:
            # Plain substring test against the raw market string of each track.
            features[f"in_{area}"] = markets.apply(lambda codes, area=area: area in codes)

    if select_numeric:
        return features.select_dtypes(["int64", "float64", "bool"])
    return features

### Không thêm dữ liệu available markets

In [145]:
# Baseline feature set: numeric columns only, without market indicator flags.
df_train_preproc = preprocess_baseline(df_train, add_area=False)
df_dev_preproc = preprocess_baseline(df_dev, add_area=False)

# Separate the regression target ("popularity") from the feature matrix.
y_train = df_train_preproc["popularity"]
x_train = df_train_preproc.drop(columns="popularity")
y_dev = df_dev_preproc["popularity"]
x_dev = df_dev_preproc.drop(columns="popularity")

In [146]:
# Standardize the features, then fit ordinary least squares on the training split.
model_baseline = Pipeline(steps=[("scaler", StandardScaler()), ("lr", LinearRegression())])
model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred = model_baseline.predict(x_dev.values)

# Print MSE and MAE for each split.
for split_label, y_true, y_hat in (
    ("Train: MSE ", y_train.values, y_train_pred),
    ("Dev: MSE ", y_dev.values, y_dev_pred),
):
    print(split_label, mean_squared_error(y_true, y_hat), "MAE", mean_absolute_error(y_true, y_hat))

Train: MSE  275.6330598388134 MAE 13.366453563236952
Dev: MSE  277.89169591526473 MAE 13.415547748546809


### Thêm dữ liệu available markets

##### Thêm 3 available markets ảnh hưởng nhất

In [147]:
# Numeric features plus indicator flags for the function's default market codes.
df_train_preproc = preprocess_baseline(df_train, add_area=True)
df_dev_preproc = preprocess_baseline(df_dev, add_area=True)

# Separate the regression target ("popularity") from the feature matrix.
y_train = df_train_preproc["popularity"]
x_train = df_train_preproc.drop(columns="popularity")
y_dev = df_dev_preproc["popularity"]
x_dev = df_dev_preproc.drop(columns="popularity")

In [148]:
# Same scaler + linear regression pipeline, refit on the current feature set.
model_baseline = Pipeline(steps=[("scaler", StandardScaler()), ("lr", LinearRegression())])
model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred = model_baseline.predict(x_dev.values)

# Print MSE and MAE for each split.
for split_label, y_true, y_hat in (
    ("Train: MSE ", y_train.values, y_train_pred),
    ("Dev: MSE ", y_dev.values, y_dev_pred),
):
    print(split_label, mean_squared_error(y_true, y_hat), "MAE", mean_absolute_error(y_true, y_hat))

Train: MSE  266.7581767778346 MAE 13.143501013661496
Dev: MSE  270.18054338871343 MAE 13.210574115800439


##### Thêm 5 available markets

In [149]:
# Numeric features plus indicator flags for five market codes.
market_codes = ["ME", "RS", "XK", "US", "JP"]
df_train_preproc = preprocess_baseline(df_train, add_area=True, areas=market_codes)
df_dev_preproc = preprocess_baseline(df_dev, add_area=True, areas=market_codes)

# Separate the regression target ("popularity") from the feature matrix.
y_train = df_train_preproc["popularity"]
x_train = df_train_preproc.drop(columns="popularity")
y_dev = df_dev_preproc["popularity"]
x_dev = df_dev_preproc.drop(columns="popularity")

In [150]:
# Same scaler + linear regression pipeline, refit on the current feature set.
model_baseline = Pipeline(steps=[("scaler", StandardScaler()), ("lr", LinearRegression())])
model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred = model_baseline.predict(x_dev.values)

# Print MSE and MAE for each split.
for split_label, y_true, y_hat in (
    ("Train: MSE ", y_train.values, y_train_pred),
    ("Dev: MSE ", y_dev.values, y_dev_pred),
):
    print(split_label, mean_squared_error(y_true, y_hat), "MAE", mean_absolute_error(y_true, y_hat))

Train: MSE  264.32058364644774 MAE 13.090310808814577
Dev: MSE  268.4174594754088 MAE 13.20629684826219


### Thêm giá trị followers
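Thêm đặc trưng là số followers lớn nhất trong số các artist của mỗi bài hát; bài hát không tra được followers sẽ được điền bằng giá trị thay thế (median followers của toàn bộ artist).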

In [163]:
def add_max_followers(df, fill_na_value=0, artists_df=None):
    """Attach to each track the largest follower count among its artists.

    Parameters
    ----------
    df : pandas.DataFrame
        Track table with an ``id`` column and an ``artists`` column holding a
        comma-separated string of artist ids.
    fill_na_value : scalar, default 0
        Value used for tracks whose artists have no follower count (unknown
        artist id or missing ``followers``).
    artists_df : pandas.DataFrame, optional
        Artist table with ``id`` and ``followers`` columns. Defaults to the
        module-level ``df_artist`` so existing calls keep working; passing it
        explicitly removes the dependency on notebook-global state.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined (inner, on track ``id``) with a new ``followers`` column.
        The input frame is not modified.
    """
    if artists_df is None:
        artists_df = df_artist

    track_artist = df[["id", "artists"]].copy()
    # Split the comma-separated ids; strip stray whitespace so an id written
    # as "a, b" still matches the artist table instead of silently missing.
    track_artist["artists"] = track_artist["artists"].apply(
        lambda raw: [artist_id.strip() for artist_id in raw.split(",")]
    )

    # One row per (track, artist) pair.
    track_artist = track_artist.explode("artists")
    track_artist = track_artist.rename(columns={"id": "id_track", "artists": "id_artists"})

    # Left join keeps pairs whose artist is absent from the artist table
    # (their followers value is NaN).
    artist_followers = artists_df[["id", "followers"]].rename(columns={"id": "id_artists"})
    df_track_follower = track_artist.merge(artist_followers, on="id_artists", how="left")

    # Per track, take the maximum followers over all of its artists.
    track_follower_max = df_track_follower.groupby("id_track")["followers"].max()
    # Tracks for which no artist had a follower count get the fallback value.
    track_follower_max = track_follower_max.fillna(fill_na_value)

    return df.merge(track_follower_max, left_on="id", right_index=True)

In [167]:
# Fallback follower count: the median over the artist table.
follower_median = df_artist["followers"].median()
market_codes = ["ME", "RS", "XK", "US", "JP"]
numeric_dtypes = ["int64", "float64", "bool"]

# Keep every column at first (select_numeric=False): add_max_followers needs "id" and "artists".
df_train_preproc = preprocess_baseline(df_train, True, market_codes, False)
df_dev_preproc = preprocess_baseline(df_dev, True, market_codes, False)

# Add the max-followers feature, then reduce to the numeric/boolean columns.
df_train_preproc = add_max_followers(df_train_preproc, follower_median)
df_train_preproc = df_train_preproc.select_dtypes(numeric_dtypes)
df_dev_preproc = add_max_followers(df_dev_preproc, follower_median)
df_dev_preproc = df_dev_preproc.select_dtypes(numeric_dtypes)

# Separate the regression target ("popularity") from the feature matrix.
y_train = df_train_preproc["popularity"]
x_train = df_train_preproc.drop(columns="popularity")
y_dev = df_dev_preproc["popularity"]
x_dev = df_dev_preproc.drop(columns="popularity")

In [168]:
# Same scaler + linear regression pipeline, refit on the current feature set.
model_baseline = Pipeline(steps=[("scaler", StandardScaler()), ("lr", LinearRegression())])
model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred = model_baseline.predict(x_dev.values)

# Print MSE and MAE for each split.
for split_label, y_true, y_hat in (
    ("Train: MSE ", y_train.values, y_train_pred),
    ("Dev: MSE ", y_dev.values, y_dev_pred),
):
    print(split_label, mean_squared_error(y_true, y_hat), "MAE", mean_absolute_error(y_true, y_hat))

Train: MSE  241.41868288815206 MAE 12.438236679375619
Dev: MSE  247.9952107395564 MAE 12.60718226266387


### Neural network

In [170]:
from sklearn.neural_network import MLPRegressor

In [171]:
# Fallback follower count: the median over the artist table.
follower_median = df_artist["followers"].median()
market_codes = ["ME", "RS", "XK", "US", "JP"]
numeric_dtypes = ["int64", "float64", "bool"]

# Keep every column at first (select_numeric=False): add_max_followers needs "id" and "artists".
df_train_preproc = preprocess_baseline(df_train, True, market_codes, False)
df_dev_preproc = preprocess_baseline(df_dev, True, market_codes, False)

# Add the max-followers feature, then reduce to the numeric/boolean columns.
df_train_preproc = add_max_followers(df_train_preproc, follower_median)
df_train_preproc = df_train_preproc.select_dtypes(numeric_dtypes)
df_dev_preproc = add_max_followers(df_dev_preproc, follower_median)
df_dev_preproc = df_dev_preproc.select_dtypes(numeric_dtypes)

# Separate the regression target ("popularity") from the feature matrix.
y_train = df_train_preproc["popularity"]
x_train = df_train_preproc.drop(columns="popularity")
y_dev = df_dev_preproc["popularity"]
x_dev = df_dev_preproc.drop(columns="popularity")

In [174]:
# Standardize the features, then fit a multi-layer perceptron regressor.
# random_state is pinned (same seed as the train/dev/test split) so that the
# weight initialisation and batch shuffling -- and therefore the reported
# metrics -- are reproducible across kernel restarts.
model_nn = Pipeline([
    ('scaler', StandardScaler()),
    ('nn', MLPRegressor(learning_rate_init=0.01, random_state=0)),
])
model_nn.fit(x_train.values, y_train.values)

y_train_pred = model_nn.predict(x_train.values)
y_dev_pred   = model_nn.predict(x_dev.values)

# Print MSE and MAE for each split.
print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  155.48206238678844 MAE 9.679703769966572
Dev: MSE  169.15873599587567 MAE 10.14760944919278


### Random Forest

In [176]:
from sklearn.ensemble import RandomForestRegressor

In [177]:
# Fallback follower count: the median over the artist table.
follower_median = df_artist["followers"].median()
market_codes = ["ME", "RS", "XK", "US", "JP"]
numeric_dtypes = ["int64", "float64", "bool"]

# Keep every column at first (select_numeric=False): add_max_followers needs "id" and "artists".
df_train_preproc = preprocess_baseline(df_train, True, market_codes, False)
df_dev_preproc = preprocess_baseline(df_dev, True, market_codes, False)

# Add the max-followers feature, then reduce to the numeric/boolean columns.
df_train_preproc = add_max_followers(df_train_preproc, follower_median)
df_train_preproc = df_train_preproc.select_dtypes(numeric_dtypes)
df_dev_preproc = add_max_followers(df_dev_preproc, follower_median)
df_dev_preproc = df_dev_preproc.select_dtypes(numeric_dtypes)

# Separate the regression target ("popularity") from the feature matrix.
y_train = df_train_preproc["popularity"]
x_train = df_train_preproc.drop(columns="popularity")
y_dev = df_dev_preproc["popularity"]
x_dev = df_dev_preproc.drop(columns="popularity")

In [184]:
# Standardize the features, then fit a random forest regressor.
# random_state is pinned (same seed as the train/dev/test split) so that the
# bootstrap samples and feature sub-sampling -- and therefore the reported
# metrics -- are reproducible across kernel restarts.
# NOTE(review): the estimator step is still keyed 'nn' although it holds a
# random forest; the key is left unchanged in case it is looked up by name elsewhere.
model_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('nn', RandomForestRegressor(random_state=0)),
])
model_rf.fit(x_train.values, y_train.values)

y_train_pred = model_rf.predict(x_train.values)
y_dev_pred   = model_rf.predict(x_dev.values)

# Print MSE and MAE for each split.
print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  21.49454585411147 MAE 3.5134049644011056
Dev: MSE  147.84066347009747 MAE 9.371500577704648


### Tổng hợp kết quả của các mô hình

|Model|train_MSE|train_MAE|dev_MSE|dev_MAE|
|---|---|---|---|---|
|Linear Regression|275.6331|13.3665|277.8917|13.4155|
|Linear Regression + add_area3|266.7582|13.1435|270.1805|13.2106|
|Linear Regression + add_area5|264.3206|13.0903|268.4175|13.2063|
|Linear Regression + add_area5 + max_followers|241.4187|12.4382|247.9952|12.6072|
|Neural Network + add_area5 + max_followers|155.4821|9.6797|169.1587|10.1476|
|Random Forest + add_area5 + max_followers|21.4945|3.5134|147.8407| 9.3715|
