In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from typing import Dict, List, Tuple

In [22]:
TRACK_DIR = "./data/tracks.csv"
ARTIST_DIR= "./data/artists.csv"
AUDIO_DIR = "./data/features.csv"

Đọc dữ liệu từ các file csv đã crawl

In [23]:
df_track_id   = pd.read_csv(TRACK_DIR, index_col=0)
df_feature_id = pd.read_csv(AUDIO_DIR, index_col=0)

df_track   = df_track_id.merge(df_feature_id, left_on="id", right_on="id")
df_artist  = pd.read_csv(ARTIST_DIR, index_col=0)

In [24]:
df_track = df_track.dropna(subset=["available_markets"])

Chia dữ liệu thành tập train và test

In [25]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_track, random_state=0, test_size=0.2)
df_test , df_dev = train_test_split(df_test, random_state=0 , test_size=0.5)

In [26]:
df_train.shape, df_dev.shape, df_test.shape

print("Train:", df_train.shape)
print("Dev:  ", df_dev.shape)
print("Test: ", df_test.shape)

Train: (42368, 23)
Dev:   (5296, 23)
Test:  (5296, 23)


### Linear Regression

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [28]:
def preprocess_baseline(df, add_area=True, areas = ["ME", "RS", "XK"], select_numeric = True):    
    a = df
    
    if add_area:
        df_area = df[["id", "available_markets"]].copy()
        for area in areas:
            df_area[f"in_{area}"] = df_area["available_markets"].apply(lambda x : area in x)

        a = df.merge(df_area, left_on="id", right_on="id")
    
    if select_numeric:
        return a.select_dtypes(["int64", "float64", "bool"])
    else:
        return a

### Không thêm dữ liệu available markets

In [29]:
df_train_preproc = preprocess_baseline(df_train, False)
df_dev_preproc = preprocess_baseline(df_dev,   False)
x_train, y_train = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]
x_dev,   y_dev   = df_dev_preproc.drop(columns="popularity")  , df_dev_preproc["popularity"]

In [30]:
model_baseline = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())])

model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred   = model_baseline.predict(x_dev.values)

print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  275.6330598388134 MAE 13.366453563236952
Dev: MSE  277.89169591526473 MAE 13.415547748546809


### Thêm dữ liệu available markets

##### Thêm 3 available markets ảnh hưởng nhất (có tương quan cao nhất với popularity), phân tích ở notebook analysis.ipynp

In [31]:
df_train_preproc = preprocess_baseline(df_train, True)
df_dev_preproc = preprocess_baseline(df_dev,   True)
x_train, y_train = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]
x_dev,   y_dev   = df_dev_preproc.drop(columns="popularity")  , df_dev_preproc["popularity"]

In [32]:
model_baseline = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())])

model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred   = model_baseline.predict(x_dev.values)

print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  266.7581767778346 MAE 13.143501013661496
Dev: MSE  270.18054338871343 MAE 13.210574115800439


##### Thêm 5 available markets

In [33]:
df_train_preproc = preprocess_baseline(df_train, True, ["ME", "RS", "XK", "US", "JP"])
df_dev_preproc   = preprocess_baseline(df_dev,   True, ["ME", "RS", "XK", "US", "JP"])

x_train, y_train = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]
x_dev,   y_dev   = df_dev_preproc.drop(columns="popularity")  , df_dev_preproc["popularity"]

In [34]:
model_baseline = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())])

model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred   = model_baseline.predict(x_dev.values)

print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  264.32058364644774 MAE 13.090310808814577
Dev: MSE  268.4174594754088 MAE 13.20629684826219


### Thêm giá trị followers
Thêm giá trị followers lớn nhất của các artist trong bài hát

In [35]:
def add_max_followers(df, fill_na_value=0):
    b = df[["id", "artists"]].copy()
    b["artists"] = b["artists"].apply(lambda x : x.split(","))
    
    #Tạo ra các dòng gồm 1 artist id
    b = b.explode("artists")
    
    #join hai bảng dựa vào artists id
    df_track_follower = b.merge(df_artist[["id", "followers"]], left_on="artists", right_on="id", how="left")
    #Bỏ cột thừa sau khi merge
    df_track_follower = df_track_follower.drop(columns="id_y")
    df_track_follower.columns = ["id_track", "id_artists", "followers"]
    
    #group theo id track là lấy max của followers các dòng cùng id track
    track_follower_max  = df_track_follower.groupby("id_track")["followers"].max()
    #Các artist không có followers thay bằng giá trị
    track_follower_max  = track_follower_max.fillna(fill_na_value)
    
    df_tmp = df.merge(track_follower_max, left_on="id", right_index=True)
    return df_tmp

In [36]:
follower_median = df_artist.followers.median()

df_train_preproc = preprocess_baseline(df_train, True, ["ME", "RS", "XK", "US", "JP"], False)
df_dev_preproc   = preprocess_baseline(df_dev,   True, ["ME", "RS", "XK", "US", "JP"], False)

df_train_preproc = add_max_followers(df_train_preproc, follower_median).select_dtypes(["int64", "float64", "bool"])
df_dev_preproc   = add_max_followers(df_dev_preproc, follower_median).select_dtypes(["int64", "float64", "bool"])

x_train, y_train = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]
x_dev,   y_dev   = df_dev_preproc.drop(columns="popularity")  , df_dev_preproc["popularity"]

In [37]:
model_baseline = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())])
model_baseline.fit(x_train.values, y_train.values)

y_train_pred = model_baseline.predict(x_train.values)
y_dev_pred   = model_baseline.predict(x_dev.values)

print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  241.41868288815206 MAE 12.438236679375619
Dev: MSE  247.9952107395564 MAE 12.60718226266387


### Neural network

In [38]:
from sklearn.neural_network import MLPRegressor

In [39]:
follower_median = df_artist.followers.median()

df_train_preproc = preprocess_baseline(df_train, True, ["ME", "RS", "XK", "US", "JP"], False)
df_dev_preproc   = preprocess_baseline(df_dev,   True, ["ME", "RS", "XK", "US", "JP"], False)

df_train_preproc = add_max_followers(df_train_preproc, follower_median).select_dtypes(["int64", "float64", "bool"])
df_dev_preproc   = add_max_followers(df_dev_preproc, follower_median).select_dtypes(["int64", "float64", "bool"])

x_train, y_train = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]
x_dev,   y_dev   = df_dev_preproc.drop(columns="popularity")  , df_dev_preproc["popularity"]

In [40]:
model_nn = Pipeline([('scaler', StandardScaler()), ('nn', MLPRegressor(learning_rate_init=0.01))])
model_nn.fit(x_train.values, y_train.values)

y_train_pred = model_nn.predict(x_train.values)
y_dev_pred   = model_nn.predict(x_dev.values)

print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  153.35221544969014 MAE 9.634180144202636
Dev: MSE  168.8989156666564 MAE 10.163242807888718




### Random Forest

In [41]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
follower_median = df_artist.followers.median()

df_train_preproc = preprocess_baseline(df_train, True, ["ME", "RS", "XK", "US", "JP"], False)
df_dev_preproc   = preprocess_baseline(df_dev,   True, ["ME", "RS", "XK", "US", "JP"], False)

df_train_preproc = add_max_followers(df_train_preproc, follower_median).select_dtypes(["int64", "float64", "bool"])
df_dev_preproc   = add_max_followers(df_dev_preproc, follower_median).select_dtypes(["int64", "float64", "bool"])

x_train, y_train = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]
x_dev,   y_dev   = df_dev_preproc.drop(columns="popularity")  , df_dev_preproc["popularity"]

In [43]:
model_rf = Pipeline([('scaler', StandardScaler()), ('nn', RandomForestRegressor())])
model_rf.fit(x_train.values, y_train.values)

y_train_pred = model_rf.predict(x_train.values)
y_dev_pred   = model_rf.predict(x_dev.values)

print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Dev: MSE ", mean_squared_error(y_dev.values, y_dev_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))



Train: MSE  30.247423877326764 MAE 3.941899861530715
Dev: MSE  162.8333668743706 MAE 9.798257804632428


### Tổng hợp kết quả của các mô hình

|Model|train_MSE|train_MAE|dev_MSE|dev_MAE|
|---|---|---|---|---|
|Linear Regression|275.6331|13.3665|277.8917|13.4155|
|Linear Regression + add_area3|266.7582|13.1435|270.1805|13.2106|
|Linear Regression + add_area5|264.3206|13.0903|268.4175|13.2063|
|Linear Regression + add_area5 + max_followers|241.4187|12.4382|247.9952|12.6072|
|Neural Network + add_area5 + max_followers|155.4821|9.6797|169.1587|10.1476|
|Random Forest + add_area5 + max_followers|21.4945|3.5134|147.8407| 9.3715|


Như vậy có thể thấy, mô hình Random Forest + tiền xử lý cho kết quả tốt nhất

### Huấn luyện mô hình với toàn bộ tập train (Dev + Train)

In [44]:
df_train_all = pd.concat([df_train, df_dev], axis=0)
df_train_all.shape

(47664, 23)

In [45]:
follower_median = df_artist.followers.median()

df_train_preproc = preprocess_baseline(df_train_all, True, ["ME", "RS", "XK", "US", "JP"], False)
df_train_preproc = add_max_followers(df_train_all, follower_median).select_dtypes(["int64", "float64", "bool"])

x_train, y_train = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]

In [46]:
model_rf = Pipeline([('scaler', StandardScaler()), ('nn', RandomForestRegressor())])
model_rf.fit(x_train.values, y_train.values)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('nn',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=10, n_jobs=None,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False))],
         verbose=False)

### Save model

In [47]:
import joblib
joblib.dump(model_rf, "final_model.joblib")

['final_model.joblib']

### Chạy trên tập test

In [48]:
final_model = joblib.load("final_model.joblib")

In [49]:
follower_median = df_artist.followers.median()

df_train_preproc = preprocess_baseline(df_test, True, ["ME", "RS", "XK", "US", "JP"], False)
df_train_preproc = add_max_followers(df_test, follower_median).select_dtypes(["int64", "float64", "bool"])

x_test, y_test = df_train_preproc.drop(columns="popularity"), df_train_preproc["popularity"]

In [50]:
y_train_pred = final_model.predict(x_train.values)
y_test_pred   = final_model.predict(x_test.values)

print("Train: MSE ", mean_squared_error(y_train.values, y_train_pred) , "MAE", mean_absolute_error(y_train.values, y_train_pred))
print("Test: MSE ", mean_squared_error(y_test.values, y_test_pred) , "MAE", mean_absolute_error(y_dev.values, y_dev_pred))

Train: MSE  33.827692060216975 MAE 4.133186941527198
Test: MSE  180.11628560253405 MAE 9.798257804632428
