In [1]:
import json
import pandas as pd
import numpy as np  
import os
import random

from metric import r_precision, dcg_at_k, ndcg_at_k
from utils import generate_playlists, generate_testcases

from numpy import linalg as LA

import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns
import collections

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
path = '/content/drive/MyDrive/DO_AN_KHDL/DATA'
path_audio_feats = os.path.join(path, 'audio_features.tsv')
path_playlists = os.path.join(path, 'playlists.tsv')
path_tracks = os.path.join(path, 'tracks.tsv')

In [3]:
pd_audio_feats = pd.read_csv(path_audio_feats, sep = '\t')
pd_tracks = pd.read_csv(path_tracks, sep = '\t')
pd_playlists = pd.read_csv(path_playlists, sep = '\t')
pd_playlists = pd_playlists.fillna('None')

In [4]:
pd_tracks = pd.merge(pd_tracks, pd_audio_feats, left_on = 'track_id', right_on = 'id')
pd_full = pd.merge(pd_tracks, pd_playlists, left_on = 'playlist_id', right_on = 'playlist_id')

pd_tracks = pd_tracks.drop('id', 1)
pd_full = pd_full.drop('playlist_name_y', 1)
pd_full = pd_full.drop('id', 1)

In [5]:
pd_full.shape

(86400, 25)

In [6]:
#Thông tin rỗng là discription, ta có thể chỉ điền vào "None"
pd_full = pd_full.fillna('None')

In [7]:
#Kiểm tra và bỏ giá trị trùng lặp
pd_full = pd_full.drop_duplicates()

In [8]:
pd_full.shape

(86350, 25)

## Baseline model

### Sử dụng feature 'Danceability'

**Ý tưởng tiếp cận**: phương pháp cơ bản đầu tiên đó là ...

Biến nhóm em lựa chọn ở đây là 'Danceability', vì ...


In [40]:
def baseline(tracks, df, n_pred):
    '''
    Input:
    1. tracks = Takes a list of track titles as input (strings)
    2. df = Dataframe containing all track info
    Output:
    recs = Recommends (n_pred) track uri based on info of that string (List of strings)
    '''
    recs_id = []
    for t in tracks:
        danceability = df[df['track_id'] == t]['danceability']
        df['rec_df_dance'] = np.abs(df['danceability'] - danceability.values[0])
        one_recs_id = list(df.sort_values(by=['rec_df_dance'])['track_id'][1:].values)
        recs_id.extend([track for track in one_recs_id if track not in tracks][0:n_pred])

    preds = collections.Counter(recs_id).most_common(n_pred)
    pred_titles = [p[0] for p in preds]

    tracks_name = []
    for id_track in pred_titles:
      tracks_name.extend(list(np.unique(df[df['track_id'] == id_track]['track_name'].values)))

    return pred_titles, tracks_name

In [41]:
fraction = 3
track_id_all = generate_playlists(pd_tracks, pd_playlists)
track_id_test = generate_testcases(track_id_all, fraction = fraction)

In [42]:
all_playlistID_for_tests = list(track_id_test.keys())

In [43]:
pd_full_tests = pd_full.copy()
pd_full_tests = pd_full_tests[pd_full_tests['playlist_id'].isin(all_playlistID_for_tests)]

**Bây giờ ta sẽ thử test phương pháp này trên playlist đầu tiên của tập test**

In [44]:
playlists_to_test = all_playlistID_for_tests[0]

In [46]:
all_name_songs = list(pd_full[pd_full['track_id'].isin(track_id_all[playlists_to_test])]['track_name'].values)
given_songs_name = list(pd_full[pd_full['track_id'].isin(track_id_test[playlists_to_test])]['track_name'].values)

In [47]:
track_id_result, track_name_result = baseline(track_id_test[playlists_to_test], pd_full_tests, len(track_id_all[playlists_to_test])-len(track_id_test[playlists_to_test]))

In [48]:
print('Songs given: ', given_songs_name, '\n')
print('Songs recommended: ', track_name_result, '\n')

Songs given:  ['Raggarbil', 'Save My Life (feat. Lovespeake)', 'Spa', 'Save My Life (feat. Lovespeake)', 'Spa', 'The Chase', 'Tomorrow (feat. 433)', 'Save My Life (feat. Lovespeake)', 'Kraken', 'Tomorrow (feat. 433)', 'Bolt', 'Save My Life (feat. Lovespeake)', 'Spa', 'Bolt', 'Seven Nation Army', 'The Chase', 'Tomorrow (feat. 433)', 'Work That Body', 'Raggarbil', 'Kraken', 'Violin', 'Untz - Zookeepers Remix', 'Who', 'Dum', 'Time', 'What Was in That Glass - Horror Edit', 'Rave i mitt garage', 'Sun on the Ocean', 'Genom eld & vatten', 'Seven Nation Army', 'Seven Nation Army', 'Work That Body'] 

Songs recommended:  ['Adore You', 'No Sleep', 'Finesse - Remix; feat. Cardi B', 'Cool Jerk', 'Highroad', 'Pressure', 'everything i wanted', "Don't Play Fair", "Stayin' Alive - 2007 Remastered Version Saturday Night Fever", 'The Facts In The Case Of M Valdemar', 'Problems', 'Mind Frame Feat. Freddie Foxxx - Instrumental', 'Heureux qui comme Ulysse', "It's Raining Men", 'Kiss Kiss', "Howlin' for You

**Kiểm tra độ chính xác trên testcase đầu tiên**

In [49]:
track_name_result.extend(given_songs_name)

In [50]:
r_score = r_precision(track_name_result, all_name_songs)
print(f'r_score: {r_score}')

r_score: 0.35185185185185186


**Bây giờ ta sẽ sử dụng tất cả 100 playlists trong bộ test để đánh giá**

In [54]:
def test(pd_full, track_id_all, track_id_test, all_playlistID_for_tests, fraction = 5):
  r_score = []

  for each_playlist in all_playlistID_for_tests:
    track_id_result, track_name_result = baseline(track_id_test[each_playlist], pd_full, len(track_id_all[each_playlist])-len(track_id_test[each_playlist]))
    
    all_name_songs = list(pd_full[pd_full['track_id'].isin(track_id_all[each_playlist])]['track_name'].values)
    given_songs_name = list(pd_full[pd_full['track_id'].isin(track_id_test[each_playlist])]['track_name'].values) 

    track_name_result.extend(given_songs_name)
    r_score.append(r_precision(track_name_result, all_name_songs))

  print("Average R precision: " + str(np.mean(r_score)))
  print("Max R precision: " + str(np.max(r_score)))

In [55]:
test(pd_full_tests, track_id_all, track_id_test, all_playlistID_for_tests, fraction)

Average R precision: 0.3383784575359145
Max R precision: 0.4482758620689655


### Sử dụng multi features thay vì một feature.

Ở đây nhóm em quyết định sử dụng các features sau:, vẽ hình phân bố, giải thích domain knowledge.

In [62]:
features = ['loudness', 'acousticness', 'speechiness', 'energy', 'danceability']

In [63]:
def multi_feats_baseline(tracks, df, n_pred, feats):
    recs_id = []
    for t in tracks:
      sum_feat_values = 0
      for feat in feats:
        feat_values = df[df['track_id'] == t][feat]
        sum_feat_values = sum_feat_values + np.abs(df[feat] - feat_values.values[0]) ** len(feats)
      
      df['rec_df_feats'] = sum_feat_values ** (1 / len(feats))
      one_recs_id = list(df.sort_values(by=['rec_df_feats'])['track_id'][1:].values)
      recs_id.extend([track for track in one_recs_id if track not in tracks][0:n_pred])

    preds = collections.Counter(recs_id).most_common(n_pred)
    pred_titles = [p[0] for p in preds]

    tracks_name = []
    for id_track in pred_titles:
      tracks_name.extend(list(np.unique(df[df['track_id'] == id_track]['track_name'].values)))

    return pred_titles, tracks_name

**Bây giờ ta sẽ thử test phương pháp này trên playlist đầu tiên của tập test**

In [64]:
playlists_to_test = all_playlistID_for_tests[0]

In [65]:
all_name_songs = list(pd_full[pd_full['track_id'].isin(track_id_all[playlists_to_test])]['track_name'].values)
given_songs_name = list(pd_full[pd_full['track_id'].isin(track_id_test[playlists_to_test])]['track_name'].values)

In [66]:
track_id_result, track_name_result = multi_feats_baseline(track_id_test[playlists_to_test], pd_full_tests, len(track_id_all[playlists_to_test])-len(track_id_test[playlists_to_test]), features)

In [67]:
print('Songs given: ', given_songs_name, '\n')
print('Songs recommended: ', track_name_result, '\n')

Songs given:  ['Raggarbil', 'Save My Life (feat. Lovespeake)', 'Spa', 'Save My Life (feat. Lovespeake)', 'Spa', 'The Chase', 'Tomorrow (feat. 433)', 'Save My Life (feat. Lovespeake)', 'Kraken', 'Tomorrow (feat. 433)', 'Bolt', 'Save My Life (feat. Lovespeake)', 'Spa', 'Bolt', 'Seven Nation Army', 'The Chase', 'Tomorrow (feat. 433)', 'Work That Body', 'Raggarbil', 'Kraken', 'Violin', 'Untz - Zookeepers Remix', 'Who', 'Dum', 'Time', 'What Was in That Glass - Horror Edit', 'Rave i mitt garage', 'Sun on the Ocean', 'Genom eld & vatten', 'Seven Nation Army', 'Seven Nation Army', 'Work That Body'] 

Songs recommended:  ['The Monster', 'Pour All Night', 'Insomnia - King Arthur Remix', "Hips Don't Lie (feat. Wyclef Jean)", 'Lowkey (feat. Lil Traffic)', "I Don't Care (with Justin Bieber) - Jonas Blue Remix", 'Higher Love', 'Way to the Future', 'Thriller', 'Check On It (feat. Slim Thug)', 'Feel This Moment (feat. Christina Aguilera)', 'Sova när vi dör', "You Can't Hurry Love", 'Lights - Single Ve

**Kiểm tra độ chính xác trên testcase đầu tiên**

In [68]:
track_name_result.extend(given_songs_name)

In [69]:
r_score = r_precision(track_name_result, all_name_songs)
print(f'r_score: {r_score}')

r_score: 0.35185185185185186


**Bây giờ ta sẽ sử dụng tất cả 100 playlists trong bộ test để đánh giá**

In [75]:
def test_multi_feats(pd_full, track_id_all, track_id_test, all_playlistID_for_tests, features, fraction = 5):
  r_score = []

  for each_playlist in all_playlistID_for_tests:
    track_id_result, track_name_result = multi_feats_baseline(track_id_test[each_playlist], pd_full, len(track_id_all[each_playlist])-len(track_id_test[each_playlist]),features)
    
    all_name_songs = list(pd_full[pd_full['track_id'].isin(track_id_all[each_playlist])]['track_name'].values)
    given_songs_name = list(pd_full[pd_full['track_id'].isin(track_id_test[each_playlist])]['track_name'].values) 

    track_name_result.extend(given_songs_name)
    r_score.append(r_precision(track_name_result, all_name_songs))

  print("Average R precision: " + str(np.mean(r_score)))
  print("Max R precision: " + str(np.max(r_score)))

In [76]:
test_multi_feats(pd_full_tests, track_id_all, track_id_test, all_playlistID_for_tests, features, fraction)

Average R precision: 0.35430653668386763
Max R precision: 0.49230769230769234


**Bây giờ ta sẽ cho nhiều giá trị fraction khác nhau để vẽ biểu đồ**

In [79]:
fraction = list(np.arange(1, 10))
fraction

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
for frac in fraction:
  

**Nhận xét về mỗi model với các fraction khác nhau, so sánh 2 models với các fractions khác nhau**