In [None]:
import json
import pandas as pd
import numpy as np  
import os
import random

from metric import r_precision, dcg_at_k, ndcg_at_k
from utils import generate_playlists, generate_testcases

from numpy import linalg as LA

import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns
import collections

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Root data folder (a mounted Google Drive directory) and the three TSV files read from it.
path = '/content/drive/MyDrive/DO_AN_KHDL/DATA'
path_audio_feats, path_playlists, path_tracks = (
    os.path.join(path, file_name)
    for file_name in ('audio_features.tsv', 'playlists.tsv', 'tracks.tsv')
)

In [None]:
# Load the three tab-separated tables. Only the playlist table has its missing
# values replaced by the string 'None' at load time.
pd_audio_feats, pd_tracks = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (path_audio_feats, path_tracks)
)
pd_playlists = pd.read_csv(path_playlists, sep='\t').fillna('None')

In [None]:
# Attach the audio features to every track row (default inner join, so tracks
# without a matching feature row are dropped).
pd_tracks = pd.merge(pd_tracks, pd_audio_feats, left_on = 'track_id', right_on = 'id')
# Attach the playlist metadata to every (track, playlist) row.
pd_full = pd.merge(pd_tracks, pd_playlists, left_on = 'playlist_id', right_on = 'playlist_id')

# After the join `id` always equals `track_id`, and `playlist_name_y` is the
# playlist-table copy of the playlist name, so both are redundant.
# `columns=` is used because passing the axis positionally (`drop('id', 1)`)
# is rejected by pandas >= 2.0.
pd_tracks = pd_tracks.drop(columns = 'id')
pd_full = pd_full.drop(columns = ['playlist_name_y', 'id'])

In [None]:
# (rows, columns) of the merged track/playlist table.
pd_full.shape

(86400, 25)

In [None]:
# The missing values are in the description field, so we simply fill them with the string "None".
# Note that fillna is applied to every column of the frame, not only to the description.
pd_full = pd_full.fillna('None')

In [None]:
# Check for duplicated rows and drop them.
pd_full = pd_full.drop_duplicates()

In [None]:
# (rows, columns) after the duplicate rows were removed.
pd_full.shape

(86350, 25)

## Content Filtering

In [None]:
# Value forwarded to generate_testcases as `fraction`; its exact meaning is defined in utils
# (presumably it controls how much of each playlist is revealed as seed tracks — TODO confirm).
fraction = 3
# Build the per-playlist track ids from the track and playlist tables, requesting 120 playlists.
# NOTE(review): if the helpers sample randomly, no seed is set in this notebook — confirm reproducibility.
track_id_all = generate_playlists(pd_tracks, pd_playlists, nums_playlists_to_test = 120)
# Derive the test cases from track_id_all; later cells index both objects by playlist id.
track_id_test = generate_testcases(track_id_all, fraction = fraction)

In [None]:
# Playlist ids that have a test case, in the order the keys are stored.
all_playlistID_for_tests = [*track_id_test.keys()]

In [None]:
# Keep only the rows that belong to the playlists selected for testing and renumber
# them 0..n-1, so that a row's label equals its position in the similarity matrix.
# drop=True discards the old row labels instead of inserting them as an `index`
# column: that column is not listed in omit_feats, so it would be scaled and fed
# into the cosine similarity as if it were an audio feature.
pd_full_tests = pd_full[pd_full['playlist_id'].isin(all_playlistID_for_tests)].reset_index(drop=True)
# Independent copy that the feature-selection step narrows down to the model inputs.
pd_full_tests_copy = pd_full_tests.copy()

In [None]:
# Identifier, text and playlist-level columns that must not enter the similarity computation.
omit_feats = ['track_id', 'track_name', 'playlist_id', 'playlist_name_x','artist_ids', 'artist_names', 'album_id', 'album_name', 'analysis_url', 'description', 'num_followers', 'num_tracks']
# `columns=` is used because passing the axis positionally (`drop(omit_feats, 1)`) is
# rejected by pandas >= 2.0. The frame is derived from pd_full_tests (of which
# pd_full_tests_copy is a plain copy) so that re-running this cell does not fail on
# columns that were already removed.
pd_full_tests_copy = pd_full_tests.drop(columns=omit_feats)

In [None]:
# (rows, columns) of the feature-only frame that is passed to the scaler.
pd_full_tests_copy.shape

(6943, 13)

### Modeling

In [None]:
# Rescale every feature column to the [0, 1] range (min-max normalisation)
scaler = MinMaxScaler()
df_full_scaled = scaler.fit_transform(pd_full_tests_copy)

# Pairwise cosine similarity between all rows of the scaled feature matrix,
# wrapped in a DataFrame so rows and columns can be addressed by position
df_full_cosine = pd.DataFrame(data=cosine_similarity(df_full_scaled))

In [None]:
# The similarity matrix is square: one row and one column per row of the scaled frame.
df_full_cosine.shape

(6943, 6943)

In [None]:
def generate_radio_content_filtering_cosineMetirc(tracks, cosine_df, info_df, n_pred = 10):
    """Recommend the tracks that are most cosine-similar to a set of seed tracks.

    For every seed, the ``n_pred`` most similar catalogue rows are collected.
    The titles and ids collected most often over all seeds are returned.

    Parameters
    ----------
    tracks : iterable
        Seed track ids, matched against ``info_df['track_id']``.
    cosine_df : pandas.DataFrame
        Square similarity matrix; row/column ``i`` must describe the ``i``-th
        row of ``info_df`` (rows are matched by position).
    info_df : pandas.DataFrame
        Catalogue with at least the columns ``'track_id'`` and ``'track_name'``.
    n_pred : int, default 10
        Number of neighbours taken per seed and maximum length of each result.

    Returns
    -------
    tuple of (list, list)
        The most frequent recommended titles and the most frequent recommended
        ids. The two lists are counted independently of each other, so they
        are not guaranteed to line up element by element.

    Notes
    -----
    Seeds that do not occur in ``info_df`` are skipped.
    """
    catalogue_ids = info_df['track_id'].to_numpy()
    catalogue_names = info_df['track_name'].to_numpy()

    recs = []
    recs_id = []
    for t in tracks:
        is_seed = (info_df['track_id'] == t).to_numpy()
        seed_rows = np.flatnonzero(is_seed)
        if seed_rows.size == 0:
            # Nothing to compare against for a seed outside the catalogue.
            continue

        # Work with positions throughout, so the result does not depend on info_df's index labels.
        similarities = cosine_df.iloc[:, seed_rows[0]].to_numpy()
        ranked = np.argsort(-similarities, kind='stable')
        # Drop every row of the seed track explicitly. The same track can occupy
        # several rows with identical features (similarity 1.0), so "skip the
        # top-ranked row" is not enough to keep the seed out of its own
        # recommendations.
        ranked = ranked[~is_seed[ranked]][:n_pred]

        recs.extend(catalogue_names[ranked])
        recs_id.extend(catalogue_ids[ranked])

    pred_titles = [title for title, _ in collections.Counter(recs).most_common(n_pred)]
    pred_titles_id = [track_id for track_id, _ in collections.Counter(recs_id).most_common(n_pred)]

    return pred_titles, pred_titles_id

**Bây giờ ta sẽ thử test phương pháp này trên playlist đầu tiên của tập test**

In [None]:
# Use the first playlist id of the test set for a single worked example.
playlists_to_test = all_playlistID_for_tests[0]

In [None]:
# Titles of every pd_full row whose track id belongs to the playlist, and of the
# rows whose track id is among the seed tracks (a title repeats if its track has several rows).
all_name_songs = pd_full.loc[pd_full['track_id'].isin(track_id_all[playlists_to_test]), 'track_name'].tolist()
given_songs_name = pd_full.loc[pd_full['track_id'].isin(track_id_test[playlists_to_test]), 'track_name'].tolist()

In [None]:
# `generate_radio` is not defined anywhere in this notebook; the recommender defined
# above is generate_radio_content_filtering_cosineMetirc.
# The number of requested recommendations equals the number of playlist tracks
# that were not given as seeds.
track_name_result, track_id_result = generate_radio_content_filtering_cosineMetirc(
    track_id_test[playlists_to_test],
    df_full_cosine,
    pd_full_tests,
    len(track_id_all[playlists_to_test]) - len(track_id_test[playlists_to_test]),
)

In [None]:
# Show the seed titles next to the recommended titles, each followed by a blank line.
for label, titles in (('Songs given: ', given_songs_name), ('Songs recommended: ', track_name_result)):
    print(label, titles, '\n')

Songs given:  ['Chicken Fried', 'Roots', 'Chicken Fried', 'Keep Me in Mind', 'All the Best', 'Knee Deep (feat. Jimmy Buffett)'] 

Songs recommended:  ['Runaway', 'Word on a Wing - 1999 Remaster', 'Hypnosis', 'Sandy - From "Grease Live!" Music From The Television Event', 'Oxy Music', 'Penny Lane - Remastered', 'Bad, Bad, Bad', 'San Francisco', 'Stoned Soul Picnic', 'The Pass', 'Tighten Up'] 



In [None]:
# Append the seed titles to the recommended titles before scoring. This mutates the
# list in place, so re-running this cell appends the seeds again.
# NOTE(review): if the seed songs are also part of all_name_songs, adding them raises the
# score regardless of how good the recommendations are — confirm this is intended.
track_name_result.extend(given_songs_name)

In [None]:
# Score the combined title list against all titles of the playlist with r_precision (from metric).
r_score = r_precision(track_name_result, all_name_songs)
print(f'r_score: {r_score}')

r_score: 0.3125


**Bây giờ ta sẽ sử dụng tất cả 100 playlists trong bộ test để đánh giá**

In [None]:
def test_multi_feats(df_full_cosine, pd_full, track_id_all, track_id_test, all_playlistID_for_tests, fraction = 5):
    """Evaluate the cosine content filter on every test playlist and print R-precision statistics.

    For each playlist, recommendations are generated from its seed tracks, the
    seed titles are appended, and the result is scored with ``r_precision``
    against all titles of the playlist.

    Parameters
    ----------
    df_full_cosine : pandas.DataFrame
        Square similarity matrix aligned by position with the rows of ``pd_full``.
    pd_full : pandas.DataFrame
        Catalogue with ``'track_id'`` and ``'track_name'`` columns; used both for
        the recommender and for looking up titles.
    track_id_all : mapping
        Playlist id -> all track ids of the playlist.
    track_id_test : mapping
        Playlist id -> seed track ids of the playlist.
    all_playlistID_for_tests : iterable
        Playlist ids to evaluate.
    fraction : int, default 5
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    None
        The average and the maximum R-precision are printed.
    """
    r_score = []

    for each_playlist in all_playlistID_for_tests:
        # Use the data of the playlist being evaluated. Relying on the notebook
        # global `playlists_to_test` would recommend for the first playlist in
        # every iteration.
        seed_ids = track_id_test[each_playlist]
        playlist_ids = track_id_all[each_playlist]

        # The catalogue comes from the `pd_full` argument, not from a notebook global.
        track_name_result, _ = generate_radio_content_filtering_cosineMetirc(
            seed_ids, df_full_cosine, pd_full, len(playlist_ids) - len(seed_ids))

        all_name_songs = list(pd_full[pd_full['track_id'].isin(playlist_ids)]['track_name'].values)
        given_songs_name = list(pd_full[pd_full['track_id'].isin(seed_ids)]['track_name'].values)

        track_name_result.extend(given_songs_name)
        r_score.append(r_precision(track_name_result, all_name_songs))

    if not r_score:
        # np.max raises on an empty sequence; report the situation instead.
        print("No playlists to evaluate.")
        return

    print("Average R precision: " + str(np.mean(r_score)))
    print("Max R precision: " + str(np.max(r_score)))

In [None]:
# Evaluate all test playlists on the filtered catalogue that the similarity matrix was built from.
test_multi_feats(
    df_full_cosine=df_full_cosine,
    pd_full=pd_full_tests,
    track_id_all=track_id_all,
    track_id_test=track_id_test,
    all_playlistID_for_tests=all_playlistID_for_tests,
    fraction=fraction,
)

Average R precision: 0.32678582941194206
Max R precision: 0.4722222222222222


**Bây giờ ta sẽ cho nhiều giá trị fraction khác nhau để vẽ biểu đồ**

In [None]:
# Candidate fraction values 1..9 for the comparison plot. This rebinds `fraction`,
# which held a single integer in the earlier cells.
fraction = [*np.arange(1, 10)]
fraction

[1, 2, 3, 4, 5, 6, 7, 8, 9]

**Nhận xét về mỗi model với các fraction khác nhau, so sánh 2 models với các fractions khác nhau, sử dụng nhiều features khác nhau (số lượng các features sử dụng, v.v).**