In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
from torch.optim import Adam
from IPython.display import display
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

In [5]:
songs = pd.read_csv('kkbox/songs.csv')
songs_extra = pd.read_csv('kkbox/song_extra_info.csv')
songs = songs.merge(songs_extra, on='song_id', how='left')
songs = songs[['artist_name', 'name']]
songs = songs.loc[~songs['artist_name'].isna()]
songs = songs.loc[~songs['name'].isna()]

songs['artist_count'] = songs.groupby(['name'])['artist_name'].transform('count')
songs = songs.loc[songs['artist_count'] > 1]
songs = songs.reset_index()
songs.drop(columns=['artist_count', 'index'], inplace=True)
songs.drop_duplicates(inplace=True)

In [6]:
songs

Unnamed: 0,artist_name,name
0,張信哲 (Jeff Chang),焚情
1,S.H.E,愛我的資格
2,貴族精選,Mary Had a Little Lamb
3,貴族精選,となりのトトロ
4,伍佰 & China Blue,夢醒時分
...,...,...
1376200,Philip Selway,What Goes Around
1376201,Bobby Gasparakis| Edo Mela| Bodega| Randy Garc...,U Got 2 Know
1376202,Orgel Carrol 오르골캐롤,three bears 곰세마리
1376203,The Jackals,Unity


In [7]:
def setdiff2d(a1, a2):
    a1_rows = a1.view([('', a1.dtype)] * a1.shape[1])
    a2_rows = a2.view([('', a2.dtype)] * a2.shape[1])
    return np.setdiff1d(a1_rows, a2_rows).view(a1.dtype).reshape(-1, a1.shape[1])

class SongsDataset:
    def __init__(self, data):
        self.data = data
        self.num_song = None
        self.num_artist = None
        self.art2idx = None
        self.idx2art = None
        self.song2idx = None
        self.idx2song = None
        self.create_mappings()
        
    def create_mappings(self):
        unique_artists = self.data['artist_name'].unique()
        self.num_artist = len(unique_artists)
        self.art2idx = dict(zip(unique_artists, range(self.num_artist)))
        self.idx2art = dict(zip(range(self.num_artist), unique_artists))

        unique_songs = self.data['name'].unique()
        self.num_song = len(unique_songs)
        self.song2idx = dict(zip(unique_songs, range(self.num_song)))
        self.idx2song = dict(zip(range(self.num_song), unique_songs))
        
    def create_dataset(self, neg_rate=1):
        artists = self.data['artist_name'].apply(lambda x: self.art2idx[x]).to_numpy()
        songs = self.data['name'].apply(lambda x: self.song2idx[x]).to_numpy()
        X = np.stack((artists, songs), axis=1)
        y = np.ones((X.shape[0], 1))
        
        for _ in range(neg_rate):
            np.random.shuffle(artists)
            np.random.shuffle(songs)
            X_neg = np.stack((artists, songs), axis=1)
            X_neg = setdiff2d(X_neg, X)
            y_neg = np.zeros((X_neg.shape[0], 1))
            
            X = np.vstack((X, X_neg))
            y = np.vstack((y, y_neg))
        return torch.LongTensor(X), torch.Tensor(y).squeeze()

In [8]:
class Embedder(nn.Module):
    def __init__(self, num_artist, num_song, emb_dim=50):
        super().__init__()
        self.Embedding_art = nn.Embedding(num_artist, emb_dim)
        self.Embedding_song = nn.Embedding(num_song, emb_dim)
        self.dist = nn.CosineSimilarity()
        self.output = nn.Sigmoid()
        
    def forward(self, x):
        art_idx = x[:, 0]
        song_idx = x[:, 1]
        
        art_embs = self.Embedding_art(art_idx)
        song_embs = self.Embedding_song(song_idx)
        
        scores = self.dist(art_embs, song_embs)
        return self.output(scores)    

In [9]:
sd = SongsDataset(songs)
X, y = sd.create_dataset()
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)

emb = Embedder(sd.num_artist, sd.num_song)
emb.load_state_dict(torch.load('emb.pkl'))
# loss_fn = nn.BCELoss()
# optim = Adam(emb.parameters())
# epochs = 10

# for i in tqdm(range(epochs)):
#     avg_loss = 0
#     steps = 0
#     for x, y_true in tqdm(dataloader):
#         y_pred = emb(x)
#         loss = loss_fn(y_pred, y_true)
#         optim.zero_grad()
#         loss.backward()
#         optim.step()
#         avg_loss += loss.item()
#         steps += 1
#     print(f'Epoch {i + 1} / {epochs}. Loss: {avg_loss/steps}')

<All keys matched successfully>

In [8]:
# torch.save(emb.state_dict(), 'emb.pkl')

## a) similars

In [10]:
songs['artist_idx'] = songs['artist_name'].apply(lambda x: sd.art2idx[x])
songs['song_idx'] = songs['name'].apply(lambda x: sd.song2idx[x])

artist_emb = emb.Embedding_art(torch.LongTensor(songs['artist_idx'].to_numpy())).detach()
song_emb = emb.Embedding_song(torch.LongTensor(songs['song_idx'].to_numpy())).detach()
artist_song_emb = np.hstack((artist_emb, song_emb))
songs.drop(columns=['artist_idx', 'song_idx'], inplace=True)

In [51]:
def find_similars(x_emb, embs, k=10):    
    dist_fn = nn.CosineSimilarity()
    dist = dist_fn(x_emb, embs).detach()
    sorted_idx = np.argsort(dist)

    top_k = sorted_idx[-k:-1].squeeze().tolist()
    return pd.DataFrame(songs.iloc[top_k])

In [61]:
for _ in range(5):
    x = songs.sample()
    print('For')
    display(x)
    print('Similars are:')
    example_emb = torch.Tensor(artist_song_emb[x.index])
    similars = find_similars(example_emb, torch.Tensor(artist_song_emb))
    display(similars)
    print('------------------')

For


Unnamed: 0,artist_name,name
525817,代小波,人賤人愛


Similars are:


Unnamed: 0,artist_name,name
560536,代小波,分手在天亮以後
185696,代小波,為你
47304,代小波,我不要
233973,代小波,你是我今生最愛的人
129636,內地華語精選,人賤人愛
422572,代小波,打死也不信
663953,代小波,如果愛能早些說出來
758832,代小波,只要你能開心
743196,代小波,過客


------------------
For


Unnamed: 0,artist_name,name
382358,Teenage dirtbag,Swing| Swing


Similars are:


Unnamed: 0,artist_name,name
518088,Teenage dirtbag,Walk Idiot Walk
234931,Teenage dirtbag,Rollin' (Air Raid Vehicle)
73053,Teenage dirtbag,The Bad Touch
260749,Teenage dirtbag,Kiss Me
747973,Teenage dirtbag,What I Got
905811,Teenage dirtbag,Buddy Holly
287608,Teenage dirtbag,What's My Age Again?
385635,Teenage dirtbag,Smooth Criminal
653522,Teenage dirtbag,Malibu


------------------
For


Unnamed: 0,artist_name,name
689741,Blue Claw Jazz,Piano Trio (Instrumental)


Similars are:


Unnamed: 0,artist_name,name
599914,Jazz Trio,Thumbs Up
277991,Tenth Avenue North feat. Britt Nicole,We Three Kings
130355,T. Mills,Loud
655619,Christian Christmas Music,The First Noel
460894,Ron Carter,Softly as in a Morning Sunrise
174756,Piano Christmas,We Three Kings
717456,Rob Johnson,We Three Kings
669673,Amille,Small Hours
902040,Sugo Music Artists,We Three Kings


------------------
For


Unnamed: 0,artist_name,name
700343,Van Cliburn,Allegro con fuoco


Similars are:


Unnamed: 0,artist_name,name
596869,Van Cliburn,Adagio sostenuto
683302,Van Cliburn,I. Allegro molto moderato
520640,Van Cliburn,Moderato; Allegro
707641,Van Cliburn,III. Rondo. Vivace
916630,Van Cliburn,I. Allegro maestoso - Tempo giusto
800135,Van Cliburn,III. Allegro marziale animato
760418,Van Cliburn,II. Andante con moto
630443,Van Cliburn,Finale: Alla breve
767113,Van Cliburn,Variation VII: Meno mosso| a tempo moderato


------------------
For


Unnamed: 0,artist_name,name
48116,Sum 41,The Bitter End


Similars are:


Unnamed: 0,artist_name,name
26124,Sum 41,Over My Head (Better Off Dead)
96685,Sum 41,So Long Goodbye
104352,Falling In Reverse,The Bitter End
142354,Sum 41,Handle This
90731,Sum 41,Some Say
8541,Sum 41,Fat Lip
206614,Sum 41,Makes No Difference
531504,Richie Kotzen,The Bitter End
914270,Sam Hulick,The Bitter End


------------------


## b) recommender

In [9]:
data = pd.read_csv('kkbox/train.csv')
songs = pd.read_csv('kkbox/songs.csv')
songs_extra = pd.read_csv('kkbox/song_extra_info.csv')
songs = songs.merge(songs_extra, on='song_id', how='left')
songs = songs[['song_id', 'artist_name', 'name']]

data = data.merge(songs, on='song_id', how='left')
data = data.loc[data['artist_name'].isin(sd.art2idx)]
data = data.loc[data['name'].isin(sd.song2idx)]
data = data.loc[~data['artist_name'].isna()]
data = data.loc[~data['name'].isna()]
data = data.reset_index()
data.drop(columns=['source_system_tab', 'source_screen_name', 'source_type', 'song_id', 'index'], inplace=True)

data['artist_idx'] = data['artist_name'].apply(lambda x: sd.art2idx[x])
data['song_idx'] = data['name'].apply(lambda x: sd.song2idx[x])

In [20]:
class Users:
    def __init__(self, data, emb_size=50):
        unique_users = np.unique(data['msno'])
        self.n_users = len(unique_users)
        self.user2idx = dict(zip(unique_users, range(self.n_users)))
        self.idx2user = dict(zip(range(self.n_users), unique_users))
        data['msno'] = data['msno'].apply(lambda x: self.user2idx[x])
        self.data = data
        self.emb_size = emb_size
        self.embs = None
        self.compute_embs()
        
    def compute_embs(self):
        user_artist = np.zeros((self.n_users, self.emb_size))
        user_song = np.zeros((self.n_users, self.emb_size))
        
        for i in self.idx2user:
            user_art_idx = torch.LongTensor(self.data.loc[self.data['msno'] == i, 'artist_idx'].values)
            user_song_idx = torch.LongTensor(self.data.loc[self.data['msno'] == i, 'song_idx'].values)
            user_art_emb = emb.Embedding_art(user_art_idx)
            user_song_emb = emb.Embedding_song(user_song_idx)
            user_art_total = torch.sum(user_art_emb, dim=0).detach()
            user_song_total = torch.sum(user_song_emb, dim=0).detach()
            user_artist[i] = user_art_total
            user_song[i] = user_song_total
        self.embs = np.hstack((user_artist, user_song))

In [21]:
user = Users(data)

In [25]:
class Recommender(nn.Module):
    def __init__(self, user_emb, art_emb, song_emb, hidden_size=64, emb_size=50):
        super().__init__()
        self.Emedding_user = nn.Embedding.from_pretrained(torch.Tensor(user_emb))
        self.Embedding_art = nn.Embedding.from_pretrained(art_emb.weight)
        self.Embedding_song = nn.Embedding.from_pretrained(torch.Tensor(song_emb.weight))
        
        self.fc1 = nn.Linear(emb_size * 3, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, 1)
        self.Sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        user_idx, art_idx, song_idx = x
        user_emb = self.Emedding_user(user_idx)
        art_emb = self.Embedding_art(art_idx)
        song_emb = self.Embedding_song(song_idx)
        emb = np.hstack((user_emb, art_emb, song_emb))
        
        x = self.fc1(emb)
        x = self.relu(x)
        x = self.fc2(x)
        return self.Sigmoid(x)

In [26]:
X = torch.LongTensor(data[['msno', 'artist_idx', 'song_idx']].to_numpy())
y = torch.Tensor(data['target'].to_numpy()).squeeze()

In [27]:
X = torch.LongTensor(data[['msno', 'artist_idx', 'song_idx']].to_numpy())
y = torch.Tensor(data['target'].to_numpy()).squeeze()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=1024, shuffle=True)

rec = Recommender(user.embs, emb.Embedding_art, emb.Embedding_song)
optim = Adam(rec.parameters())
loss_fn = nn.BCELoss()
epochs = 10

for i in tqdm(range(epochs)):
    avg_loss = 0
    steps = 0
    for x, y_true in tqdm(train_dataloader):
        y_pred = emb(x)
        loss = loss_fn(y_pred, y_true)
        optim.zero_grad()
        loss.backward()
        optim.step()
        avg_loss += loss.item()
        steps += 1
    print(f'Epoch {i + 1} / {epochs}. Loss: {avg_loss/steps}')

for x, y_true in tqdm(test_dataloader):
    avg_auc = 0
    steps = 0
    with torch.no_grad():
        y_pred = emb(x)
        avg_auc += roc_auc_score(y_true, y_pred)
        steps += 1
test_auc = avg_auc/steps
print(f'Test auc: {test_auc}')

# kf = KFold(n_splits=5, shuffle=True)
# test_aucs = []
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
# test_aucs.append(test_auc)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 1 / 10. Loss: 0.6956675831407301


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 2 / 10. Loss: 0.69566692707868


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 3 / 10. Loss: 0.6956685034958762


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 4 / 10. Loss: 0.6956676002744969


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 5 / 10. Loss: 0.6956681146352369


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 6 / 10. Loss: 0.6956675589416552


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 7 / 10. Loss: 0.6956675963862067


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 8 / 10. Loss: 0.6956678812745963


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 9 / 10. Loss: 0.6956673219772335


HBox(children=(FloatProgress(value=0.0, max=3771.0), HTML(value='')))


Epoch 10 / 10. Loss: 0.695667402777802



HBox(children=(FloatProgress(value=0.0, max=943.0), HTML(value='')))


Test auc: 0.4862178472402115


###    ¯\_(ツ)_/¯