In [1]:
import os
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
import implicit
from implicit.evaluation import precision_at_k
from implicit.nearest_neighbours import CosineRecommender
import logging
from colorama import Fore, Style, init
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warning raised by later
# cells (pandas, fastai, ...); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Configure the root logger at INFO level. The format string wraps each record
# ("time - level - message") between Fore.GREEN and Style.RESET_ALL.
logging.basicConfig(level=logging.INFO, format=f'{Fore.GREEN}%(asctime)s - %(levelname)s - %(message)s{Style.RESET_ALL}')

# Initialise colorama with autoreset enabled.
# NOTE(review): this runs *after* basicConfig(); whether the logging handler
# writes through colorama's stream wrapper presumably depends on this order --
# confirm before moving the call.
init(autoreset=True)

In [3]:
# fastai is consumed through star imports. Names used later in this notebook
# without any other visible import (untar_data, URLs, CollabDataLoaders, Module,
# Embedding, Learner, MSELossFlat, torch) are presumably provided by these two lines.
# NOTE(review): star imports can also shadow names imported in the first cell.
from fastai.collab import *
from fastai.tabular.all import *
# Location of the ML_100k data; the next cell reads `u.data` from it.
path = untar_data(URLs.ML_100k)

In [4]:
# `u.data` is read as tab-separated text without a header row, so the four
# column names are supplied explicitly.
ratings = pd.read_csv(
    path / 'u.data',
    sep='\t',
    header=None,
    names=['user', 'movie', 'rating', 'timestamp'],
)
# Preview the first rows.
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Hand-written 3-factor vectors for two example movies.
last_skywalker = np.array([0.98, 0.9, -0.9])
casablanca = np.array([-0.99, -0.3, 0.8])

# A user vector expressed in the same three factors.
user1 = np.array([0.9, 0.8, -0.6])

In [7]:
# The dot product of a movie vector with the user vector is the match score:
# larger means the user's factors line up with the movie's.
print(f"calculate the match between this combination user1 on the last skywalker movie is {np.dot(last_skywalker, user1)}")
print(f"calculate the match between this combination user1 on the casablanca movie is {np.dot(casablanca, user1)}")

calculate the match between this combination user1 on the last skywalker movie is 2.142
calculate the match between this combination user1 on the casablanca movie is -1.611


In [10]:
def load_data(triplet_path, unique_tracks_path, min_play_count=1):
    """Load listening triplets and track metadata and build the interaction table.

    Parameters
    ----------
    triplet_path : str or path-like
        Tab-separated file; its three columns are renamed to
        ``user_id``, ``song_id`` and ``play_count``.
    unique_tracks_path : str or path-like
        Comma-separated file; its four columns are renamed to
        ``track_id``, ``song_id``, ``artist`` and ``title``.
    min_play_count : int, default 1
        Only interactions with ``play_count`` strictly greater than this value
        are kept. The default reproduces the previously hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``song_id``, ``track_id``, ``song``
        (``"<title> - <artist>"``), ``play_count``, plus integer codes
        ``user_idx`` and ``song_idx`` produced by ``pd.factorize``.
        Songs without a metadata match keep their row (left join) with
        missing ``track_id`` / ``song`` values.
    """
    logging.info('Loading data...')

    triplet_columns = ['user_id', 'song_id', 'play_count']
    track_columns = ['track_id', 'song_id', 'artist', 'title']

    # NOTE(review): `has_header` is left at its default, so the first line of
    # each file is treated as a header before `new_columns` renames it. If the
    # files carry no header row, that first record is dropped -- confirm, and
    # pass has_header=False if so.
    triplet_df = pl.read_csv(triplet_path, separator='\t', new_columns=triplet_columns, use_pyarrow=True)
    unique_tracks_df = pl.read_csv(unique_tracks_path, new_columns=track_columns, use_pyarrow=True)

    logging.info('Data loaded successfully.')

    logging.info('Merging songs...')

    triplet_df = triplet_df.filter(pl.col('play_count') > min_play_count)

    # NOTE(review): a left merge repeats an interaction once per metadata row
    # sharing its song_id -- presumably song_id is unique in the tracks file; verify.
    songs = pd.merge(triplet_df.to_pandas(), unique_tracks_df.to_pandas(), on='song_id', how='left')

    # The polars frames are not needed once the pandas merge exists; dropping
    # them here (instead of right before `return`) releases them before the
    # remaining column work.
    del triplet_df, unique_tracks_df

    songs['song'] = songs['title']+' - ' + songs['artist']
    songs = songs[['user_id', 'song_id', 'track_id', 'song', 'play_count']]

    # Dense integer codes in order of first appearance.
    songs['user_idx'] = pd.factorize(songs['user_id'])[0]
    songs['song_idx'] = pd.factorize(songs['song_id'])[0]

    logging.info('Songs merged successfully.')

    # save the data
    # songs.to_csv('data/songs.csv', index=False)

    return songs


# Input locations, relative to the notebook's working directory.
global_path = './data'
triplet_path = f"{global_path}/train_triplets.txt"
unique_tracks_path = f"{global_path}/p02_unique_tracks.csv"

# Build the merged interaction table (see load_data above).
songs = load_data(triplet_path, unique_tracks_path)


# Keep only the columns handed to the data loaders in a later cell.
# No train/test split happens here: the split code below is commented out.
X = songs[['user_idx', 'song_idx', 'play_count', 'song']]
# Disabled manual split, kept for reference:
# train_data, test_data = train_test_split(X, test_size=0.2, random_state=42)
# train_data.set_index(["user_idx", "song_idx"], inplace=True)
# test_data.set_index(["user_idx", "song_idx"], inplace=True)

[32m2024-05-15 20:14:52,755 - INFO - Loading data...[0m
[32m2024-05-15 20:15:05,513 - INFO - Data loaded successfully.[0m
[32m2024-05-15 20:15:05,514 - INFO - Merging songs...[0m
[32m2024-05-15 20:15:26,010 - INFO - Songs merged successfully.[0m


In [11]:
# Display the interaction table (pandas shows a truncated head/tail view).
X

Unnamed: 0,user_idx,song_idx,play_count,song
0,0,0,2,Entre Dos Aguas - Paco De Lucia
1,0,1,2,12 segundos de oscuridad - Jorge Drexler
2,0,2,5,Apuesta Por El Rock 'N' Roll - Héroes del Silencio
3,0,3,5,I'll Be Missing You (Featuring Faith Evans & 112)(Album Version) - Puff Daddy
4,0,4,5,I?'m A Steady Rollin? Man - Robert Johnson
...,...,...,...,...
20151715,981153,1475,4,Représente - Alliance Ethnik
20151716,981153,2492,6,Addams Groove - MC Hammer
20151717,981153,2257,2,Go To Sleep - Eminem / DMX / Obie Trice
20151718,981153,55499,2,We're Back - Eminem / Obie Trice / Stat Quo / Bobby Creekwater / Cashis


In [24]:
# Build the collaborative-filtering data loaders from the interaction table.
# The user and rating columns are named explicitly instead of being inferred
# from column position, so reordering X's columns cannot silently change them.
dls = CollabDataLoaders.from_df(
    X,
    user_name='user_idx',
    item_name='song',
    rating_name='play_count',
    bs=64,
    # Use the Apple MPS backend when this machine has it; otherwise pass None
    # so fastai falls back to its own default device instead of failing.
    device='mps' if torch.backends.mps.is_available() else None,
)
# NOTE(review): no `seed` is passed, so the validation split differs between runs.
dls.show_batch()

Unnamed: 0,user_idx,song,play_count
0,483575,Tropical Jam - Michel Camilo,3
1,470980,Jolly Dance - Usher Featuring The Nu Beginning,5
2,413227,Miss Halfway - Anya Marina,6
3,820336,Ghosts 'n' Stuff (Original Instrumental Mix) - Deadmau5,6
4,957624,Don't Matter - Akon,2
5,639551,Bend Me_ Shape Me - The American Breed,30
6,809145,Bigger Isn't Better - The String Cheese Incident,5
7,55090,Watermelon Man - Oscar Brown Jr.,8
8,232465,Idealistic - Digitalism,3
9,645455,Falls Apart - Thousand Foot Krutch,2


In [25]:
# Number of categories the data loaders hold for the user and item columns.
n_users, n_movies = (len(dls.classes[col]) for col in ('user_idx', 'song'))
# Width of the illustrative latent-factor tables built below.
n_factors = 5

# Randomly initialised factor tables, one row per user / per item.
# Users are drawn first, then items, keeping the random draw order fixed.
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [26]:
# Sanity check: each table should be (number of categories, n_factors).
user_factors.shape, movie_factors.shape

(torch.Size([971374, 5]), torch.Size([330676, 5]))

In [27]:
class DotProduct(Module):
    """Score a (user, item) pair as the dot product of their learned embeddings."""

    def __init__(self, n_users, n_movies, n_factors):
        # One embedding row per user id and per item id, each `n_factors` wide.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        # Column 0 of the batch indexes users, column 1 indexes items.
        user_ids, item_ids = x[:, 0], x[:, 1]
        user_vecs = self.user_factors(user_ids)
        item_vecs = self.movie_factors(item_ids)
        # Element-wise product summed along dim 1 gives one score per batch row.
        return (user_vecs * item_vecs).sum(dim=1)

In [28]:
# Pull a single batch to inspect the input / target shapes the model receives.
xb, yb = dls.one_batch()
xb.shape, yb.shape

(torch.Size([64, 2]), torch.Size([64, 1]))

In [29]:
# Dot-product model with 50 latent factors (the n_factors = 5 defined earlier
# is only used for the illustrative random tables).
# The model is moved to whatever device the data loaders use, so the two
# cannot drift apart the way two separately hard-coded device strings can.
model = DotProduct(n_users, n_movies, 50).to(dls.device)
# Train against the play counts with a flattened mean-squared-error loss.
learn = Learner(dls, model, loss_func=MSELossFlat())

In [31]:
# Train with the one-cycle schedule; the arguments are presumably
# (number of epochs, maximum learning rate) -- confirm against the fastai docs.
# NOTE(review): the saved output shows this run ended with KeyboardInterrupt
# before any epoch row was printed, so no results are recorded.
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time


KeyboardInterrupt: 