In [1]:
from fastai.collab import *
from fastai.tabular.all import *
# Fetch the MovieLens 100k dataset via fastai's `untar_data`; `path` is the
# returned dataset location and is used below to read `u.data` and `u.item`.
path = untar_data(URLs.ML_100k)

In [2]:
# pandas is the standard tool for structured, tabular data (csv, xlsx, ...).

# Load the raw ratings table from the `u.data` file in the MovieLens folder.
ratings = pd.read_csv(
    path / 'u.data',
    sep='\t',     # the file is tab-separated, not comma-separated
    header=None,  # there is no header row in the file ...
    names=['user', 'movie', 'rating', 'timestamp'],  # ... so name the columns here
)

# Preview the first rows of the ratings table (head() shows 5 by default).
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Load the movie catalogue from `u.item`, keeping only the id and the title.
movies = pd.read_csv(
    path / 'u.item',
    sep='|',                   # fields are pipe-separated
    header=None,               # no header row in the file
    usecols=[0, 1],            # first two columns only ...
    names=['movie', 'title'],  # ... named so `movie` matches the key in `ratings`
    encoding='latin-1',        # decode as latin-1 instead of the default encoding
)

# Preview the first rows of the movie table.
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# Attach each movie's title to its ratings. No `on=` is given, so pandas joins
# on the columns the two frames have in common; for the frames loaded above
# that is just `movie`.
ratings = ratings.merge(movies, how='inner')
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [5]:
# Build the collaborative-filtering DataLoaders from the merged ratings frame.
dls = CollabDataLoaders.from_df(
    ratings,
    item_name='title',  # use the readable title column as the item
    bs=64,              # batch size
    # Pin the random train/validation split so that Restart & Run All holds
    # out the same rows every time and the loss tables stay comparable between
    # runs. Only the split is guaranteed by this argument; seed training
    # separately if fully reproducible runs are required.
    seed=42,
)

# Show a sample batch of (user, title, rating) rows.
dls.show_batch()

Unnamed: 0,user,title,rating
0,308,Terminator 2: Judgment Day (1991),4
1,727,Big Bully (1996),2
2,917,Romy and Michele's High School Reunion (1997),2
3,206,Event Horizon (1997),3
4,169,M*A*S*H (1970),5
5,286,When Harry Met Sally... (1989),4
6,880,Clockers (1995),4
7,621,Twister (1996),3
8,233,Patton (1970),4
9,3,In the Name of the Father (1993),2


In [6]:
# Number of entries in `dls.classes` for the user and the title columns.
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
# Number of latent factors per user / per movie in this hand-rolled example.
n_factors = 5

# Randomly initialised factor matrices: one row per user / movie, one column
# per latent factor.
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [7]:
# One-hot vector of length `n_users` selecting index 3, cast to float so it
# can be multiplied with the float factor matrix.
one_hot_3 = one_hot(3, n_users).float()
# Direct lookup: row 3 of the user factor matrix.
print(user_factors[3])
# The same lookup written as a matrix product with the one-hot vector; the
# displayed tensor should match the printed one.
user_factors.t() @ one_hot_3

tensor([0.0821, 1.1354, 1.1286, 1.8296, 0.4124])


tensor([0.0821, 1.1354, 1.1286, 1.8296, 0.4124])

In [8]:
class DotProduct(Module):
    """Score a (user, movie) pair by the dot product of their embeddings.

    NOTE(review): `super().__init__()` is never called here; this presumably
    relies on the `Module` base class taking care of it -- confirm.
    """

    def __init__(self, n_users, n_movies, n_factors):
        """Create one embedding table per entity.

        n_users:   number of rows in the user embedding table
        n_movies:  number of rows in the movie embedding table
        n_factors: width of every embedding vector
        """
        # Creation order is kept (users first, then movies) so that parameter
        # registration and random initialisation happen in the same sequence.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        """Return one score per row of `x`.

        Column 0 of `x` is looked up in the user table and column 1 in the
        movie table.
        """
        user_ids, movie_ids = x[:, 0], x[:, 1]
        user_vecs = self.user_factors(user_ids)
        movie_vecs = self.movie_factors(movie_ids)
        # Element-wise product summed over the factor axis == row-wise dot product.
        elementwise = user_vecs * movie_vecs
        return elementwise.sum(dim=1)

In [9]:
# Grab a single batch to inspect what the model receives as input.
x,y = dls.one_batch()
# Displayed shape is [64, 2]: 64 rows (bs=64) and two columns, which the
# DotProduct models in this notebook read as x[:,0] (user) and x[:,1] (movie).
x.shape

torch.Size([64, 2])

In [10]:
# Instantiate the plain dot-product model with 50 latent factors and wrap it
# in a Learner that uses `MSELossFlat` as the loss function.
model = DotProduct(n_users=n_users, n_movies=n_movies, n_factors=50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [11]:
# Train for 5 epochs with a maximum learning rate of 5e-3.
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

epoch,train_loss,valid_loss,time
0,1.319795,1.315767,00:07
1,1.090916,1.069758,00:06
2,0.971764,0.972436,00:06
3,0.859432,0.889331,00:08
4,0.775916,0.869695,00:06


In [13]:
def sigmoid_range_mt(x, low, high):
    """Squash `x` through a sigmoid and rescale the result to `(low, high)`.

    x:    tensor of unbounded values
    low:  value the output approaches for very negative `x`
    high: value the output approaches for very positive `x`

    Returns a tensor with the same shape as `x`.
    """
    span = high - low
    squashed = torch.sigmoid(x)  # element-wise, in (0, 1)
    return squashed * span + low

In [14]:
# Improvement over `DotProduct`: squash the raw dot product into a fixed
# rating range so predictions cannot fall outside it.

class DotProductM(Module):
    """Dot-product collaborative-filtering model with a bounded output.

    The row-wise dot product of the user and movie embeddings is passed
    through `sigmoid_range_mt`, so every prediction lies inside `y_range`.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
        """Create the two embedding tables and remember the output range.

        n_users:   number of rows in the user embedding table
        n_movies:  number of rows in the movie embedding table
        n_factors: width of every embedding vector
        y_range:   (low, high) bounds handed to `sigmoid_range_mt`.
                   NOTE(review): the default upper bound of 5.5 sits above 5,
                   presumably because a sigmoid only approaches its bounds --
                   confirm.
        """
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range

    def forward(self, x):
        """Return one bounded score per row of `x` (col 0: user, col 1: movie)."""
        user_vecs = self.user_factors(x[:, 0])
        movie_vecs = self.movie_factors(x[:, 1])
        raw_scores = (user_vecs * movie_vecs).sum(dim=1)
        # TODO(review): the local `sigmoid_range_mt` copy is used on purpose.
        # The original author observed that calling fastai's own sigmoid-range
        # function failed on an Apple M1 chip while this hand-copied version
        # worked; the root cause was never identified.
        return sigmoid_range_mt(raw_scores, *self.y_range)

In [15]:
# Train the range-limited model with the same settings as the plain one
# (5 epochs, maximum learning rate 5e-3) so the two loss tables are comparable.
model = DotProductM(n_users=n_users, n_movies=n_movies, n_factors=50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

epoch,train_loss,valid_loss,time
0,0.976687,0.976521,00:07
1,0.896578,0.89339,00:07
2,0.677578,0.861502,00:07
3,0.484772,0.863324,00:07
4,0.372025,0.867838,00:07
