In [1]:
%%capture
# Environment setup; %%capture hides the installers' output for this cell.
!pip install pyspark
# NOTE(review): PyDrive is not referenced in any of the visible cells — confirm it is still needed.
!pip install -U -q PyDrive
# Java 8 runtime — presumably required by Spark, which is used in the ALS section.
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the JDK installed by the apt command above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# MIE524 - Lab 5

# Collaborative Filtering

In this notebook we will write a matrix factorization model in PyTorch to solve a recommendation problem.

The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies (see https://grouplens.org/datasets/movielens/). To get the data, download:

`http://files.grouplens.org/datasets/movielens/ml-latest-small.zip`

## MovieLens dataset

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("q1-ratings.csv")

In [4]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Encoding data
We encode the data to have contiguous integer ids (0, 1, 2, ...) for users and movies. You can think of this as a categorical encoding of our two categorical variables, userId and movieId.

In [5]:
# split train and validation before encoding
# Fixed seed so the random split is reproducible from run to run.
np.random.seed(3)
# One uniform draw per row: True (about 80% of rows) -> train, False -> validation.
msk = np.random.rand(len(data)) < 0.8
# .copy() gives each split its own frame rather than a selection of `data`.
train = data[msk].copy()
val = data[~msk].copy()

In [6]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Map the values of a pandas column onto consecutive integer ids.

    Ids are handed out in order of first appearance in the reference
    column, which is `train_col` when it is given and `col` otherwise.

    Returns a tuple `(name2idx, encoded, n_unique)`:
      * name2idx: dict from original value to its integer id
      * encoded: numpy array holding the id of every entry of `col`;
        values absent from the reference column are encoded as -1
      * n_unique: number of distinct values in the reference column
    """
    reference = col if train_col is None else train_col
    distinct = reference.unique()
    name2idx = dict(zip(distinct, range(len(distinct))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(distinct)

In [7]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    The "userId" and "movieId" columns are replaced by integer ids produced
    by `proc_col`. If `train` is provided, the ids are derived from the
    columns of `train`, so `df` is encoded with the same mapping as the
    training data, and rows whose user or movie does not occur in `train`
    are dropped.

    The input frame is not modified; the returned frame keeps the original
    row index (it is not reset after rows are dropped).
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None if train is None else train[col_name]
        _, encoded, _ = proc_col(df[col_name], train_col)
        df[col_name] = encoded
        # proc_col marks values missing from the reference column with -1.
        # Take an explicit copy of the filtered rows so that the column
        # assignment in the next iteration writes to an independent frame
        # instead of a selection of the previous one (which makes pandas
        # emit SettingWithCopyWarning).
        df = df[df[col_name] >= 0].copy()
    return df

In [10]:
# check encode implementation
# Tiny hand-checkable train/validation frames.
df_t = pd.read_csv("tiny_training2.csv")
df_v = pd.read_csv("tiny_val2.csv")
print(df_t)
# Training frame is encoded against itself; validation frame against the training ids.
df_t_e = encode_data(df_t)
df_v_e = encode_data(df_v, df_t)
# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing.
df_v_e
print(df_t_e)

    userId  movieId  rating
0       11        1       4
1       11       23       5
2        2       23       5
3        2        4       3
4       31        1       4
5       31       23       4
6        4        1       5
7        4        3       2
8       52        1       1
9       52        3       4
10      61        3       5
11       7       23       1
12       7        3       3
    userId  movieId  rating
0        0        0       4
1        0        1       5
2        1        1       5
3        1        2       3
4        2        0       4
5        2        1       4
6        3        0       5
7        3        3       2
8        4        0       1
9        4        3       4
10       5        3       5
11       6        1       1
12       6        3       3


In [11]:
# encoding the train and validation data
# Training ids are derived from the training split itself.
df_train = encode_data(train)
# Validation reuses the training ids; rows whose user or movie is absent from `train` are dropped by encode_data.
df_val = encode_data(val, train)

In [12]:
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
6,0,4,5.0,964980868


## Embedding layer

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [14]:
# an Embedding module containing 6 embeddings (users or items), each of size 3
# the embedding weights are initialized at random
embed = nn.Embedding(6, 3)

In [15]:
# given a list of ids we can "look up" the embedding corresponding to each id
# (id 1 appears twice in the list, so its embedding row is returned twice)
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[-1.4844,  1.3517, -1.9613],
         [ 0.7586, -1.8765, -1.8101],
         [ 0.9732, -1.0415, -1.3021],
         [-1.1481, -1.3205,  0.6238],
         [ 0.1294,  0.6128, -0.9951],
         [-1.4844,  1.3517, -1.9613]]], grad_fn=<EmbeddingBackward0>)

## Matrix factorization model

In [16]:
class MF(nn.Module):
    """Matrix-factorization model: a rating is predicted as the dot product
    of a learned user embedding and a learned item embedding.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: dimensionality of both embeddings.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Replace nn.Embedding's default initialisation with small
        # non-negative values drawn uniformly from [0, 0.05).
        nn.init.uniform_(self.user_emb.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb.weight, 0, 0.05)

    def forward(self, u, v):
        """Return the predicted rating for each (user id, item id) pair.

        `u` and `v` are integer index tensors of the same shape; the result
        has that same shape.
        """
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        # Reduce over the embedding axis (always the last one). For the usual
        # 1-D batch of ids this equals sum(1), and it stays a per-pair dot
        # product for index tensors of any other rank.
        return (user_vecs * item_vecs).sum(dim=-1)

## Debugging MF model

In [17]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [18]:
# Sizes match the tiny encoded training frame printed earlier: user ids 0-6 and movie ids 0-3.
num_users = 7
num_items = 4
emb_size = 3

# Stand-alone embedding tables (default random initialisation) for stepping through the forward pass by hand.
user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
# One user index and one movie index per rating row of the encoded tiny frame.
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [19]:
U = user_emb(users)
V = item_emb(items)

In [20]:
U

tensor([[ 0.5734, -1.3145,  0.8171],
        [ 0.5734, -1.3145,  0.8171],
        [ 0.8059, -1.9813,  1.1847],
        [ 0.8059, -1.9813,  1.1847],
        [-0.0969, -1.4406, -0.1266],
        [-0.0969, -1.4406, -0.1266],
        [-0.2355,  1.8081, -0.9299],
        [-0.2355,  1.8081, -0.9299],
        [-0.3344,  0.7703, -0.0104],
        [-0.3344,  0.7703, -0.0104],
        [ 0.6541, -0.7326, -1.2818],
        [ 1.8815,  2.5154, -0.1376],
        [ 1.8815,  2.5154, -0.1376]], grad_fn=<EmbeddingBackward0>)

In [21]:
V

tensor([[-1.1927, -0.0724, -0.9955],
        [-1.3552, -0.0261,  1.6857],
        [-1.3552, -0.0261,  1.6857],
        [-0.2898,  1.4798, -0.7014],
        [-1.1927, -0.0724, -0.9955],
        [-1.3552, -0.0261,  1.6857],
        [-1.1927, -0.0724, -0.9955],
        [-0.7717, -0.5612,  1.3664],
        [-1.1927, -0.0724, -0.9955],
        [-0.7717, -0.5612,  1.3664],
        [-0.7717, -0.5612,  1.3664],
        [-1.3552, -0.0261,  1.6857],
        [-0.7717, -0.5612,  1.3664]], grad_fn=<EmbeddingBackward0>)

In [22]:
# element wise multiplication
U*V

tensor([[-0.6839,  0.0951, -0.8134],
        [-0.7771,  0.0343,  1.3774],
        [-1.0922,  0.0517,  1.9971],
        [-0.2336, -2.9318, -0.8309],
        [ 0.1156,  0.1043,  0.1260],
        [ 0.1313,  0.0376, -0.2133],
        [ 0.2809, -0.1309,  0.9258],
        [ 0.1817, -1.0147, -1.2706],
        [ 0.3988, -0.0558,  0.0104],
        [ 0.2580, -0.4323, -0.0142],
        [-0.5048,  0.4111, -1.7514],
        [-2.5499, -0.0656, -0.2319],
        [-1.4519, -1.4117, -0.1880]], grad_fn=<MulBackward0>)

In [23]:
# what we want is a dot product per row
(U*V).sum(1)

tensor([-1.4022,  0.6346,  0.9566, -3.9964,  0.3458, -0.0444,  1.0758, -2.1036,
         0.3534, -0.1885, -1.8450, -2.8474, -3.0515], grad_fn=<SumBackward1>)

## Training MF model

In [24]:
# Number of distinct encoded users / movies in the training split;
# these become the row counts of the two embedding tables.
num_users = df_train["userId"].unique().size
num_items = df_train["movieId"].unique().size
print(num_users, num_items)

610 8998


In [25]:
model = MF(num_users, num_items, emb_size=100)

In [26]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    """Train `model` on the global `df_train` frame with full-batch Adam.

    Each epoch is a single gradient step on the mean squared error over all
    training ratings. The training loss is printed after every epoch, and
    `test_loss(model)` is called once when training finishes.

    Args:
        model: module called as `model(users, items)` that returns one
            prediction per pair.
        epochs: number of full-batch gradient steps.
        lr: Adam learning rate.
        wd: Adam weight decay.

    A new optimizer is created on every call, so Adam's running statistics
    do not carry over between successive calls.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # The training data does not change between epochs, so the tensors are
    # built once here instead of being rebuilt on every iteration.
    # (The original code hinted at appending .cuda() to each tensor for GPU use.)
    users = torch.LongTensor(df_train.userId.values)
    items = torch.LongTensor(df_train.movieId.values)
    ratings = torch.FloatTensor(df_train.rating.values)

    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model)

In [27]:
def test_loss(model):
    """Print the mean squared error of `model` on the global `df_val` frame.

    The model is switched to eval mode and is left in that mode when the
    function returns.
    """
    model.eval()
    users = torch.LongTensor(df_val.userId.values)
    items = torch.LongTensor(df_val.movieId.values)
    ratings = torch.FloatTensor(df_val.rating.values)
    # Evaluation needs no gradients; no_grad avoids building the autograd
    # graph for the whole validation set.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [28]:
train_epocs(model, epochs=10, lr=0.1)

12.909994125366211
4.846724987030029
2.605041265487671
3.0928735733032227
0.8487635850906372
1.8233176469802856
2.657480001449585
2.135988712310791
1.0914498567581177
0.976857602596283
test loss 1.849 


In [29]:
train_epocs(model, epochs=15, lr=0.01)

1.6414275169372559
1.0041557550430298
0.7121437191963196
0.6614487171173096
0.726119875907898
0.8040390610694885
0.8433794379234314
0.8350298404693604
0.7926613688468933
0.7371463775634766
0.6874354481697083
0.6556161642074585
0.6448607444763184
0.6501439809799194
0.6616281270980835
test loss 0.822 


Note that these models are sensitive to weight initialization, the choice of optimization algorithm, and regularization.

## Check prediction results

In [30]:
# Predict a rating for every (user, movie) pair in the encoded validation frame.
u = torch.LongTensor(df_val.userId.values) #.cuda()
v = torch.LongTensor(df_val.movieId.values) #.cuda()
# detach() separates the result from the autograd graph so it can be converted to a numpy array.
mm = model(u,v).detach().numpy()

In [31]:
mm

array([5.6305885, 4.5766683, 4.2627125, ..., 4.549139 , 4.33224  ,
       4.362982 ], dtype=float32)

In [32]:
df_val.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
61626,407,651,5.0,1467371716
25186,176,856,3.0,1435534874
24826,173,560,5.0,848487313
67660,437,1246,3.5,1105649492
71027,452,159,5.0,972621282


In [33]:
# Predicted rating for a single pair: encoded user id 570, encoded movie id 8352.
u = torch.LongTensor(np.array([570])) #.cuda()
v = torch.LongTensor(np.array([8352])) #.cuda()
model(u,v).detach().numpy()

array([2.226849], dtype=float32)

In [34]:
df_train.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
30250,210,2679,4.0,1350912713
76334,479,999,4.0,1179178545
89998,584,273,4.0,1307417190
16377,104,372,3.5,1446573297
86784,560,149,3.5,1491092031


In [35]:
# Predicted rating for encoded user 560 and encoded movie 149 — the last row of the
# df_train sample displayed just before this cell, whose actual rating is 3.5.
u = torch.LongTensor(np.array([560])) #.cuda()
v = torch.LongTensor(np.array([149])) #.cuda()
model(u,v).detach().numpy()

array([4.028096], dtype=float32)

## MF using Alternating Least Squares (ALS)

In [37]:
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark import SparkContext, SparkConf

In [38]:
# default Spark configuration
conf = SparkConf()

# Create the context, or reuse the one that is already running: constructing a
# second SparkContext in the same process raises an error, which would make
# this cell fail whenever it is re-run.
sc = SparkContext.getOrCreate(conf=conf)
# create (or reuse) the session
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [39]:
# convert Pandas to Spark DataFrames
spark_train = spark.createDataFrame(df_train)
spark_val = spark.createDataFrame(df_val)

In [40]:
# run ALS with a maximum of 10 iterations, a rank of 10, and regularization parameter value of 0.01
# coldStartStrategy="drop": presumably discards rows whose prediction is NaN
# (user or item unseen at fit time) — confirm against the Spark ALS docs.
# NOTE(review): this rebinds `model`, which until now held the PyTorch MF model;
# re-running the earlier prediction cells after this one would call the ALS model instead.
als = ALS(maxIter=10, rank=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(spark_train)

In [41]:
# *Training* RMSE: score the same rows the ALS model was fitted on
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(spark_train)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 0.4724074795310108


In [42]:
# *Validation* RMSE: score the held-out rows
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(spark_val)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 1.1069463732569145


# Zero-Shot Classification
https://huggingface.co/tasks/zero-shot-classification

In [43]:
!pip install transformers



In [44]:
from transformers import pipeline

# No task name is passed; presumably the task (zero-shot classification) is inferred
# from the model — confirm in the transformers pipeline docs.
# The first call downloads the model files (model.safetensors is about 1.63 GB per the progress output).
pipe = pipeline(model="facebook/bart-large-mnli")
# Score the sentence against each candidate label.
pipe("I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent"],
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!',
 'labels': ['urgent', 'not urgent'],
 'scores': [0.9948899745941162, 0.005109988618642092]}

In [45]:
# Same sentence with a different label set. In the recorded output the labels come back
# ordered by descending score, not in the order they are listed here.
pipe("I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["phone", "tablet", "computer"],
)

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!',
 'labels': ['phone', 'computer', 'tablet'],
 'scores': [0.9654064178466797, 0.03005870245397091, 0.004534877371042967]}