In [3]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import torch
import torch.nn as nn
import re
import nltk
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)  
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w in stop_words]
    ps = PorterStemmer()
    text = [ps.stem(w) for w in text]
    text = ' '.join(text)
    return text

In [6]:
dataset_address = "/content/drive/MyDrive/IR/HW3/data.json"

with open(dataset_address) as f:
    data = json.load(f)

train_dataset = pd.DataFrame(data["train"])
test_dataset = pd.DataFrame(data["test"])
validation_dataset = pd.DataFrame(data["val"])

In [7]:
# drop pics column
train_dataset = train_dataset.drop(columns=["pics"])
test_dataset = test_dataset.drop(columns=["pics"])
validation_dataset = validation_dataset.drop(columns=["pics"])


In [8]:
user_id = np.unique(np.concatenate((train_dataset["user_id"].values, test_dataset["user_id"].values, validation_dataset["user_id"].values)))
business_id = np.unique(np.concatenate((train_dataset["business_id"].values, test_dataset["business_id"].values, validation_dataset["business_id"].values)))

# create a dictionary for user_id and business_id
user_id_to_index = {user_id[i]: i for i in range(len(user_id))}
business_id_to_index = {business_id[i]: i for i in range(len(business_id))}

In [9]:
# convert user_id and business_id to user index and business index
train_dataset["user_index"] = train_dataset["user_id"].map(user_id_to_index)
train_dataset["business_index"] = train_dataset["business_id"].map(business_id_to_index)

test_dataset["user_index"] = test_dataset["user_id"].map(user_id_to_index)
test_dataset["business_index"] = test_dataset["business_id"].map(business_id_to_index)

validation_dataset["user_index"] = validation_dataset["user_id"].map(user_id_to_index)
validation_dataset["business_index"] = validation_dataset["business_id"].map(business_id_to_index)

In [10]:
# preprocess text
train_dataset["review_text"] = train_dataset["review_text"].apply(preprocess_text)
test_dataset["review_text"] = test_dataset["review_text"].apply(preprocess_text)
validation_dataset["review_text"] = validation_dataset["review_text"].apply(preprocess_text)

In [11]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m87.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1


In [12]:
# use pretrained bert model
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
class create_dataset(Dataset):
    def __init__(self, dataset, tokenizer, model, device='cuda'):
        self.dataset = dataset
        self.embed_user = self.bert_embed_creator(tokenizer, model.to(device), dataset, device, kind='user')
        self.embed_resturant = self.bert_embed_creator(tokenizer, model.to(device), dataset, device, kind='business')

    def bert_embed_creator(self,tokenizer, model, dataset, device, kind):
      model.eval()
      with torch.no_grad():
          dataframe = list()
          coulum = kind + "_index"
          for item in tqdm(dataset[coulum].unique()):
              reviews =  dataset[dataset[coulum] == item]['review_text'].to_list() 
              tokens = tokenizer(reviews, padding='max_length', max_length=128, truncation=True, return_tensors="pt").to(device)
              outputs = model(**tokens)
              embed = outputs.last_hidden_state.mean(dim=1)
              embed = embed.mean(dim=0)
              embed = embed.cpu().detach().numpy()
              dataframe.append((item, embed))
          dataframe = pd.DataFrame(dataframe, columns=[kind, 'review'])
          dataframe = dataframe.set_index(kind)
      return dataframe

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        item = self.dataset.iloc[index]
        user_index = item['user_index']
        business_index = item['business_index']
        user_review = torch.tensor(self.embed_user.loc[user_index]['review']).to(torch.float)
        business_review = torch.tensor(self.embed_resturant.loc[business_index]['review']).to(torch.float)
        output = {'user': user_index,'business': business_index,'label': item['rating'],'business_review': business_review,'user_review': user_review}
        return output

In [14]:
train_class = create_dataset(train_dataset, bert_tokenizer=tokenizer, bert_model=model)
validation_class = create_dataset(validation_dataset, bert_tokenizer=tokenizer, bert_model=model)
test_class = create_dataset(test_dataset, bert_tokenizer=tokenizer, bert_model=model)

train_loader = DataLoader(train_class, batch_size=1024, shuffle=True)
val_loader = DataLoader(validation_class, batch_size=1024, shuffle=False)
test_loader = DataLoader(test_class, batch_size=1024, shuffle=False)

Creating Reviews-based Embedding for User: 100%|██████████| 29596/29596 [13:15<00:00, 37.19it/s]
Creating Reviews-based Embedding for Business: 100%|██████████| 27896/27896 [14:13<00:00, 32.67it/s]
Creating Reviews-based Embedding for User: 100%|██████████| 3700/3700 [01:37<00:00, 37.87it/s]
Creating Reviews-based Embedding for Business: 100%|██████████| 7835/7835 [02:07<00:00, 61.68it/s]
Creating Reviews-based Embedding for User: 100%|██████████| 3700/3700 [01:38<00:00, 37.56it/s]
Creating Reviews-based Embedding for Business: 100%|██████████| 7880/7880 [02:07<00:00, 61.89it/s]


In [15]:
class NCF_with_review(nn.Module):
    def __init__(self, num_users, num_items, latent_dim, hidden_dim,text_dim):
        super(NCF_with_review, self).__init__()
        self.user_embedding = nn.Embedding(num_users+1, latent_dim)
        self.item_embedding = nn.Embedding(num_items+1, latent_dim)
        self.mlp = nn.Sequential(
            nn.Linear(2 * latent_dim +  2 * text_dim , hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Linear(hidden_dim//2, hidden_dim//4),
            nn.ReLU(),
            nn.Linear(hidden_dim//4, 1)
        )
        
    def forward(self, user_idx, item_idx ,resturant_review_embedding ,user_review_embedding):
        users = self.user_embedding(user_idx)
        items = self.item_embedding(item_idx)
        concat = torch.cat([users, items,user_review_embedding , resturant_review_embedding], dim=1)
        return self.mlp(concat)

In [16]:
num_users = len(set(user_id_to_index))
num_items = len(set(business_id_to_index))
latent_dim = 2024
hidden_dim = 4048
text_dim = 768
model = NCF_with_review(num_users, num_items, latent_dim, hidden_dim,text_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

In [None]:
train_loss , validation_loss = [], []
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for data in train_loader:
        outputs = model(data["user"], data["business"],data["business_review"],data["user_review"])
        loss = criterion(outputs.reshape(-1, 1), data["label"].float().reshape(-1, 1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    model.eval()
    with torch.no_grad():
        total = 0
        correct = 0
        for val_data in val_loader:
            outputs = model(val_data["user"], val_data["business"],val_data["business_review"],val_data["user_review"])
            total += len(val_data["label"])
            correct_prediction = (torch.abs(outputs.reshape(-1, 1) - val_data["label"].float().reshape(-1, 1)) < 1).sum().item()
            correct += correct_prediction
        print('Validation Accuracy of the model on the validation data: {} %'.format(100 * correct / total))
    train_loss.append(loss.item())
    validation_loss.append(100 * correct / total)

In [18]:
# test the model
model.eval()
with torch.no_grad():
    total = 0
    correct = 0
    predicts = []
    for test_data in test_loader:
        outputs = model(test_data["user"], test_data["business"],test_data["business_review"],test_data["user_review"])
        predicts.append(outputs)
        total += len(test_data["label"])
        correct += (torch.abs(outputs.reshape(-1,1) - test_data["label"].float().reshape(-1,1)) < 1).sum().item()
    print('Test Accuracy of the model on the test data: {} %'.format(100 * correct / total))

Test Accuracy of the model on the test data: 89.05129369042216 %


In [19]:
test_dataset["predicts"] = torch.cat(predicts).numpy()

In [20]:
def recall_at_k(real_output, prediction, k=5):
    output = 0
    if sum(real_output[:k]) != 0:
      count = [1 if (a == b and a != 0) else 0 for a, b in zip(real_output, prediction)]
      output = sum(count[:k]) / sum(real_output[:k])
    return output

In [21]:
def precision_at_k(real_output, prediction, k=5):
    count = [1 if a == b else 0 for a, b in zip(real_output, prediction)]
    output =  sum(count[:k]) / k
    return output

In [23]:
def evaluate(dataset, k=5):
    precisions = []
    recalls = []
    users = dataset.groupby(by='user_index')
    for _, data in users:
        predictions = np.array(data['predicts']).reshape(-1)
        if(len(predictions) < k):
            continue
        i = np.argsort(predictions)[::-1]
        tr_prediction = [1 if ele >= 3 else 0 for ele in predictions[i]]
        tr_real = [1 if ele >= 3 else 0 for ele in np.array(data['rating'])[i]]
        precisions.append(precision_at_k(tr_real, tr_prediction, k=k))
        recalls.append(recall_at_k(tr_real, tr_prediction, k=k))
    return precisions, recalls

In [25]:
def evaluate_2(dataset, k=5):
    precisions = []
    recalls = []
    users = dataset.groupby(by='user_index')
    for _, data in users:
        predictions = np.array(data['predicts']).reshape(-1)
        if(len(predictions) < k):
            continue
        i = np.argsort(predictions)[::-1]
        tr_prediction = [1 if item >= 3.5 else 0 for item in predictions[i]]
        tr_real = [1 if item >= 3.5 else 0 for item in np.array(data['rating'])[i]]
        precisions.append(precision_at_k(tr_real, tr_prediction, k=k))
        recalls.append(recall_at_k(tr_real, tr_prediction, k=k))
    return precisions, recalls


In [None]:
precisions, recalls = evaluate(test_dataset)

In [24]:
print("Precision@5: ", np.mean(precisions))
print("Recall@5: ", np.mean(recalls))

Precision@5:  0.9678733031674208
Recall@5:  1.0


In [26]:
precisions, recalls = evaluate_2(test_dataset)

In [27]:
print("Precision@5: ", np.mean(precisions))
print("Recall@5: ", np.mean(recalls))

Precision@5:  0.8891402714932126
Recall@5:  1.0


In [28]:
precisions, recalls = evaluate(test_dataset,k=4)

In [29]:
print("Precision@4: ", np.mean(precisions))
print("Recall@4: ", np.mean(recalls))

Precision@4:  0.9614873837981408
Recall@4:  1.0


In [30]:
precisions, recalls = evaluate_2(test_dataset,k=4)

In [31]:
print("Precision@4: ", np.mean(precisions))
print("Recall@4: ", np.mean(recalls))

Precision@4:  0.8844621513944223
Recall@4:  1.0
