Install required libraries

In [1]:
pip install ijson --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/111.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m71.7/111.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.8/111.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [4]:
pip install -U sentence-transformers --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m61.4/86.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [5]:
#mount the google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
#import required libraries

import ijson
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer, T5Model
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi

In [None]:
#base path where all project files are stored
base_path = "drive/MyDrive/CS646"

In [None]:
# This functions creates clusters for a given user profile. It puts reviews with same rating in a single cluster

def get_clusters_for_user(user_profiles):
    clusters = {}
    clusters['1'] = []
    clusters['2'] = []
    clusters['3'] = []
    clusters['4'] = []
    clusters['5'] = []

    for profile in user_profiles:
        text = profile.get('text')
        score = profile.get('score')
        clusters[score].append(text)

    return clusters

In [None]:
# This function will return top 1 review from the user profile, using the BM25 similarity scores

def get_bm25_top1(user_profiles, query):
    clusters = {}
    clusters['1'] = []
    clusters['2'] = []
    clusters['3'] = []
    clusters['4'] = []
    clusters['5'] = []
    corpus = {}

    max_score = 0
    output_score = 0
    output_review = ""

    for profile in user_profiles:
        text = profile.get('text')
        score = profile.get('score')
        clusters[score].append(text)

        tokenized_corpus = [text.split(" ")]
        bm25 = BM25Okapi(tokenized_corpus)

        tokenized_query = query.split(" ")

        doc_scores = bm25.get_scores(tokenized_query)

        if doc_scores[0] > max_score:
            max_score = doc_scores[0]
            output_score = score
            output_review = text

    return output_score, output_review

In [None]:
# Generate the prompt from the reviews

def generate_llm_input(input, top_k_reviews):
  prompt = ""
  for score,review in top_k_reviews.items():
    prompt += str(score) + " is the score for " + '"' + review + '"' + ", and "

  prompt = prompt[:-6]
  prompt += input

  # print(prompt)

  return prompt

In [None]:
# The model to find embedding for a text

%%capture
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# get language model score for the given input review and existing relevant reviews

def get_language_model_score(input, top_k_reviews):
  model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

  prompt = generate_llm_input(input, top_k_reviews)
  # print("prompt is :" + prompt)

  inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
  outputs = model.generate(**inputs, max_new_tokens=20)
  rating = tokenizer.batch_decode(outputs, skip_special_tokens=True)

  return rating

In [None]:
# predict the ratings on the validation data

def val(start_index, end_index):
    val_ratings = {}
    count = 0
    iterated = 0
    invalid_ratings = 0
    with open(base_path + '/dev_questions.json', 'r') as file:
        parser = ijson.items(file, 'item')
        for item in parser:
            count += 1
            if count<start_index or count>end_index:
                continue
            user_id = item.get('id')
            user_profiles = item.get('profile')
            input = item.get('input')

            test_review = input.split(':')[1].strip()

            output_score, output_review = get_bm25_top1(user_profiles, test_review)

            # here k = 1
            top_k_reviews = {output_score: output_review}

            rating = get_language_model_score(input, top_k_reviews)
            rating = rating[0]
            if rating not in ('1', '2', '3', '4', '5'):
                rating = 5
                invalid_ratings += 1

            val_ratings[user_id] = rating
            # print("rating is " + rating)


            iterated += 1
            if iterated%100 == 0:
                print("iterated: " + str(iterated))

    return val_ratings


In [7]:
# Find the predictions

val_ratings = val(1,2500)

In [None]:
# Storing the predictions in a csv file

df = pd.DataFrame(val_ratings.items(), columns=['id', 'rating'])
df.head(10)
df.to_csv(base_path + '/val_ratings_bm25.csv')

Metrics Calculations

In [None]:
# Here, we fetch the predictions of our model

y_predicted = []

filenames = ['val_ratings_bm25.csv']

for filename in filenames:
    filepath = base_path + '/' + filename
    with open(filepath, 'r') as f:
        count = 0
        for line in f:
            if count != 0:
                y_predicted.append(int(line.split(',')[2]))
            count += 1

In [None]:
len(y_predicted)

2500

In [None]:
# Here, we find the ground truth outputs

import json

y_actual = []
with open(base_path + '/dev_outputs.json') as f:
    d = json.load(f)
    golds = d.get('golds')
    for gold in golds:
        y_actual.append(int(gold.get('output')))
    print("y_actual ", y_actual)


y_actual  [2, 5, 5, 1, 5, 3, 4, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 3, 5, 4, 5, 5, 4, 5, 4, 2, 3, 5, 2, 4, 5, 4, 3, 5, 5, 5, 5, 5, 4, 4, 1, 5, 5, 4, 5, 4, 5, 4, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 4, 5, 4, 1, 4, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 3, 5, 5, 1, 5, 3, 3, 4, 5, 4, 4, 3, 4, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 4, 5, 4, 2, 5, 5, 4, 5, 4, 5, 5, 5, 4, 5, 4, 5, 3, 4, 3, 3, 5, 5, 4, 4, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 4, 3, 4, 4, 5, 5, 3, 5, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 3, 1, 5, 5, 5, 1, 5, 5, 5, 4, 4, 3, 5, 5, 4, 4, 1, 4, 3, 5, 3, 2, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 4, 3, 5, 4, 2, 4, 4, 4, 1, 5, 5, 5, 4, 5, 4, 5, 5, 2, 5, 3, 4, 5, 5, 4, 5, 3, 5, 5, 1, 2, 5, 4, 3, 5, 2, 5, 2, 5, 5, 5, 4, 4, 4, 5, 4, 5, 5, 5, 5, 5, 2, 5, 5, 4, 5, 4, 5, 5, 4, 5, 5, 5, 4, 5, 4, 4, 5, 5, 1, 5, 4, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 4, 5, 4, 5, 5, 5, 3, 4, 4, 3, 4, 5, 4, 5, 5, 5, 1, 5, 1, 4, 3, 5, 4, 2, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 3, 5, 5, 3, 5, 5, 2, 5,

In [None]:
# Now, we find the RMSE and MAE metrics

from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

rms = sqrt(mean_squared_error(y_actual, y_predicted, squared=False))
mae = mean_absolute_error(y_actual, y_predicted)

print("rms ", rms)
print("mae ", mae)

rms  1.0478540826397846
mae  0.72
