Fetching the MovieLens Dataset

In [None]:
# Extract the ml-100k.zip archive (expected in the working directory) into sample_data/.
!unzip ml-100k.zip -d sample_data

Joining various datasets

In [None]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
import json

# Directory holding the extracted dataset files. Named `data_dir` so the
# builtin `dir()` is not shadowed for the rest of the notebook.
data_dir = 'sample_data/ml-100k'

# u.data: tab-separated rating events without a header row.
col_names = ['user id', 'item id', 'rating', 'timestamp']
data = pd.read_csv(os.path.join(data_dir, 'u.data'), delimiter='\t', names=col_names, header=None)
# Vectorised epoch-seconds conversion. Unlike datetime.fromtimestamp, the
# result does not depend on the timezone of the machine running the notebook.
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')

# u.item: pipe-separated, ISO-8859-1 encoded; columns are assigned below.
movie = pd.read_csv(os.path.join(data_dir, 'u.item'), delimiter='|', header=None, encoding='ISO-8859-1')

movie.columns = ['item id', 'title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# u.user: pipe-separated, same encoding as u.item.
user = pd.read_csv(os.path.join(data_dir, 'u.user'), delimiter='|', header=None, encoding='ISO-8859-1')

user.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']

# Attach the movie title to each rating, flag ratings above 3 as a "like",
# and order the rows by user. Built as one expression so re-running the cell
# never mutates a previously created frame in place.
ratings = (
    data.merge(movie[['item id', 'title']], on='item id')
    .assign(like=lambda df: df['rating'] > 3)
    .sort_values(by=['user id'])
)

     

Splitting the DataFrame into train and test sets

In [None]:
# Randomly draw 90% of the rating rows for training (fixed seed);
# every row that was not drawn forms the test set.
train_ratio = 0.9
train_size = int(train_ratio * len(ratings))
ratings_train = ratings.sample(n=train_size, random_state=42)
ratings_test = ratings.drop(index=ratings_train.index)

Sparse Matrix

In [None]:
from scipy.sparse import csr_matrix

# Size the matrix from the full ratings table rather than the training sample,
# so a user or item that only appears in the test split still has a row/column.
n_users = ratings['user id'].max()
n_item = ratings['item id'].max()

# Only "liked" interactions (rating > 3) count as positive feedback.
ratings_train_pos = ratings_train[ratings_train['like']]
ratings_test_pos = ratings_test[ratings_test['like']]

# Ids in the ratings are 1-based; matrix indices are 0-based.
row = ratings_train_pos['user id'].values - 1
col = ratings_train_pos['item id'].values - 1
# Named `interactions` (not `data`) so the raw ratings DataFrame `data`
# created in the loading cell is not overwritten by this array.
interactions = np.ones(len(ratings_train_pos))
user_item_data = csr_matrix((interactions, (row, col)), shape=(n_users, n_item))

In [None]:
import implicit

# ALS hyper-parameters kept together so they are easy to spot and tune.
als_params = {"factors": 50, "random_state": 42}
model = implicit.als.AlternatingLeastSquares(**als_params)

# Train on the user x item matrix of positive interactions.
model.fit(user_item_data)

Functions to recommend items from the matrix above and to evaluate the recommendations

In [None]:
from sklearn.metrics import dcg_score, ndcg_score

def precision_k(actuals, recs, k=5):
  """Return the share of the first ``k`` recommendations found in ``actuals``.

  Duplicates are ignored on both sides; the denominator is always ``k``.
  """
  top_k = set(recs[:k])
  hits = top_k & set(actuals)
  return len(hits) / k

def recall_k(actuals, recs, k=5):
  """Return the share of ``actuals`` recovered by the first ``k`` recommendations.

  Args:
    actuals: ids of the relevant items for the user.
    recs: recommended ids, best first.
    k: number of leading recommendations to consider.

  Returns:
    Hits divided by ``len(actuals)``; ``0.0`` when ``actuals`` is empty.
  """
  # A user without relevant items would otherwise cause a division by zero.
  if len(actuals) == 0:
    return 0.0
  hits = set(recs[:k]) & set(actuals)
  return len(hits) / len(actuals)

def dcg_k(actuals, recs, k=5):
  """Return the DCG of the first ``k`` recommendations.

  Each recommendation has relevance 1.0 if it is in ``actuals`` and 0.0
  otherwise. The ranking passed to ``dcg_score`` is the order of ``recs``
  (descending synthetic scores).

  Args:
    actuals: ids of the relevant items for the user.
    recs: recommended ids, best first.
    k: number of leading recommendations to consider.

  Returns:
    The DCG value, or ``0.0`` when there are no recommendations.
  """
  top = recs[0:k]
  n = len(top)
  if n == 0:
    return 0.0
  relevant = set(actuals)
  relevance = np.array([[float(i in relevant) for i in top]])
  # The score vector must match the relevance length: using a fixed length of
  # k breaks when fewer than k recommendations are supplied.
  score = (n - np.arange(n)).reshape(1, -1)
  return dcg_score(relevance, score, k=k)

def ndcg_k(actuals, recs, k=5):
  """Return the NDCG of the first ``k`` recommendations.

  Each recommendation has relevance 1.0 if it is in ``actuals`` and 0.0
  otherwise. The ranking passed to ``ndcg_score`` is the order of ``recs``
  (descending synthetic scores), so normalisation is over the same top-k list.

  Args:
    actuals: ids of the relevant items for the user.
    recs: recommended ids, best first.
    k: number of leading recommendations to consider.

  Returns:
    The NDCG value, or ``0.0`` when there are no recommendations.
  """
  top = recs[0:k]
  n = len(top)
  if n == 0:
    return 0.0
  relevant = set(actuals)
  if n == 1:
    # A one-item ranking is already ideal: 1.0 if the item is relevant, else 0.0.
    # Computed directly because some scikit-learn releases raise ValueError
    # for single-document input.
    return float(top[0] in relevant)
  relevance = np.array([[float(i in relevant) for i in top]])
  # The score vector must match the relevance length: using a fixed length of
  # k breaks when fewer than k recommendations are supplied.
  score = (n - np.arange(n)).reshape(1, -1)
  return ndcg_score(relevance, score, k=k)

def recall_stage(model, user_id, user_item_data, ratings_train,N_recall ):
  """Ask ``model`` for ``N_recall`` candidate items for one user.

  ``user_id`` and the returned item ids are 1-based, as in the ratings
  table; they are shifted by one to address the 0-based matrix. Every item
  the user has a row for in ``ratings_train`` is passed as ``filter_items``.

  Returns:
    ``(recs, scores)``: the flattened 1-based item ids and the scores as
    returned by ``model.recommend``.
  """
  seen_mask = ratings_train['user id'] == user_id
  seen_idx = ratings_train.loc[seen_mask, 'item id'].values - 1
  user_idx = user_id - 1

  item_idx, scores = model.recommend(
      user_idx,
      user_item_data[user_idx],
      filter_items=seen_idx,
      N=N_recall,
  )
  return item_idx.flatten() + 1, scores

def evaluate(user_id, ratings_test_pos, recs, k=5):
  """Score ``recs`` against one user's item ids in ``ratings_test_pos``.

  Returns:
    A 3-tuple ``(precision@k, recall@k, DCG@k)``.
  """
  is_user = ratings_test_pos['user id'] == user_id
  actuals = ratings_test_pos.loc[is_user, 'item id'].values
  metric_fns = (precision_k, recall_k, dcg_k)
  return tuple(fn(actuals, recs, k) for fn in metric_fns)


Evaluation metrics for a sample user

In [None]:
# Recall 5 candidates for user 5, then show (precision@5, recall@5, DCG@5).
recs, scores = recall_stage(
    model,
    user_id=5,
    user_item_data=user_item_data,
    ratings_train=ratings_train,
    N_recall=5,
)
evaluate(user_id=5, ratings_test_pos=ratings_test_pos, recs=recs, k=5)

Sample Recommendations based on Matrix Factorization for a user

In [None]:
# `recs` holds 1-based item ids, not row positions, so look the movies up by
# their 'item id' label (positional `iloc` would show the wrong movies and can
# fall off the end of the table). The order of `recs` is preserved.
movie.set_index('item id').loc[recs].reset_index()

Ranking of the Recommendations above by an LLM

In [None]:
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
import openai
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the environment.
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Report whether a key was found without ever printing the key itself.
key_status = (
    "API Key successfully retrieved!"
    if OPENAI_API_KEY
    else "API Key not found. Please set the environment variable."
)
print(key_status)

# Chat model used for the ranking stage; temperature 0.0 is passed through as-is.
llm_model = "gpt-3.5-turbo"
llm = ChatOpenAI(model=llm_model, temperature=0.0, api_key=OPENAI_API_KEY)

# Ranking prompt. Inputs: {movies_liked}, {movies_disliked}, {movies_candidates}.
# The backslashes in the FORMATTING section are written as `\\` so the template
# text is unchanged while avoiding Python's invalid-escape-sequence warning for
# a backslash followed by a space. A trailing `\` at the end of a line is a
# line continuation inside the string.
prompt = ChatPromptTemplate.from_template(
"""The person has a list of liked movies: {movies_liked}. \
The person has a list of disliked movies: {movies_disliked}. \
Tell me if this person likes each of the candidate movies: {movies_candidates}.\
Return a list of boolean values and explain why the person likes or dislikes.

<< FORMATTING >>
Return a markdown code snippet with a list of JSON object formatted to look like:
{{
    "title": string \\ the name of the movie in candidate movies
    "like": boolean \\ true or false
    "explanation": string \\ explain why the person likes or dislikes the candidate movie
}}


REMEMBER: Each boolean and explanation for each element in candidate movies.
REMEMBER: The explanation must relate to the person's liked and disliked movies.
"""
)

# Chain that sends the ranking prompt to the chat model.
chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
# Smoke test: check that the LLM answers a trivial question.
prompt_new = ChatPromptTemplate.from_template(
    """You are an AI assistant. Answer the following question concisely: {question}"""
)

# Use a dedicated name for this chain. Rebinding `chain` here would replace the
# ranking chain, and the later ranking_stage(chain, ...) call would then run
# with this question-answering prompt instead of the ranking prompt.
chain_new = LLMChain(llm=llm, prompt=prompt_new)

# Define a sample input
sample_input = {
    "question": "What is the capital of France?"
}

# Run the chain with the sample input
response = chain_new.run(sample_input)

# Print the response
print(response)

In [None]:
def _extract_json_objects(text):
  """Return every top-level JSON object found in ``text``, in order of appearance.

  Text around the objects (markdown fences, list brackets, commas, prose) is
  skipped, so both a bare sequence of objects and a fenced JSON list work.
  """
  # strict=False tolerates raw control characters (e.g. newlines) inside strings.
  decoder = json.JSONDecoder(strict=False)
  objects = []
  pos = text.find('{')
  while pos != -1:
    try:
      obj, end = decoder.raw_decode(text, pos)
    except json.JSONDecodeError:
      # No parseable object starts at this brace; resume scanning after it.
      end = pos + 1
    else:
      objects.append(obj)
    pos = text.find('{', end)
  return objects


def _as_bool(value):
  """Coerce a JSON ``like`` value to bool, accepting "true"/"false" strings."""
  if isinstance(value, str):
    return value.strip().lower() == 'true'
  return bool(value)


def ranking_stage(chain, user_id, ratings_train, pre_recs, movie, batch_size=10):
  """Re-rank recalled items with the LLM chain: liked candidates first.

  Args:
    chain: object whose ``run(movies_liked=..., movies_disliked=...,
      movies_candidates=...)`` returns the model's text answer.
    user_id: user id as stored in ``ratings_train['user id']``.
    ratings_train: frame with 'user id', 'title' and boolean 'like' columns;
      up to 20 of the user's rows (sampled with a fixed seed) are used as
      liked/disliked examples.
    pre_recs: candidate item ids (values of ``movie['item id']``), best first.
    movie: frame with 'item id' and 'title' columns.
    batch_size: number of candidates sent to the chain per call.

  Returns:
    DataFrame with the keys returned by the LLM plus 'item id'. Rows with
    ``like`` true come first, then the rest; the order within each group
    follows ``pre_recs``. An empty frame is returned when ``pre_recs`` is empty.

  Raises:
    ValueError: if the answer for a batch does not contain exactly one JSON
      object per candidate.
  """
  few_shot = ratings_train[ratings_train['user id'] == user_id]
  if len(few_shot) >= 20:
    few_shot = few_shot.sample(20, random_state=42)
  recall_recs = movie.set_index('item id').loc[pre_recs].reset_index()

  movies_liked = ','.join(few_shot[few_shot['like']]['title'].values.tolist())
  movies_disliked = ','.join(few_shot[~few_shot['like']]['title'].values.tolist())

  candidates = recall_recs[['item id', 'title']]
  result_json = []

  for start in range(0, len(candidates), batch_size):
    candidates_batch = candidates.iloc[start:start + batch_size]
    movies_candidates = ','.join(candidates_batch['title'].values.tolist())
    result = chain.run(movies_liked=movies_liked, movies_disliked=movies_disliked, movies_candidates=movies_candidates)

    batch_entries = _extract_json_objects(result)
    # Item ids are attached by position below, so each batch must yield
    # exactly one entry per candidate.
    if len(batch_entries) != len(candidates_batch):
      raise ValueError(
          'LLM returned %d entries for %d candidate movies'
          % (len(batch_entries), len(candidates_batch)))
    for entry in batch_entries:
      # A missing 'like' key is treated as "not liked".
      entry['like'] = _as_bool(entry.get('like', False))
    result_json.extend(batch_entries)

  if not result_json:
    return pd.DataFrame(columns=['title', 'like', 'explanation', 'item id'])

  result_rank = pd.DataFrame(result_json)
  result_rank['item id'] = recall_recs['item id'].values
  liked = result_rank['like']
  result_rank = pd.concat([result_rank[liked], result_rank[~liked]])

  return result_rank

In [None]:
# Re-rank the recalled candidates for user 5 with the LLM chain.
rank_result = ranking_stage(chain, 5, ratings_train, recs, movie)
rank_recs = rank_result['item id'].values

# evaluate() returns (precision@k, recall@k, DCG@k). The third value is DCG,
# not NDCG, so it is named accordingly.
p, r, dcg = evaluate(5, ratings_test_pos, rank_recs, k=5)
p, r, dcg