# Movie Recommendation ChatBOT with Meta Data Enrichment

In [1]:
import math
import yaml, os
import numpy as np
import pandas as pd 
from pathlib import Path
from llama_index import Document
from llama_index.llms import AzureOpenAI
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from llama_index.node_parser import SimpleNodeParser
from sentence_transformers import SentenceTransformer
from llama_index.embeddings import HuggingFaceEmbedding 
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.finetuning import (
                                    generate_qa_embedding_pairs,
                                    EmbeddingQAFinetuneDataset,
                                    SentenceTransformersFinetuneEngine
                                    )

pd.set_option('display.max_columns', 100)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('/Users/1zuu/Desktop/LLM RESEARCH/LLMPro/cadentials.yaml') as f:
    credentials = yaml.load(f, Loader=yaml.FullLoader)

os.environ['AD_OPENAI_API_KEY'] = credentials['AD_OPENAI_API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']

data_path = 'movies_metadata.csv'

In [3]:
def preprocess_dataset(row):
    belongs_to_collection = row['belongs_to_collection']
    belongs_to_collection = 'NULL' if pd.isnull(belongs_to_collection) else belongs_to_collection
    belongs_to_collection = eval(belongs_to_collection)['name'] if belongs_to_collection != 'NULL' else 'NULL'

    genres = row['genres']
    genres = 'NULL' if pd.isnull(genres) else genres
    if genres != 'NULL':
        genres = eval(genres)
        genres = [genre['name'] for genre in genres]
        
    row['belongs_to_collection'] = belongs_to_collection
    row['genres'] = genres

    return row

In [4]:
df = pd.read_csv(data_path)
df = df.apply(preprocess_dataset, axis=1)
df = df[[
        'adult',
        'belongs_to_collection',
        'budget',
        'genres',
        'original_language',
        'original_title',
        'overview',
        'popularity',
        'revenue',
        'runtime',
        'vote_average',
        'vote_count'
        ]]
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,original_language,original_title,overview,popularity,revenue,runtime,vote_average,vote_count
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,373554033.0,81.0,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,262797249.0,104.0,6.9,2413.0
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,0.0,101.0,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,81452156.0,127.0,6.1,34.0
4,False,Father of the Bride Collection,0,[Comedy],en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,76578911.0,106.0,5.7,173.0


In [5]:
def load_corpus(data_path, verbose=False):
    df = pd.read_csv(data_path)
    df = df.apply(preprocess_dataset, axis=1)
    df = df[[
            'adult',
            'belongs_to_collection',
            'budget',
            'genres',
            'original_language',
            'original_title',
            'overview',
            'popularity',
            'revenue',
            'runtime',
            'vote_average',
            'vote_count'
            ]]

    if verbose:
        print(f"Loading {len(df)} movies from {data_path}")

    documents = []
    for i, row in df.iterrows():
        doc = Document(
                    id=i,
                    text=row['overview'],
                    metadata={
                            'title': row['original_title'],
                            'genres': row['genres'],
                            'belongs_to_collection': row['belongs_to_collection'],
                            'budget': row['budget'],
                            'popularity': row['popularity'],
                            'revenue': row['revenue'],
                            'runtime': row['runtime'],
                            'vote_average': row['vote_average'],
                            'vote_count': row['vote_count']
                            })
        documents.append(doc)

    return documents

# Configure LLMs

In [6]:
embedding_llm = HuggingFaceEmbedding(
                                    model_name="BAAI/bge-small-en-v1.5",
                                    device="mps"
                                    )
llm=AzureOpenAI(
                deployment_name=credentials['AD_DEPLOYMENT_ID'],
                model=credentials['AD_ENGINE'],
                api_key=credentials['AD_OPENAI_API_KEY'],
                api_version=credentials['AD_OPENAI_API_VERSION'],
                azure_endpoint=credentials['AD_OPENAI_API_BASE']
                )
chat_llm = LLMPredictor(llm)

service_context = ServiceContext.from_defaults(
                                                embed_model=embedding_llm,
                                                llm_predictor=chat_llm
                                                )
set_global_service_context(service_context)

# Creating Dataset

In [7]:
documents = load_corpus(data_path)
documents[0]

Document(id_='d75b4244-3a4e-4cd4-93e1-bea26764e265', embedding=None, metadata={'title': 'Toy Story', 'genres': ['Animation', 'Comedy', 'Family'], 'belongs_to_collection': 'Toy Story Collection', 'budget': 30000000, 'popularity': 21.946943, 'revenue': 373554033.0, 'runtime': 81.0, 'vote_average': 7.7, 'vote_count': 5415.0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='84fe5f5b59f8380df75f70fbfe4fdf9755512a0335d5cbb0133da9cf752ee10d', text="Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [9]:
dataset = generate_qa_embedding_pairs(documents, llm=llm)
dataset.save_json("generated/dataset.json")

# Finetuning

In [None]:
dataset = EmbeddingQAFinetuneDataset.from_json("generated/dataset.json")

In [None]:
finetune_engine = SentenceTransformersFinetuneEngine(
                                                    dataset,
                                                    model_id="BAAI/bge-small-en-v1.5",
                                                    model_output_path="generated/bge-small-finetuned",
                                                    epochs=2
                                                    )

In [None]:
finetune_engine.finetune()

Iteration: 100%|██████████| 29/29 [01:43<00:00,  3.55s/it]
Iteration: 100%|██████████| 29/29 [01:45<00:00,  3.64s/it]
Epoch: 100%|██████████| 2/2 [03:28<00:00, 104.31s/it]


In [None]:
finetuned_embedding_llm = finetune_engine.get_finetuned_model()

# Finetuned Embedding Evaluation

In [None]:
def evaluate_st(
                dataset,
                model_id,
                name,
                ):
    
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name)
    model = SentenceTransformer(model_id)
    output_path = "results/"
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [None]:
evaluate_st(dataset, "BAAI/bge-small-en-v1.5", name="bge")

0.7707418931120409

In [None]:
evaluate_st(dataset, "./generated/bge-small-finetuned", name="finetuned")

0.8467020542474263