# 🎬 IMDb Review Sentiment & Summary (RAG + IBM Granite)

Notebook ini mendemonstrasikan pipeline RAG (Retrieval-Augmented Generation) untuk mengklasifikasikan sentimen dan merangkum ulasan film menggunakan model IBM Granite via Replicate.

In [None]:
!pip install faiss-cpu replicate scikit-learn langdetect python-dotenv


In [None]:
import pandas as pd
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import replicate
import os
from langdetect import detect
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
# Keep an already-configured token; otherwise fall back to the placeholder,
# which must be replaced with a real key before calling the Replicate API.
os.environ.setdefault("REPLICATE_API_TOKEN", "PASTE_YOUR_KEY_IF_NOT_USING_ENV")


In [None]:
# Upload the dataset from the local machine (or mount Drive to access the file).
from google.colab import files
uploaded = files.upload()
# files.upload() returns a dict keyed by the names of the uploaded files.
# Read the CSV that was actually uploaded, because its name may differ from
# the default; fall back to the default name when no CSV was uploaded
# (e.g. the file is already present in the working directory).
csv_name = next(
    (name for name in uploaded if name.lower().endswith(".csv")),
    "IMDB_Dataset.csv",
)
df = pd.read_csv(csv_name)
df.head()


In [None]:
# Turn every review into a TF-IDF vector (English stop words removed,
# vocabulary capped at 5000 terms). float32 is requested up front so the
# dense conversion below does not first materialise a float64 matrix and
# then copy it again.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, dtype=np.float32)
tfidf_matrix = vectorizer.fit_transform(df['review'])
# The FAISS index is fed a dense float32 array; copy=False skips the extra
# copy when the matrix already has that dtype.
embeddings = tfidf_matrix.toarray().astype('float32', copy=False)

# Exact (brute-force) L2 index whose dimensionality is the TF-IDF vocabulary size.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} documents.")


In [None]:
def retrieve_reviews(query, top_k=3):
    """Return the reviews closest to *query* in the FAISS index.

    Args:
        query: Free-text search string; it is vectorized with the fitted
            module-level ``vectorizer``.
        top_k: Maximum number of reviews to return.

    Returns:
        A list of at most ``top_k`` review texts taken from ``df['review']``,
        in the order returned by the index search. Empty when ``top_k`` is
        not positive or the index holds no documents.
    """
    # Never ask for more neighbours than the index contains: FAISS pads the
    # missing slots with -1, and df.iloc[-1] would silently return the last
    # review instead of "no result".
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_vec = vectorizer.transform([query]).toarray().astype('float32')
    _, indices = index.search(query_vec, k)
    # Drop any remaining -1 padding before doing the positional lookup.
    hits = [i for i in indices[0] if i >= 0]
    return df.iloc[hits]['review'].tolist()


In [None]:
def build_prompt(reviews, lang):
    """Compose the system and user prompts for a batch of reviews.

    Args:
        reviews: Iterable of review strings; they are joined with blank
            lines to form the context section of the user prompt.
        lang: Language code. ``'id'`` selects the Indonesian prompts; any
            other value selects the English ones.

    Returns:
        A ``(system_prompt, prompt)`` tuple of strings.
    """
    joined_reviews = "\n\n".join(reviews)

    # Everything that is not Indonesian gets the English template.
    if lang != 'id':
        return (
            "You are an AI assistant that classifies and summarizes movie reviews.",
            f"Classify and summarize the following reviews:\n\n{joined_reviews}\n\nFormat:\nSentiment: [Positive/Negative]\nSummary: [One-sentence summary]\nReason: [Short explanation]",
        )

    return (
        "Kamu adalah asisten AI yang mengklasifikasikan dan meringkas ulasan film. Jawablah dalam Bahasa Indonesia.",
        f"Klasifikasikan dan ringkas review berikut:\n\n{joined_reviews}\n\nFormat:\nSentimen: [Positif/Negatif]\nRingkasan: [Satu kalimat]\nAlasan: [Penjelasan singkat]",
    )


In [None]:
def query_model(prompt, system_prompt):
    """Run the IBM Granite instruct model on Replicate and return its reply.

    Args:
        prompt: User prompt sent to the model.
        system_prompt: System prompt sent alongside the user prompt.

    Returns:
        The pieces yielded by ``replicate.run`` concatenated into one string.
    """
    model_input = {"prompt": prompt, "system_prompt": system_prompt}
    pieces = replicate.run("ibm-granite/granite-3.3-8b-instruct", input=model_input)
    return "".join(pieces)


In [None]:
# Imported in this cell so the exception type is in scope wherever the cell runs.
from langdetect.lang_detect_exception import LangDetectException

# Interactive entry point: read a query, retrieve similar reviews, ask the
# model to classify and summarize them, then print the answer.
query = input("Masukkan review atau topik pencarian: ")
try:
    lang = detect(query)
except LangDetectException:
    # detect() raises when the text has no usable features (empty input,
    # digits or punctuation only). Fall back to the English prompt instead
    # of crashing the cell.
    lang = 'en'
retrieved = retrieve_reviews(query)
system_prompt, user_prompt = build_prompt(retrieved, lang)
result = query_model(user_prompt, system_prompt)

print("\n--- RESULT ---\n")
print(result)
