# **Install Dependencies & Import Libraries**

In [None]:
!pip install -q sentence-transformers faiss-cpu groq requests nltk

In [None]:
import os
import re
import requests
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize

# **Load API Keys**

In [None]:
from groq import Groq
from kaggle_secrets import UserSecretsClient

# Load secrets from Kaggle.
# Both keys are read from the notebook's Kaggle secret store by label, so
# they never appear in the notebook source.
user_secrets = UserSecretsClient()
GROQ_API_KEY = user_secrets.get_secret("GROQ_API_KEY")
SERPER_API_KEY = user_secrets.get_secret("SERPER_API_KEY")

# Shared Groq client used by the chat-completion helper further down.
client = Groq(api_key=GROQ_API_KEY)

# **Load & Clean Dataset**

In [None]:
# Load the Wikipedia movie plots dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv")

# Map the dataset's original headers to the short lowercase names used by
# the rest of the notebook.
df = df.rename(columns={
    "Title": "title",
    "Release Year": "year",
    "Origin/Ethnicity": "origin",
    "Director": "director",
    "Cast": "cast",
    "Genre": "genre",
    "Wiki Page": "link",
    "Plot": "plot"
})

# Unparseable years become <NA> in a nullable integer column.
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

# Fill only the descriptive text columns. A blanket df.fillna("Unknown")
# would also overwrite missing titles/plots (so a later dropna on those
# columns could never remove anything) and would try to store a string in
# the integer "year" column.
text_cols = ["origin", "director", "cast", "genre", "link"]
df[text_cols] = df[text_cols].fillna("Unknown")

In [None]:
# Work on a copy so `df` keeps the pre-cleaning row count for the report
# below. The dropna only has an effect if title/plot still contain NaN at
# this point; duplicates are identified by (title, year).
df_cleaned = (
    df.copy()
    .dropna(subset=["title", "plot"])
    .drop_duplicates(subset=["title", "year"])
)

# Descriptive columns get a placeholder instead of NaN.
cols_to_fill = ["origin", "director", "cast", "genre", "link"]
df_cleaned = df_cleaned.fillna({col: "Unknown" for col in cols_to_fill})

# Row-count report (before / after / removed).
print(f"Total baris sebelum cleaning: {len(df):,}")
print(f"Total baris setelah cleaning: {len(df_cleaned):,}")
print(f"Total data yang dihapus: {len(df) - len(df_cleaned):,}\n")

# Remaining missing values per column.
print("Missing value per kolom:")
print(df_cleaned.isnull().sum(), "\n")

display(df_cleaned.head())

# **Chunking**

In [None]:
def split_plot_into_chunks(text, max_words=150, overlap=30):
    """Split *text* into overlapping, whitespace-tokenized word windows.

    Args:
        text: The plot text. Anything that is not a non-blank string
            yields an empty list.
        max_words: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The
        last chunk always ends at the final word of the text; no extra
        chunk consisting only of already-covered overlap words is emitted.

    Raises:
        ValueError: If ``overlap >= max_words``; the window would never
            advance (previously this looped forever).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    step = max_words - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_words")

    words = text.split()
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        # This window already reached the end of the text; any further
        # window would only repeat words contained in this one.
        if start + max_words >= len(words):
            break

    return chunks

In [None]:
# Parallel lists: docs[k] is the text that gets embedded, metas[k] is the
# metadata describing which movie that text belongs to.
docs = []
metas = []

for i, row in df_cleaned.iterrows():

    title = str(row["title"]).strip()
    year = int(row["year"])
    origin = str(row["origin"]).strip()
    genre = str(row["genre"]).strip()
    director = str(row["director"]).strip()
    cast = str(row["cast"]).strip()
    plot = str(row["plot"]).strip()

    # Fields shared by every chunk of this movie.
    shared = {
        "title": title,
        "year": year,
        "origin": origin,
        "genre": genre,
        "director": director,
        "cast": cast,
    }

    # Chunk 1: short metadata summary
    meta_text = f"""
Title: {title}
Year: {year}
Origin: {origin}
Genre: {genre}
Director: {director}
Cast: {cast}
""".strip()

    docs.append(meta_text)
    metas.append({"movie_id": i, "chunk_type": "meta", **shared})

    # Chunk 2..n: the plot, split into overlapping word windows
    plot_chunks = split_plot_into_chunks(plot, max_words=150, overlap=30)

    docs.extend(plot_chunks)
    metas.extend(
        {"movie_id": i, "chunk_type": "plot", **shared}
        for _ in plot_chunks
    )

# Summary statistics for the chunking step.
total_chunks = len(docs)
total_movies = len(df_cleaned)

print(f"Total film      : {total_movies}")
print(f"Total chunk     : {total_chunks}")
print(f"Rata-rata chunk : {total_chunks / total_movies:.2f} per film")

# **Load Embedding Model & Encode**

In [None]:
# Embedding model used for both the documents and, later, the queries.
EMB_MODEL = "BAAI/bge-small-en-v1.5"
emb_model = SentenceTransformer(EMB_MODEL)

# Encode every document; vectors are normalized at encode time and cast to
# float32 before being handed to FAISS.
encode_options = dict(
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
embeddings = emb_model.encode(docs, **encode_options).astype("float32")

print("Embeddings shape:", embeddings.shape)

# Flat inner-product index over the embedding dimension.
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embeddings)

print("Total vektor di index:", index.ntotal)

# **Normalization, Boosting & Search Function**

In [None]:
def normalize_title(text):
    """Return a lowercase, ASCII-alphanumeric comparison key for a title.

    Accented characters are folded to their base letter first, so that
    "Amélie" and "Amelie" produce the same key instead of the accent being
    turned into a word break. Every remaining character outside
    ``a-z0-9`` and space becomes a space, and runs of whitespace collapse
    to a single space.

    Args:
        text: The title or query. Non-string values yield ``""``.

    Returns:
        The normalized string (possibly empty).
    """
    import unicodedata  # function-scope import keeps the cell self-contained

    if not isinstance(text, str):
        return ""
    # Decompose (e.g. "é" -> "e" + combining accent) and drop the marks.
    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower()
    text = re.sub(r"[^a-z0-9 ]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def title_boost(query, candidates):
    """Raise the score of candidates whose title matches the query.

    Each candidate is a dict with a numeric ``"score"`` and a ``"meta"``
    dict containing ``"title"``. At most one boost is applied per
    candidate, checked in this order:

    * +5.0 when the normalized query equals the normalized title,
    * +2.5 when a multi-word query occurs in the title as a whole-word
      phrase,
    * +1.0 when query and title share at least two distinct words.

    The candidates are modified in place and the same list is returned.
    A query that normalizes to an empty string boosts nothing.
    """
    q = normalize_title(query)
    if not q:
        # Without this guard an empty query would "exactly match" any
        # title that also normalizes to an empty string.
        return candidates

    q_tokens = q.split()
    q_words = set(q_tokens)
    is_phrase = len(q_tokens) > 1
    # Padding both sides with spaces turns the substring test into a
    # whole-word phrase test ("he man" must not match "the man").
    padded_q = f" {q} "

    for c in candidates:
        title = normalize_title(c["meta"]["title"])

        if q == title:
            c["score"] += 5.0
        elif is_phrase and padded_q in f" {title} ":
            c["score"] += 2.5
        elif len(q_words & set(title.split())) >= 2:
            c["score"] += 1.0

    return candidates

In [None]:
def normalize_tokens(s):
    """Split a genre/origin style string into lowercase tokens.

    The input is lowercased and stripped, every character other than
    ``a-z``, ``0-9``, comma, space and slash is deleted (so "sci-fi"
    becomes "scifi"), and the remainder is split on runs of commas,
    slashes and spaces. Empty tokens are dropped.
    """
    cleaned = re.sub(r"[^a-z0-9, /]+", "", s.lower().strip())
    # Tokens are the maximal runs of non-separator characters.
    return re.findall(r"[^,/ ]+", cleaned)

def match_filter(meta, genre=None, origin=None, min_year=None, max_year=None):
    """Return True when a chunk's metadata satisfies all given filters.

    Args:
        meta: Metadata dict with ``"year"``, ``"genre"`` and ``"origin"``.
        genre: Optional genre filter. It is tokenized with the same rules
            as the metadata, and every token must be present in the
            movie's genre tokens (so "sci-fi" or "science fiction" can
            match, not only single plain words).
        origin: Optional origin filter, matched the same way as ``genre``.
        min_year: Optional inclusive lower bound on the year.
        max_year: Optional inclusive upper bound on the year.

    Returns:
        False as soon as one active filter fails, True otherwise. A
        non-empty filter string that contains no usable token matches
        nothing.
    """
    # Year filter
    y = meta["year"]
    if min_year is not None and y < min_year:
        return False
    if max_year is not None and y > max_year:
        return False

    # Genre / origin filters: normalize the requested value exactly like
    # the stored value; comparing a raw string against tokens can never
    # match multi-word or punctuated values.
    for wanted_raw, field in ((genre, "genre"), (origin, "origin")):
        if not wanted_raw:
            continue
        wanted = normalize_tokens(wanted_raw)
        available = set(normalize_tokens(meta[field] or ""))
        if not wanted or not available.issuperset(wanted):
            return False

    return True

In [None]:
def search_chunks(query, top_k=200):
    """Return up to *top_k* nearest chunks for *query* from the FAISS index.

    The query is embedded with the same model and normalization as the
    indexed documents.

    Returns:
        A list of dicts with ``"score"`` (float), ``"chunk_id"`` (int),
        ``"text"`` and ``"meta"`` (the shared metadata dict of that chunk,
        not a copy), in the order FAISS returned them.
    """
    # Embed the query; cast to float32 to match the indexed vectors.
    q_emb = emb_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")

    scores, idx = index.search(q_emb, top_k)

    results = []
    for s, i_doc in zip(scores[0], idx[0]):
        # FAISS pads the result with label -1 when fewer than top_k
        # vectors are available; docs[-1] would silently return the last
        # document instead.
        if i_doc < 0:
            continue
        results.append({
            "score": float(s),
            "chunk_id": int(i_doc),
            "text": docs[i_doc],
            "meta": metas[i_doc],
        })

    return results

In [None]:
def search_movies(
    query,
    top_k_movies=5,
    top_k_chunks=200,
    genre=None,
    origin=None,
    min_year=None,
    max_year=None,
):
    """Retrieve the best-matching movies for *query*.

    Chunks are fetched from the index, reduced to the single best-scoring
    chunk per movie, title-boosted, sorted by score (descending) and
    truncated to *top_k_movies*. If the genre/origin/year filters leave
    no movie at all, the filters are ignored and all retrieved chunks are
    used instead.
    """
    hits = search_chunks(query, top_k=top_k_chunks)

    def best_hit_per_movie(apply_filter):
        """Keep the highest-scoring chunk of each movie."""
        best = {}
        for hit in hits:
            meta = hit["meta"]
            movie_id = meta["movie_id"]

            if apply_filter and not match_filter(
                meta, genre, origin, min_year, max_year
            ):
                continue

            current = best.get(movie_id)
            if current is None or hit["score"] > current["score"]:
                best[movie_id] = {
                    "movie_id": movie_id,
                    "score": hit["score"],
                    "text": hit["text"],
                    "meta": meta,
                }
        return best

    # Filtered pass first; an empty result falls back to the unfiltered one.
    movie_map = best_hit_per_movie(True) or best_hit_per_movie(False)

    # Boost by title match, then rank.
    boosted = title_boost(query, list(movie_map.values()))
    ranked = sorted(boosted, key=lambda m: m["score"], reverse=True)

    return ranked[:top_k_movies]

In [None]:
def clean_web_text(text):
    """Collapse all whitespace runs in *text* to single spaces and trim it.

    Falsy input (``None``, ``""``) yields an empty string.
    """
    return re.sub(r"\s+", " ", text).strip() if text else ""

def web_search_serper(query):
    """Query the Serper search endpoint and return a short text digest.

    Text is collected from the response's ``answerBox``,
    ``knowledgeGraph`` and ``organic`` sections (in that order); the
    first five non-empty pieces are joined with blank lines.

    Returns:
        The digest string, or ``None`` when the request fails, the
        server answers with an error status, the body is not a JSON
        object, or no text could be collected.
    """
    url = "https://google.serper.dev/search"
    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json",
    }
    payload = {"q": query, "num": 5}

    try:
        resp = requests.post(url, json=payload, headers=headers, timeout=7)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError):
        # Best effort: network errors, HTTP error statuses and invalid
        # JSON all mean "no web context available".
        return None

    if not isinstance(data, dict):
        return None

    def as_text(value):
        """Clean a payload field; anything that is not a string counts as empty."""
        return clean_web_text(value) if isinstance(value, str) else ""

    collected = []

    sections = (
        ("answerBox", ("answer", "snippet", "title")),
        ("knowledgeGraph", ("description", "type", "title")),
    )
    for section, keys in sections:
        box = data.get(section)
        if not isinstance(box, dict):
            continue
        for k in keys:
            text = as_text(box.get(k))
            # Skip empty values so they do not use up one of the five slots.
            if text:
                collected.append(text)

    for item in data.get("organic") or []:
        if not isinstance(item, dict):
            continue
        title = as_text(item.get("title", ""))
        snippet = as_text(item.get("snippet", ""))
        combined = (title + "\n" + snippet).strip()
        if combined:
            collected.append(combined)

    return "\n\n".join(collected[:5]) if collected else None

# **Building Chatbot**

In [None]:
def build_prompt(query, movies, web_context=None):
    """Assemble the user prompt sent to the language model.

    Args:
        query: The user's question, inserted verbatim.
        movies: Retrieved movies; each needs ``"text"`` and a ``"meta"``
            dict with title, year, origin, genre and director.
        web_context: Optional web-search digest; ``None`` or an empty
            string leaves the web section blank.

    Returns:
        The prompt string: instructions, a film-context section (one
        block per movie, separated by ``---``), a web-context section and
        the question.
    """

    def render_movie(m):
        """Format one movie as a labelled block."""
        meta = m["meta"]
        return f"""
Judul: {meta['title']}
Tahun: {meta['year']}
Asal: {meta['origin']}
Genre: {meta['genre']}
Director: {meta['director']}

Ringkasan:
{m['text']}
""".strip()

    ctx_movies = "\n\n---\n\n".join([render_movie(m) for m in movies])

    if web_context:
        ctx_web = web_context.strip()
    else:
        ctx_web = ""

    return f"""
Kamu adalah teman ngobrol yang paham banyak film. 
Gaya bicaramu santai, jelas, ringan, tidak formal, dan tidak kaku.

Tugasmu:
- Jawab pertanyaan pengguna dengan santai dan natural.
- Gunakan informasi yang tersedia dalam konteks film dan konteks web.
- Jika ada beberapa film relevan, sebutkan 1–3 yang paling cocok dan jelasin singkat kenapa.
- Jangan bawa info dari luar konteks.
- Kalau infonya tidak ada, bilang saja dengan santai “nggak ada infonya nih”.

========================
Konteks Film:
{ctx_movies}

========================
Konteks Web:
{ctx_web}

========================
Pertanyaan:
{query}

Jawaban santai:
"""

In [None]:
def groq_generate(prompt):
    """Send *prompt* to the Groq chat-completions API and return the reply text.

    A fixed system message sets the persona; *prompt* is sent as the
    single user message. The content of the first returned choice is
    returned as-is.
    """
    system_message = {
        "role": "system",
        "content": (
            "Kamu adalah teman ngobrol yang paham film. "
            "Gaya bicaramu santai, natural, tidak formal. "
            "Jawab hanya sesuai konteks yang diberikan. "
            "Kalau info tidak ada, bilang saja dengan santai."
        ),
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[system_message, user_message],
        temperature=0.35,
        max_tokens=400,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def lookup_exact_title(query):
    """Return the movie whose normalized title equals the normalized query.

    The normalized titles are stored in a ``norm_title`` column on
    ``df_cleaned`` the first time this runs and reused afterwards,
    instead of re-normalizing every title on every call.
    NOTE(review): the cached column is not refreshed if titles in
    ``df_cleaned`` are edited later; rebuilding ``df_cleaned`` resets it.

    Returns:
        ``None`` when the query normalizes to an empty string or no title
        matches. Otherwise a dict shaped like a search result:
        ``"movie_id"``, ``"score"`` (fixed 5.0), ``"text"`` (metadata plus
        the full plot) and ``"meta"``. With several matches the first row
        wins.
    """
    q = normalize_title(query)
    if not q:
        # An empty key would match any title that normalizes to "".
        return None

    if "norm_title" not in df_cleaned.columns:
        df_cleaned["norm_title"] = df_cleaned["title"].apply(normalize_title)

    # exact match
    exact = df_cleaned[df_cleaned["norm_title"] == q]
    if exact.empty:
        return None

    row = exact.iloc[0]
    # The row's index label is the same value used as movie_id when the
    # chunks were built from df_cleaned.iterrows().
    movie_id = row.name

    # text = metadata + full plot
    text = f"""
Title: {row['title']}
Year: {row['year']}
Origin: {row['origin']}
Genre: {row['genre']}
Director: {row['director']}
Cast: {row['cast']}

Plot:
{row['plot']}
""".strip()

    meta = {
        "movie_id": movie_id,
        "title": row["title"],
        "year": int(row["year"]),
        "origin": row["origin"],
        "genre": row["genre"],
        "director": row["director"],
        "cast": row["cast"]
    }

    return {"movie_id": movie_id, "score": 5.0, "text": text, "meta": meta}

In [None]:
def movie_chatbot(
    query,
    top_k_movies=5,
    genre=None,
    origin=None,
    min_year=None,
    max_year=None,
    score_threshold=0.25
):
    """Answer a movie question using retrieval plus the language model.

    Flow:
        1. Queries of at most three words are first tried as an exact
           title lookup; on a hit, only that movie (with its full plot)
           is used as context and the filters are not applied.
        2. Otherwise movies are retrieved with ``search_movies`` using
           the given filters.
        3. If the best retrieved score is below *score_threshold* (or
           nothing was retrieved), a web search digest is added as extra
           context.

    Returns:
        A tuple ``(answer, movies)``: the model's reply and the list of
        movie dicts that were used as context.
    """
    q_words = query.strip().split()
    if len(q_words) <= 3:
        exact = lookup_exact_title(query)
        if exact is not None:
            movies = [exact]
            prompt = build_prompt(query, movies, web_context=None)
            answer = groq_generate(prompt)
            return answer, movies

    # search_movies already applies the title boost and returns the
    # movies sorted by score. Boosting again here would add the bonus a
    # second time and distort the threshold comparison below.
    movies = search_movies(
        query=query,
        top_k_movies=top_k_movies,
        top_k_chunks=200,
        genre=genre,
        origin=origin,
        min_year=min_year,
        max_year=max_year,
    )

    best_score = movies[0]["score"] if movies else 0.0

    # Weak local retrieval: fall back to the web for extra context.
    web_ctx = None
    if best_score < score_threshold:
        web_ctx = web_search_serper(query)

    prompt = build_prompt(query, movies, web_context=web_ctx)

    answer = groq_generate(prompt)

    return answer, movies