In [None]:
pip install langchain langchain-openai duckdb duckdb-engine sentence-transformers

In [None]:
!pip install PyMovieDb

In [None]:
# Let's do the imports.
from PyMovieDb import IMDB
import polars as pl

In [None]:
from sentence_transformers import SentenceTransformer

# Start with a very small sentence-transformer model. The approach here is to begin
# with the smallest model and scale up only once performance can be measured;
# swapping in a larger model is a likely first place to look for better results.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Instantiate the PyMovieDb IMDB client.
# NOTE(review): `imdb` is not referenced by any other cell visible here — confirm it is still needed.
imdb = IMDB()

In [None]:
# Let's load the datasets.

In [None]:
# Read the four CSV inputs into polars DataFrames.
movies = pl.read_csv("movies.csv")
ratings = pl.read_csv("ratings.csv")
tags = pl.read_csv("tags.csv")
# imdbId is read as a string instead of letting polars infer a numeric type
# (presumably to keep leading zeros in the ids — TODO confirm).
links = pl.read_csv("links.csv", schema_overrides={"imdbId": pl.String})

In [None]:
# Per-movie rating summary: the average rating and how many ratings back it.
ratings_by_movie = ratings.group_by("movieId").agg(
    mean_rating=pl.col("rating").mean(),
    n_rating=pl.col("rating").count(),
)

In [None]:
# Some of the tags are not great. With more time, I'd pull in descriptions for the movies from IMDb.
# You could also think about doing something more personalized here. We now know what kind of user is going to use
# certain tags for movies. This could be an interesting way to think about linking users in an embedding space,
# something like movie embeddings and user embeddings in the same semantic space.

In [None]:
# Build one text "description" per tagged movie: its title, its genres and its
# distinct tags, all joined into a single string that is embedded later.
tags_per_movie = (
    tags.group_by("movieId")
    .agg(pl.col("tag"))
    # De-duplicate each movie's tag list, then flatten it to "tag1, tag2, ...".
    .with_columns(pl.col("tag").list.unique().list.join(", "))
)
# Genres arrive pipe-separated; render them comma-separated like the tags.
genre_text = pl.col("genres").str.split("|").list.join(", ")
movies_by_tags = tags_per_movie.join(movies, on="movieId").with_columns(
    pl.concat_str(
        [pl.col("title"), genre_text, pl.col("tag")],
        separator=" ",
    ).alias("description")
)

In [None]:
# Encode every movie description with the sentence-transformer model.
descriptions = movies_by_tags.get_column("description").to_list()
description_embeddings = model.encode(descriptions, show_progress_bar=True)

In [None]:
# Store the encoded descriptions next to their rows in a new "embedding" column.
# Assumes `description_embeddings` has one entry per row, in row order — it was
# produced from this frame's "description" column.
movies_by_tags = movies_by_tags.with_columns(embedding=pl.lit(description_embeddings))

In [None]:
# Now I have embeddings, but I want to do something clever. I concatenated together all of the tags for a given movie.
# This works because of superposition. The encoded representations of, for example, "pirate" and "zombie" are unlikely to be
# mutually exclusive, so when I concatenate it all together the embedding can contain information about both.

# Now, when a user likes a movie, I can create an embedding for that user based on the films they like.
# I can decay the magnitude of that vector (because only the angle encodes meaning) based on how much they liked the film.
# I can even subtract out the embedding if they did not like the film. This will be a key tool of the agent.


# Some movies have no ratings.

In [None]:
# Combine titles, external links, tag descriptions/embeddings and rating statistics
# into one frame, ordered by how many ratings each movie has. The joins use polars'
# default join strategy, keyed on movieId.
movies_with_links = movies.join(links, on="movieId")
movies_with_tags = movies_with_links.join(movies_by_tags, on="movieId")
movies_with_metadata = movies_with_tags.join(ratings_by_movie, on="movieId").sort(
    "n_rating"
)

In [None]:
# Grab the description text of the movie with the smallest movieId for a quick look.
a = movies_with_metadata.sort("movieId").get_column("description").to_list()[0]

In [None]:
def bayesian_average(
    df: pl.DataFrame,
    rating_col: str = "mean_rating",
    n_rating_col: str = "n_rating",
    global_mean: float | None = None,
    min_n_ratings: int = 1,
    prior_weight: float = 0.5,
) -> pl.DataFrame:
    """Add a ``bayesian_avg`` column that shrinks each rating towards a global mean.

    The score for a row is::

        (rating * n_ratings + global_mean * prior_weight) / (n_ratings + prior_weight)

    so rows backed by few ratings are pulled towards ``global_mean`` while rows
    with many ratings stay close to their own average.

    Args:
        df: Frame holding one row per item with its mean rating and rating count.
        rating_col: Name of the column with the per-row mean rating.
        n_rating_col: Name of the column with the per-row number of ratings.
        global_mean: Prior mean. When ``None`` it is computed as the mean of
            ``rating_col`` over the rows that have at least ``min_n_ratings``
            ratings.
        min_n_ratings: Minimum rating count for a row to contribute to the
            computed ``global_mean``.
        prior_weight: Weight of the prior, expressed as a number of pseudo-ratings.

    Returns:
        ``df`` with an additional (or replaced) ``bayesian_avg`` column.
    """
    if global_mean is None:

        def _mean_rating(frame: pl.DataFrame) -> float | None:
            # `.item()` yields None when the mean is taken over zero non-null values.
            return frame.select(pl.col(rating_col).mean()).item()

        global_mean = _mean_rating(df.filter(pl.col(n_rating_col) >= min_n_ratings))
        if global_mean is None:
            # No row met the threshold: fall back to the mean over every row rather
            # than failing on `None * prior_weight` below.
            global_mean = _mean_rating(df)
        if global_mean is None:
            # Empty frame or only null ratings: every output value is empty/null
            # regardless of the prior, so any finite placeholder is safe here.
            global_mean = 0.0

    # Compute Bayesian average
    weighted_sum = pl.col(rating_col) * pl.col(n_rating_col) + global_mean * prior_weight
    total_weight = pl.col(n_rating_col) + prior_weight

    return df.with_columns(bayesian_avg=weighted_sum / total_weight)

In [None]:
# Score every movie using the function's default prior settings.
movies_with_metadata = bayesian_average(df=movies_with_metadata)

In [None]:
# Sanity check: show the rows whose title contains "Godfather".
is_godfather = pl.col("title").str.contains("Godfather")
movies_with_metadata.filter(is_godfather)

In [None]:
import duckdb

# Open a connection to the DuckDB database named "cinema-expert".
conn = duckdb.connect("cinema-expert")

# Install AND load the vss extension, mirroring the rapidfuzz cell: INSTALL only
# fetches the extension, LOAD is what explicitly activates it for this connection.
conn.sql("""INSTALL vss;
LOAD vss;""")

In [None]:
# Fetch the rapidfuzz extension from the community repository and load it into this
# connection; it provides the fuzzy string-similarity function used further down.
RAPIDFUZZ_SETUP_SQL = """INSTALL rapidfuzz FROM community;
LOAD rapidfuzz;"""
conn.sql(RAPIDFUZZ_SETUP_SQL)

In [None]:
# Materialise the enriched movie frame as the DuckDB table `movie` (replacing any previous copy).
# The SQL names the Python variable `movies_with_metadata` directly
# (presumably resolved through DuckDB's lookup of in-scope DataFrames — TODO confirm).
conn.sql("create or replace Table movie as select * from movies_with_metadata")

In [None]:
# Copy the raw `ratings` frame into the DuckDB table `user` (replacing any previous copy).
# NOTE(review): despite its name, this table is a straight copy of `ratings` — confirm the naming is intended.
conn.sql("create or replace Table user as select * from ratings")

In [None]:
! pip install tmdbsimple

In [None]:
# Fuzzy title lookup: rank movies by Jaro-Winkler similarity to the query, weighted
# by the Bayesian average rating so that well-rated matches rise to the top.
# The query is lower-cased in SQL as well, so both sides of the comparison use the
# same case (the literal contains an upper-case "I" while titles are lower-cased).
conn.sql("""
SELECT
    title,
    rapidfuzz_jaro_winkler_normalized_similarity(lower('godfather part I'), lower(title)) * bayesian_avg as similarity
FROM movie
ORDER BY similarity DESC
""")

In [None]:
search = tmdb.search

In [None]:
search()

In [None]:
def _search(query, bayesian_avg, n_rating, k=10):
    """Return the titles of the ``k`` movies whose embedding best matches ``query``.

    The query text is embedded with the module-level ``model`` and compared, by
    cosine similarity, against the ``embedding`` column of the DuckDB ``movie``
    table through the module-level ``conn``.

    Args:
        query: Free-text description of what the user is looking for.
        bayesian_avg: Minimum ``bayesian_avg`` a movie must have to be returned.
        n_rating: Minimum number of ratings a movie must have to be returned.
        k: Maximum number of titles to return.

    Returns:
        A list of movie titles, most similar first.
    """
    # Convert the encoder output to plain Python floats before binding it.
    query_vector = [float(x) for x in model.encode(query)]

    # Every value is bound as a parameter instead of being formatted into the SQL
    # text: `k` and the thresholds can originate from LLM-produced tool arguments.
    rows = conn.execute(
        """
        SELECT
            movieId,
            title,
            bayesian_avg,
            n_rating,
            array_cosine_similarity(embedding, ?::FLOAT[384]) AS similarity
        FROM movie
        WHERE
            bayesian_avg >= ? and
            n_rating >= ?
        ORDER BY similarity DESC
        LIMIT ?
        """,
        [query_vector, float(bayesian_avg), int(n_rating), int(k)],
    ).fetchall()

    # Column 1 of each row is the title.
    return [row[1] for row in rows]

In [None]:
def get_movie_recommendation(
    user_request, k=10, user_desires_critically_acclaimed=False
):
    """Recommend up to ``k`` movie titles matching ``user_request``.

    When ``user_desires_critically_acclaimed`` is true, only movies with a
    Bayesian average of at least 3 and at least 50 ratings are considered;
    otherwise any movie with at least one rating qualifies.
    """
    # (minimum bayesian_avg, minimum n_rating) passed through to _search.
    min_bayesian_avg, min_n_rating = (
        (3, 50) if user_desires_critically_acclaimed else (0, 1)
    )
    return _search(user_request, min_bayesian_avg, min_n_rating, k)


def get_movie_information(movie, n_results=3):
    """Look up a movie title on TMDB and return details for the top matches.

    Args:
        movie: Title (or partial title) to search for.
        n_results: Maximum number of search hits to fetch details for.

    Returns:
        A list with one dict per hit, each holding the hit's ``"info"`` and
        ``"credits"`` payloads. The list is shorter than ``n_results`` (possibly
        empty) when the search returns fewer hits.
    """
    print(movie)
    search = tmdb.Search()
    result = search.movie(query=movie)
    print(result)

    # Slice rather than index range(n_results): a search may return fewer hits
    # than requested, which previously raised IndexError.
    hits = result["results"][: max(n_results, 0)]

    out = []
    for hit in hits:
        # Separate names avoid shadowing the `movie` argument and the builtin `id`.
        tmdb_movie = tmdb.Movies(hit["id"])
        out.append({"info": tmdb_movie.info(), "credits": tmdb_movie.credits()})
    return out

In [None]:
import tmdbsimple as tmdb

In [None]:
from openai import OpenAI
import json

# OpenAI API client used for the agent requests.
# No credentials are passed explicitly; presumably they are read from the
# environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [None]:
# Tool definitions handed to the model: two local Python functions plus a web
# search restricted to a short list of film-criticism / reference domains.
tools = [
    {
        "type": "function",
        "name": "get_movie_recommendation",
        "description": "Get a film recommendation",
        "parameters": {
            "type": "object",
            "properties": {
                "user_request": {"type": "string"},
                "k": {"type": "integer"},
                "user_desires_critically_acclaimed": {"type": "boolean"},
            },
            "required": ["user_request"],
        },
    },
    {
        "type": "function",
        "name": "get_movie_information",
        "description": "Get more information about a certain movie",
        "parameters": {
            "type": "object",
            "properties": {
                "movie": {"type": "string"},
                "n_results": {"type": "integer"},
            },
            "required": ["movie"],
        },
    },
    {
        "type": "web_search",
        "filters": {
            # Domains must match the sites' real hostnames, otherwise the filter
            # silently excludes them: Wikipedia is served from wikipedia.org and
            # Bright Wall/Dark Room from brightwalldarkroom.com.
            "allowed_domains": [
                "cinephiliabeyond.org",
                "brightwalldarkroom.com",
                "filmanalysis.yale.edu",
                "researchguides.dartmouth.edu",
                "wikipedia.org",
            ]
        },
    },
]


# Conversation seed: the system prompt sets the tool-usage policy and is followed
# by the user's request.
# NOTE(review): the system prompt contains several typos and a stray trailing "?";
# the text is kept verbatim here because it is sent to the model as-is.
SYSTEM_PROMPT = "Always call get_movie_recommendation when recommending flim. Always call get_movie_information when responing to factual requests. Sythesis all the information, but make sure to use the reccomended films. Use web search to resarch film critism questions.?"
USER_REQUEST = "I love the movie white chicks but my cinema friend does not. What are some movies we could watch together that might be a little more highbrow."

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_REQUEST},
]

# First pass -> the model either answers directly or issues tool calls.
resp1 = client.responses.create(
    model="gpt-5",
    tools=tools,
    tool_choice="auto",
    input=messages,
)

# Upper bound on tool round-trips so a model that keeps calling tools cannot
# loop forever.
MAX_TOOL_ROUNDS = 5

# `resp2` always holds the most recent response. It stays equal to `resp1` when
# the model answered without calling a local function, which avoids a redundant
# second request.
resp2 = resp1
for _ in range(MAX_TOOL_ROUNDS):
    # Feed the model's own output items back so every later function_call_output
    # has its matching function_call in the conversation.
    messages += resp2.output

    function_calls = [item for item in resp2.output if item.type == "function_call"]
    if not function_calls:
        break

    for item in function_calls:
        arguments = json.loads(item.arguments)
        if item.name == "get_movie_recommendation":
            vss = get_movie_recommendation(**arguments)
            print(vss)
            payload = {"movie_recommendation": vss}
        elif item.name == "get_movie_information":
            vss = get_movie_information(**arguments)
            payload = {"movie_information": vss}
        else:
            # Every function call needs an output; report unknown names to the
            # model instead of leaving the call unanswered.
            payload = {"error": f"unknown tool: {item.name}"}
        messages.append(
            {
                "type": "function_call_output",
                "call_id": item.call_id,
                "output": json.dumps(payload),
            }
        )

    # Ask again with the tool results; the model may answer or request more tools.
    resp2 = client.responses.create(
        model="gpt-5",
        tools=tools,
        input=messages,
    )

print("\nFinal Answer:\n")
print(resp2.output_text)