### Setup

In [None]:
import os
import requests
from datetime import date
import json
import pandas as pd
from supabase import create_client
from dotenv import load_dotenv
from openai import OpenAI

# Read variables from a local .env file into the process environment so the
# os.getenv() lookups below can find them.
load_dotenv()

# NOTE: bare expression; its value is discarded because it is not the last
# expression of the cell.
str(date.today())

# TMDB credential, sent as a Bearer token by the request cells below.
tmdb_api_key = os.getenv("TMDB_API_KEY")

# Service clients shared by the rest of the notebook.
supabase_url = os.getenv("SUPABASE_PROJECT_URL")
supabase_key = os.getenv("SUPABASE_API_KEY")
supabase_client = create_client(supabase_url, supabase_key)

openai_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_key)

### Get movie data

In [None]:
# Collect every page of TMDB "discover" results for movies released between
# 2023-01-01 and today that have at least 100 votes. The combined records end
# up in `results`.
url = "https://api.themoviedb.org/3/discover/movie"

# TMDB rejects page numbers above 500 even when `total_pages` reports more.
max_tmdb_page = 500
# Seconds to wait on TMDB so a stalled connection cannot hang the cell forever.
request_timeout = 30

params = {
    "language": "en-US",
    "sort_by": "release_date.asc",
    "include_adult": "false",
    "include_video": "false",
    "primary_release_date.gte": "2023-01-01",
    "primary_release_date.lte": str(date.today()),
    "page": 1,
    "vote_count.gte": 100,
}

headers = {
    "Authorization": f"Bearer {tmdb_api_key}",
    "accept": "application/json",
}

# First page: also tells us how many pages exist in total.
response = requests.get(url, params=params, headers=headers, timeout=request_timeout)
print(f"[INFO] Page Number #1 | Response Status Code: {response.status_code}")
# Fail with a clear HTTP error instead of a KeyError on the missing "results" key.
response.raise_for_status()

response_json = response.json()
results = response_json["results"]
total_pages = response_json["total_pages"]
last_page = min(total_pages, max_tmdb_page)

print(f"[INFO] Number of Pages found: {total_pages}")
if last_page < total_pages:
    print(f"[WARN] Only the first {last_page} pages can be requested; narrow the filters to reach the rest")

# Remaining pages: same query, only the page number changes.
for page_num in range(2, last_page + 1):
    params["page"] = page_num
    response = requests.get(url, params=params, headers=headers, timeout=request_timeout)
    print(f"[INFO] Page Number #{page_num} | Response Status Code: {response.status_code}")
    response.raise_for_status()
    response_json = response.json()

    results.extend(response_json["results"])

In [None]:
# Number of movie records accumulated in `results` across all fetched pages.
len(results)

In [None]:
# Write the raw TMDB records to disk so later cells can reload them from the
# file instead of from `results`.
json_path = "./movies.json"

with open(json_path, mode="w") as out_file:
    json.dump(obj=results, fp=out_file, indent=4)

In [None]:
# Read the raw TMDB records back from disk into `data`.
json_path = "./movies.json"
with open(json_path) as in_file:
    data = json.loads(in_file.read())

# Number of records loaded (shown as the cell output).
len(data)

In [None]:
# Inspect the first loaded record to see which fields it carries.
data[0]

### Pre-process movie data

In [None]:
# Download TMDB's movie genre list (id/name pairs) into `genre_list`.
url = "https://api.themoviedb.org/3/genre/movie/list?language=en"

headers = {
    "Authorization": f"Bearer {tmdb_api_key}",
    "accept": "application/json",
}

# The timeout keeps a stalled connection from hanging the cell forever.
response = requests.get(url, headers=headers, timeout=30)
# Fail with a clear HTTP error instead of a KeyError on the missing "genres" key.
response.raise_for_status()
genre_list = response.json()["genres"]

# Preview the first few entries.
genre_list[:5]

In [None]:
# Lookup table from genre id to genre name, built from the fetched genre list.
id2genre = {genre["id"]: genre["name"] for genre in genre_list}

# Show the full mapping as the cell output.
id2genre

In [None]:
# Reshape each raw TMDB record into the row format used by the rest of the
# notebook, replacing numeric genre ids with their names.

poster_base_url = "https://image.tmdb.org/t/p/original/"


def _poster_url(poster_path):
    """Return the full poster URL for a TMDB ``poster_path``.

    Returns ``None`` when the movie has no poster (missing, ``None`` or empty
    path) instead of failing on string concatenation. A leading slash on the
    path is dropped so the result never contains ``original//``.
    """
    if not poster_path:
        return None
    return poster_base_url + poster_path.lstrip("/")


data_to_insert = [
    {
        "id": item["id"],
        "title": item["title"],
        "release_date": item["release_date"],
        # An unknown genre id raises KeyError on purpose: it means the genre
        # list and the movie data are out of sync.
        "genres": [id2genre[genre_id] for genre_id in item["genre_ids"]],
        "description": item["overview"],
        # NOTE(review): may be None for movies without a poster; confirm the
        # destination column accepts nulls.
        "poster": _poster_url(item.get("poster_path")),
        "rating": item["vote_average"],
    }
    for item in data
]

In [None]:
# Inspect the first reshaped record.
data_to_insert[0]

In [None]:
# Render each movie's fields into a single text block and store it on the
# record under "context".

movie_template = """
Title: {title}
Release Date: {year}
Genres: {genres}
Description: {description}
Rating: {rating}
""".strip()

for movie in data_to_insert:
    # Placeholder name -> value for this movie; genres are joined into one
    # comma-separated string.
    template_fields = {
        "title": movie["title"],
        "year": movie["release_date"],
        "genres": ", ".join(movie["genres"]),
        "description": movie["description"],
        "rating": movie["rating"],
    }
    movie["context"] = movie_template.format(**template_fields)

In [None]:
# Print the rendered context text of the first movie as a sanity check.
print(data_to_insert[0]["context"])

In [None]:
# Tabular view of the records: one row per movie, one column per record key.
df = pd.DataFrame(data_to_insert)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Write the reshaped records (including the "context" text) to disk.
json_path = "./movies_clean.json"

with open(json_path, mode="w") as out_file:
    json.dump(obj=data_to_insert, fp=out_file, indent=4)

### Compute embeddings

In [None]:
# compute embeddings

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single text.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding from the first entry of the API response.
    """
    return openai_client.embeddings.create(input=[text], model=model).data[0].embedding


def get_embeddings(texts, model="text-embedding-3-small", batch_size=100):
    """Return one embedding vector per text, requesting them in batches.

    Sending several texts per request avoids one HTTP round trip per row.

    Args:
        texts: List of strings to embed.
        model: Name of the OpenAI embedding model to use.
        batch_size: Maximum number of texts sent in a single request.

    Returns:
        A list of embedding vectors in the same order as ``texts``.
    """
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai_client.embeddings.create(input=batch, model=model)
        # Each returned entry carries the index of the input it belongs to;
        # sort on it so vectors line up with their texts.
        ordered = sorted(response.data, key=lambda entry: entry.index)
        vectors.extend(entry.embedding for entry in ordered)
    return vectors


# Wrap in a Series aligned to df's index so each row receives its own vector
# as a single list-valued cell.
df["embedding"] = pd.Series(get_embeddings(df["context"].tolist()), index=df.index)

In [None]:
# Rename the "id" column to "tmdb_id", then convert the frame back into a
# list of per-movie dicts.
df = df.rename(columns={"id": "tmdb_id"})
data_to_insert2 = df.to_dict(orient="records")

# Show the resulting records as the cell output.
data_to_insert2

In [None]:
# Write the records, now including their embeddings, to disk.
json_path = "./movies_clean_emb.json"

with open(json_path, mode="w") as out_file:
    json.dump(obj=data_to_insert2, fp=out_file, indent=4)

In [None]:
# Length of the first record's embedding vector.
len(data_to_insert2[0]["embedding"])

In [None]:
# Keys present on the first record.
data_to_insert2[0].keys()

### Upload processed data to Supabase

In [None]:
# Read the records with embeddings back from disk into `data`.
json_path = "./movies_clean_emb.json"

with open(json_path) as in_file:
    data = json.loads(in_file.read())

# NOTE: bare expression; only the last expression of a cell is displayed, so
# this value is discarded.
len(data)

# Show the loaded records as the cell output.
data

In [None]:
# Insert the loaded movie records into the Supabase "movies_tmdb" table,
# one request per chunk of rows.

chunk_size = 100
# Count the list that is actually sliced below (`data`), so the cell works
# after a kernel restart where only the load cell has been run.
item_count = len(data)

for start in range(0, item_count, chunk_size):
    stop = min(start + chunk_size, item_count)
    print(f"[INFO] Inserting items {(start, stop)}...")
    supabase_client.table("movies_tmdb").insert(data[start:stop]).execute()

print("[INFO] SUCCESS")