In [2]:
import os
from openai import OpenAI
import pandas as pd

#os.environ["PERPLEXITY_API_KEY"] = 'key' # should be removed when publishing the code

# OpenAI SDK client pointed at the Perplexity API base URL. The key is read
# from the environment, so a missing PERPLEXITY_API_KEY raises KeyError here.
client = OpenAI(base_url="https://api.perplexity.ai", api_key=os.environ["PERPLEXITY_API_KEY"])

def perplexity(prompt, system_prompt, model="sonar", temperature=0):
    """Send one system + user message pair to the chat-completions endpoint.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Text sent as the system message.
        model: Model name passed to the API. Defaults to "sonar".
        temperature: Sampling temperature passed to the API. Defaults to 0.

    Returns:
        A ``(text, extras)`` tuple. ``text`` is the first choice's message
        content with surrounding whitespace stripped, or "" when the request
        fails or the reply carries no content. ``extras`` is always an empty
        list; it is kept so callers can keep unpacking two values.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        content = response.choices[0].message.content
        # A reply without text content (None) is treated as an empty answer
        # instead of surfacing as an AttributeError from .strip().
        return (content or "").strip(), []
    except Exception as e:
        # Best-effort by design: report the failure and return an empty answer
        # so a batch of requests is not aborted by one bad call.
        print("Error:", e)
        return "", []


In [None]:
import os

# Input table read from CSV; the annotation loop iterates over its rows.
df = pd.read_csv("letterboxd_urls_and_ratings.csv")

# Output/checkpoint file. Start from "nothing annotated yet" and override
# only when a file from an earlier run is already on disk.
tags_file = "letterboxd_urls_ratings_tags.csv"
tags_df = pd.DataFrame()
annotated_titles = set()
if os.path.exists(tags_file):
    tags_df = pd.read_csv(tags_file)
    # Titles already present in the saved file; used to skip finished rows.
    annotated_titles = set(tags_df['title'])

# System message sent with every request. It lists the five fields the model
# should return as a JSON object and asks for "Unknown" when a field cannot be
# determined. The text is part of the request, so edits change model output.
system_prompt = """You are a movie researcher finding information about movies. 
Given a movie's title, director, and release year, return the following JSON:
{
    "female_protagonist": "Yes" or "No" (depending on whether the movie's main character is female)
    "lgbtq_protagonist": "Yes" or "No" (depending on whether the movie's main character is a member of the LGBTQ+ community)
    "white_protagonist": "Yes" or "No" (depending on whether the movie's main character is white)
    "political/social_or_moral_topic": "Name of topic" or "No" (depending on whether the movie focuses on a controversial political topic)
    "experimental": "Yes" or "No" (depending on whether the movie has an experimental style/plot)
}

If the information is not available or unclear, return "Unknown" for the given field. Only output the json without any additional text or explanation
"""

# Seed the results with rows saved by a previous run, because the output file
# is rewritten in full (old rows + new rows) after every annotated movie.
tags = tags_df.to_dict("records") if not tags_df.empty else []

for index, row in df.iterrows():
    title = row['title']
    # NOTE(review): resume is keyed on title only, so distinct films sharing a
    # title are skipped on a later run once one of them is saved. Confirm
    # this is acceptable for the dataset.
    if title in annotated_titles:
        continue  # Skip already annotated movies

    # Start from every column of the source row; the model output is added
    # under the "tags" key below.
    tag_entry = row.to_dict()
    director = row['director']
    release_year = row['release_year']

    prompt = f"""

Movie Title: {title}
Director: {director}
Release Year: {release_year}

"""

    result, _ = perplexity(prompt, system_prompt)

    # Keep only the outermost {...} span. This drops code fences or prose
    # around the object without touching text inside it. str.find/rfind
    # return -1 instead of raising, so a missing object is checked explicitly.
    start_index = result.find("{")
    end_index = result.rfind("}")
    if start_index == -1 or end_index < start_index:
        # Covers failed requests (empty result) and non-JSON replies. The row
        # is not saved, so it is retried the next time this cell runs.
        print(f"\n\nError extracting JSON for {title}: no JSON object in response")
        continue

    # Collapse the object onto a single line for the CSV cell and the log.
    result = result[start_index:end_index + 1].replace("\n", "").strip()
    tag_entry["tags"] = result
    print(f"index: {index}, tags: {result}")

    tags.append(tag_entry)
    # Checkpoint after every movie so an interrupted run can resume.
    pd.DataFrame(tags).to_csv(tags_file, index=False)


index: 0, tags: {  "female_protagonist": "No",  "lgbtq_protagonist": "Unknown",  "white_protagonist": "Yes",  "political/social_or_moral_topic": "Corporate corruption and workers' rights",  "experimental": "No"}
index: 1, tags: {  "female_protagonist": "Yes",  "lgbtq_protagonist": "Yes",  "white_protagonist": "Yes",  "political/social_or_moral_topic": "Coming-of-age, self-identity, family, love",  "experimental": "No"}
index: 2, tags: {  "female_protagonist": "No",  "lgbtq_protagonist": "Unknown",  "white_protagonist": "Yes",  "political/social_or_moral_topic": "No",  "experimental": "No"}
index: 3, tags: {  "female_protagonist": "Yes",  "lgbtq_protagonist": "No",  "white_protagonist": "No",  "political/social_or_moral_topic": "Family abandonment and reconciliation",  "experimental": "No"}
index: 4, tags: {  "female_protagonist": "No",  "lgbtq_protagonist": "Unknown",  "white_protagonist": "Unknown",  "political/social_or_moral_topic": "Grief and childhood maturity",  "experimental": "