In [None]:
import os
from openai import OpenAI
import pandas as pd

# The API key is read from the environment and is never written into the notebook,
# so it cannot be committed or published by accident. Set PERPLEXITY_API_KEY before
# starting the kernel (e.g. `export PERPLEXITY_API_KEY=...` in the launching shell).
if not os.environ.get("PERPLEXITY_API_KEY"):
    # Fail fast: without a real key every request would be rejected, and the
    # request helper below would swallow those errors and return empty results.
    raise RuntimeError(
        "PERPLEXITY_API_KEY is not set. Export it in the environment before running this notebook."
    )

# OpenAI SDK client pointed at the Perplexity endpoint through `base_url`.
client = OpenAI(
    api_key=os.environ["PERPLEXITY_API_KEY"],
    base_url="https://api.perplexity.ai"
)

def perplexity(prompt, system_prompt):
    """Run a single system + user chat completion through the module-level ``client``.

    Args:
        prompt: Text sent as the ``user`` message.
        system_prompt: Text sent as the ``system`` message.

    Returns:
        A 2-tuple ``(text, extras)``. ``text`` is the first choice's message
        content with surrounding whitespace stripped; ``extras`` is always an
        empty list. If anything goes wrong while requesting or reading the
        reply, the exception is printed and ``("", [])`` is returned instead
        of raising.
    """
    try:
        completion = client.chat.completions.create(
            model="sonar",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
        )
        # Reading the reply stays inside the try so a missing/None content
        # falls back to the empty result like any other failure.
        reply_text = completion.choices[0].message.content
        return reply_text.strip(), []
    except Exception as err:
        print("Error:", err)
        return "", []


In [9]:
### FOR THE TAGGING ###

# Input: the CSV produced by rating_scraper.ipynb in step 3. It replaces the
# earlier "random_movies_1000.csv", whose location is not known.
df = pd.read_csv("letterboxd_urls_and_ratings.csv")

# How many movies to tag in this run. Each movie costs one API request, so keep
# this small while testing and raise it for the full run.
N_MOVIES_TO_TAG = 2
# NOTE: the name `first_10` is historical; it holds the first N_MOVIES_TO_TAG rows.
first_10 = df.head(N_MOVIES_TO_TAG)

system_prompt = """You are a movie researcher finding information about movies. 
Given a movie's title, director, and release year, return the following JSON:
{
    "female_protagonist": "Yes" or "No" (depending on whether the movie's main character is female)
    "lgbtq_protagonist": "Yes" or "No" (depending on whether the movie's main character is a member of the LGBTQ+ community)
    "white_protagonist": "Yes" or "No" (depending on whether the movie's main character is white)
    "political_topic": "Name of topic" or "No" (depending on whether the movie focuses on a controversial political topic)
    "experimental": "Yes" or "No" (depending on whether the movie has an experimental style/plot)
}

If the information is not available or unclear, return "Unknown" for the given field. Only output the json without any additional text or explanation
"""


def _extract_json_object(text):
    """Return the outermost ``{...}`` span of ``text`` on a single line.

    Everything outside the first ``{`` and the last ``}`` (for example a
    Markdown code fence such as ```` ```json ````) is dropped by the slice
    itself, so the content between the braces is never edited apart from
    removing newlines.

    Args:
        text: Raw model reply.

    Returns:
        The brace-delimited span with newlines removed, or ``""`` when the
        reply contains no well-ordered ``{`` ... ``}`` pair.
    """
    start_index = text.find("{")
    end_index = text.rfind("}")
    # str.find/rfind return -1 instead of raising, so check explicitly.
    if start_index == -1 or end_index < start_index:
        return ""
    return text[start_index:end_index + 1].replace("\n", "").strip()


tags = []

for index, row in first_10.iterrows():
    title = row['title']
    release_year = row['release_year']
    director = row['director']

    prompt = f"""

Movie Title: {title}
Director: {director}
Release Year: {release_year}

"""

    raw_reply, _unused = perplexity(prompt, system_prompt)

    # Keep only the JSON object from the reply (stored as a string).
    tags_json = _extract_json_object(raw_reply)
    if not tags_json:
        # Covers both a failed request (empty reply) and a reply without JSON;
        # the row is still recorded so it can be spotted and retried later.
        print(f"Error extracting JSON for {title}: no JSON object found in response")

    # One output record per movie: title, director, release year, and tags.
    record = {
        "title": title,
        "director": director,
        "release_year": release_year,
        "tags": tags_json
    }
    print(record)

    tags.append(record)
    # Checkpoint: rewrite the CSV after every movie so results gathered so far
    # survive an interruption part-way through a long run.
    tags_df = pd.DataFrame(tags)
    tags_df.to_csv("letterboxd_urls_ratings_tags.csv", index=False)


{'title': 'Time for\xa0Revenge', 'director': 'Adolfo Aristarain', 'release_year': 1981, 'tags': '{  "female_protagonist": "No",  "lgbtq_protagonist": "Unknown",  "white_protagonist": "Yes",  "political_topic": "Argentina\'s civil-military dictatorship",  "experimental": "No"}'}
{'title': 'Japanese\xa0Borscht', 'director': 'Eric Spade Rivas', 'release_year': 2019, 'tags': '{  "female_protagonist": "No",  "lgbtq_protagonist": "Unknown",  "white_protagonist": "Yes",  "political_topic": "No",  "experimental": "No"}'}
