In [3]:
import pandas as pd
import pandas as pd
import numpy as np
import warnings
from sklearn.manifold import TSNE
import plotly.express as px
from helpers import load_data, get_embedding
import plotly.graph_objects as go
import threading
import time
from tqdm import tqdm
import json
import tiktoken

import os

# Silence every warning for the rest of the session (applies to all categories).
warnings.filterwarnings("ignore")

# Location of the project data. The original machine-specific directory is kept
# as the default so existing runs behave the same; set TADA_DATA_PATH to point
# the notebook at another checkout without editing this cell.
DATA_PATH = os.environ.get(
    'TADA_DATA_PATH',
    '/Users/tanguydeclety/Documents/GitHub/ada-2023-project-tada/data/tada/',
)

# Load the data
loaded_data = load_data(DATA_PATH)

# Unpack the individual datasets returned by the project helper `load_data`.
character_metadata = loaded_data['character_metadata']
movie_metadata = loaded_data['movie_metadata']
plot_summaries = loaded_data['plot_summaries']
embeddings = loaded_data['embeddings']
# Read later through its 'Wikipedia movie ID' and 'Summary' columns.
combined_plot_summaries = loaded_data['combined_plot_summaries']

ModuleNotFoundError: No module named 'tiktoken'

In [None]:
from openai import OpenAI
# Maps a Wikipedia movie ID to the raw JSON string returned by the model.
answers = {}

# No arguments are passed, so credentials come from the library's own defaults
# (presumably the OPENAI_API_KEY environment variable - TODO confirm).
client = OpenAI()

In [None]:
# Resume from the checkpoint written by save_to_file(), if there is one.
# The parsed JSON object replaces `answers`; its keys are strings because
# JSON object keys always are.
try:
    with open('movie_analysis.json', 'r') as f:
        answers = json.load(f)
except FileNotFoundError:
    # First run (or checkpoint removed): start with nothing cached rather than
    # aborting the notebook.
    print("No movie_analysis.json checkpoint found; starting with no cached answers.")
    answers = {}

In [None]:
# Number of movies that already have a stored answer.
print(str(len(answers)))

In [None]:
# System message sent with every chat completion request made by process_movie.
# It asks the model for a JSON object with three entries: "cities", "countries"
# and "characters" (each character carrying "nationality" and "alignment"),
# and shows one worked example.
# NOTE(review): the prompt text contains typos ("Nationationalities", a stray
# "/" after "empty list"). They are left untouched on purpose: editing the
# prompt changes what the model receives and therefore may change its answers
# relative to the ones already stored.
system_prompt = """

         You will give some extra information about a movie given its summary.
         You will give a json string that will contain the following:
         - Cities and towns where the action takes place include every city. If there are no cities, give an empty list/
         - Countries where the action takes place. If there is no country, give an empty list.
         - Make a dictionary of main characters and give for each: nationality(if present), if the character is evil, neutral or good.
        
         If the cities/ countries are fictional and thus not real you should not include them at all. Same for nationalities.
         So for example you should not include Hogwarts as a city, or Middle Earth as a country or Panem as a country.
         
         Nationationalities should also only be the country.
         Example:

         Movie Description:
         "In 'The Heist', a skilled team of thieves plan a daring bank robbery in downtown Los Angeles. The plot thickens when they realize they are being pursued by an FBI agent determined to bring them to justice. The story is a thrilling cat-and-mouse game set against the backdrop of the bustling city." 
         {
            "cities": [
               "Los Angeles"
            ],
            "countries": [
                "USA"
            ],
            "characters": {
                "John": {
                    "nationality": "USA",
                    "alignment": "evil"
                },
                "Agent Smith": {
                    "nationality": "USA",
                    "alignment": "neutral"
                }
            }
        }
         """

In [None]:
def num_tokens_from_string(string: str, encoding_name: str = 'cl100k_base') -> int:
    """Return how many tokens `string` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the token sequence produced by the encoding.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [None]:
# --- Tunables -------------------------------------------------------------
MAX_THREADS = 10            # divisor used to size the per-thread batches
PRINT_INTERVAL = 50         # print the running cost every N processed movies
SAVE_INTERVAL = 100         # rewrite the default checkpoint every N movies
LARGE_SAVE_INTERVAL = 500   # also write a numbered snapshot every N movies

# --- State shared by the worker threads ------------------------------------
# `lock` is held by process_movie whenever it reads or updates `answers` and
# the counters below.
lock = threading.Lock()
processed_count = input_token_count = output_token_count = 0

def process_movie(wiki_id, movie):
    """Ask the model to analyse one movie summary and store the raw answer.

    The movie is ignored when `answers` already has an entry for it under the
    id itself, its string form, or (when convertible) its integer form. The
    request is attempted up to three times with a 45 second pause between
    attempts; if every attempt fails the movie is reported as skipped. On
    success the response text is stored in `answers[wiki_id]`, the shared
    counters are updated, and the periodic checkpoint / cost report runs.

    Args:
        wiki_id: Identifier used as the key in `answers`.
        movie: Summary text interpolated into the user message.

    Returns:
        None. Results are recorded in the module-level `answers`,
        `processed_count`, `input_token_count` and `output_token_count`.
    """
    global processed_count, input_token_count, output_token_count, answers, lock

    max_attempts = 3
    retry_wait_seconds = 45
    user_prompt = f"Analyze this movie: {movie}"

    with lock:
        lookup_keys = [wiki_id, str(wiki_id)]
        try:
            lookup_keys.append(int(wiki_id))
        except (TypeError, ValueError):
            # A non-numeric id has no integer form; it must not be mistaken
            # for a failed request and retried.
            pass
        if any(key in answers for key in lookup_keys):
            print("Movie already processed ", wiki_id)
            return

    for attempt in range(1, max_attempts + 1):
        try:
            input_tokens = num_tokens_from_string(user_prompt) + num_tokens_from_string(system_prompt)

            response = client.chat.completions.create(
                model="gpt-3.5-turbo-1106",
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                seed=42,
            )

            content = response.choices[0].message.content
            output_tokens = num_tokens_from_string(content)
            break
        except Exception as error:
            # Report the real failure: it may be a rate limit, but it may just
            # as well be an authentication, network or tokenizer error.
            print(f"Request for movie {wiki_id} failed (attempt {attempt}/{max_attempts}): {error!r}")
            if attempt < max_attempts:
                # Only wait when another attempt will actually follow.
                print(f"Waiting {retry_wait_seconds} seconds before retrying...")
                time.sleep(retry_wait_seconds)
    else:
        print("Movie skipped ", wiki_id)
        return

    with lock:
        answers[wiki_id] = content
        processed_count += 1
        input_token_count += input_tokens
        output_token_count += output_tokens

        # Checkpointing is best effort: the answer is already recorded in
        # memory, so a failed save is reported instead of being treated as a
        # failed request (which would trigger a pointless sleep and retry).
        try:
            if processed_count % SAVE_INTERVAL == 0:
                print("Saving progress...")
                save_to_file()
            if processed_count % LARGE_SAVE_INTERVAL == 0:
                print("Saving large progress...")
                save_to_file(f"movie_analysis_{processed_count}.json")
            if processed_count % PRINT_INTERVAL == 0:
                print_cost()
        except Exception as error:
            print(f"Could not save progress after movie {wiki_id}: {error!r}")

    print("Movie processed ", wiki_id)

def print_cost():
    """Print the running cost estimate and the token totals.

    Reads the module-level `input_token_count`, `output_token_count` and
    `processed_count`. Input tokens are priced at 0.001 and output tokens at
    0.002 per 1000 tokens; the sum is printed with two decimals.
    """
    usd_for_input = input_token_count / 1000 * 0.001
    usd_for_output = output_token_count / 1000 * 0.002
    print(f"Processed {processed_count} movies. Total cost so far: ${usd_for_input + usd_for_output:.2f}")
    print(f"Input tokens: {input_token_count}, Output tokens: {output_token_count}")

def save_to_file(filename='movie_analysis.json'):
    """Write the module-level `answers` dict to `filename` as JSON.

    The data is serialised in memory first and then written to a temporary
    sibling file that replaces `filename` in one step, so a serialisation
    error or an interruption never leaves a truncated checkpoint behind.
    Keys of a type the JSON encoder rejects (for example numpy integers) are
    written as their `str()` form; keys it already accepts are left to the
    encoder, exactly as before.

    Args:
        filename: Destination path of the JSON file.
    """
    import os

    global answers
    json_key_types = (str, int, float, bool, type(None))
    serializable = {
        (key if isinstance(key, json_key_types) else str(key)): value
        for key, value in answers.items()
    }
    # Serialise before touching the disk: opening the target in 'w' mode would
    # empty the previous checkpoint even if encoding then failed.
    payload = json.dumps(serializable)

    temp_name = f"{filename}.tmp"
    with open(temp_name, 'w') as file:
        file.write(payload)
    os.replace(temp_name, filename)

def process_batch(start_index, batch_size):
    """Run process_movie on a contiguous slice of `combined_plot_summaries`.

    Args:
        start_index: Position of the first row to handle.
        batch_size: Maximum number of rows to handle; positions at or beyond
            the end of the frame are ignored.
    """
    stop_index = min(start_index + batch_size, len(combined_plot_summaries))
    if stop_index <= start_index:
        return

    movie_ids = combined_plot_summaries['Wikipedia movie ID'].values
    summaries = combined_plot_summaries['Summary'].values
    for row in range(start_index, stop_index):
        process_movie(movie_ids[row], summaries[row])

# Split the summaries into contiguous batches and process each batch in its
# own thread.
threads = []
num_movies = len(combined_plot_summaries)
# Ceiling division keeps the number of batches (and threads) at or below
# MAX_THREADS; floor division produced one extra thread whenever the count was
# not an exact multiple. The lower bound of 1 avoids a zero step in range()
# when there are fewer movies than threads.
batch_size = max(1, -(-num_movies // MAX_THREADS))

for i in range(0, num_movies, batch_size):
    thread = threading.Thread(target=process_batch, args=(i, batch_size))
    threads.append(thread)
    thread.start()

# Wait for every worker; the progress bar advances once per finished thread.
for thread in tqdm(threads):
    thread.join()

print("All movies processed.")



In [None]:
# Write the complete set of answers to its own file once processing is done.
save_to_file(filename="movie_analysis_final.json")