### Description:
This notebook guides how to use Batch API from OpenAI to process asynchornus request, lower cost purpose and decrease rate limit

In [4]:
import json
from openai import OpenAI
import pandas as pd
from IPython.display import Image, display

### 1. Init OpenAI Client

In [5]:
# Initializing OpenAI client - see https://platform.openai.com/docs/quickstart?context=python
client = OpenAI(api_key='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')

### 2. Load data:

In [9]:
dataset_path = "./imdb_top_1000.csv"

df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


### 3. Processing Step - categorize and describe

In [10]:
categorize_system_prompt = '''
Your goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.
You will be provided with a movie description, and you will output a json object containing the following information:

{
    categories: string[] // Array of categories based on the movie description,
    summary: string // 1-sentence summary of the movie based on the movie description
}

Categories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.
Movies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.
'''

def get_categories(description):
    response = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.1,
    # This is to enable JSON mode, making sure responses are valid json objects
    response_format={ 
        "type": "json_object"
    },
    messages=[
        {
            "role": "system",
            "content": categorize_system_prompt
        },
        {
            "role": "user",
            "content": description
        }
    ],
    )

    return response.choices[0].message.content

### 4. Test on few examples:

In [11]:
# Testing on a few examples
for _, row in df[:5].iterrows():
    description = row['Overview']
    title = row['Series_Title']
    result = get_categories(description)
    print(f"TITLE: {title}\nOVERVIEW: {description}\n\nRESULT: {result}")
    print("\n\n----------------------------\n\n")

TITLE: The Shawshank Redemption
OVERVIEW: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.

RESULT: {
    "categories": ["drama"],
    "summary": "Two imprisoned men develop a deep bond over the years, ultimately finding solace and redemption through their shared acts of decency."
}


----------------------------


TITLE: The Godfather
OVERVIEW: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.

RESULT: {
    "categories": ["crime", "drama"],
    "summary": "An aging crime lord hands over his empire to his hesitant son amidst the challenges of organized crime."
}


----------------------------


TITLE: The Dark Knight
OVERVIEW: When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.

RESULT: {
    "categories": ["action", "th

### 5. Creating the Batch file - jsonl format

In [12]:
# Creating an array of json tasks

tasks = []

for index, row in df.iterrows():
    
    description = row['Overview']
    
    task = {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # This is what you would have in your Chat Completions API call
            "model": "gpt-4o-mini",
            "temperature": 0.1,
            "response_format": { 
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": categorize_system_prompt
                },
                {
                    "role": "user",
                    "content": description
                }
            ],
        }
    }
    
    tasks.append(task)

In [13]:
tasks

[{'custom_id': 'task-0',
  'method': 'POST',
  'url': '/v1/chat/completions',
  'body': {'model': 'gpt-4o-mini',
   'temperature': 0.1,
   'response_format': {'type': 'json_object'},
   'messages': [{'role': 'system',
     'content': '\nYour goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.\nYou will be provided with a movie description, and you will output a json object containing the following information:\n\n{\n    categories: string[] // Array of categories based on the movie description,\n    summary: string // 1-sentence summary of the movie based on the movie description\n}\n\nCategories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.\nMovies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.\n'},
    {'role': 'user',
     'content': 'Two i

In [14]:
# Creating the file

file_name = "./batch_tasks_movies.jsonl"

with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

### 6. Upload batch file to OpenAI:

In [15]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)

In [16]:
print(batch_file)

FileObject(id='file-Qh7oPmd3RS5QwSzURkuoU5', bytes=1125310, created_at=1747185536, filename='batch_tasks_movies.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


### 8. Running on OpenAI

In [17]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

### 7. Ultilities:

In [12]:
# get status batch job: 
batch_job = client.batches.retrieve('batch_6823ef8317cc8190a03c4c94a463ec10')
print(batch_job)

Batch(id='batch_6823ef8317cc8190a03c4c94a463ec10', completion_window='24h', created_at=1747185539, endpoint='/v1/chat/completions', input_file_id='file-Qh7oPmd3RS5QwSzURkuoU5', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747271939, failed_at=None, finalizing_at=None, in_progress_at=1747185541, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=851, failed=0, total=1000))


In [11]:
result_file_id = batch_job.output_file_id
result_file_id

In [9]:
# get status batch job
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

ValueError: Expected a non-empty value for `file_id` but received None

In [None]:
# save this file:
result_file_name = "data/batch_job_results_movies.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [6]:
# Loading data from saved file
results = []
with open('/workspace/competitions/Sly/RAG_Traffic_Law_experiment/duy/notebook/batch_api_gpt/batch_682331c6ff3c8190b3d363668c3e4e49_error.jsonl', 'r') as file:
    for line in file:
        # Parsing the JSON string into a dict and appending to the list of results
        json_object = json.loads(line.strip())
        results.append(json_object)

In [8]:
results

[{'id': 'batch_req_682348f7e1a481909bbbd7b33de3d511',
  'custom_id': 'task-0',
  'response': {'status_code': 403,
   'request_id': '70a0ac620e98a4b39a6bc1d4b6670429',
   'body': {'error': {'message': 'Project `proj_vCir8w9KNlY46aEYwkemDpHP` does not have access to model `gpt-4o-mini-2024-07-18-batch`',
     'type': 'invalid_request_error',
     'param': None,
     'code': 'model_not_found'}}},
  'error': None},
 {'id': 'batch_req_682348f7fe508190b3fa01a6e0985920',
  'custom_id': 'task-1',
  'response': {'status_code': 403,
   'request_id': 'a210d31b737630d8a3762caf7b6f9a92',
   'body': {'error': {'message': 'Project `proj_vCir8w9KNlY46aEYwkemDpHP` does not have access to model `gpt-4o-mini-2024-07-18-batch`',
     'type': 'invalid_request_error',
     'param': None,
     'code': 'model_not_found'}}},
  'error': None},
 {'id': 'batch_req_682348f81cd081909f3d59c55f61d142',
  'custom_id': 'task-2',
  'response': {'status_code': 403,
   'request_id': '1e8ee3a687705ecf0d85c7c933b06595',
   

In [7]:
# Reading only the first results
for res in results[:5]:
    task_id = res['custom_id']
    # Getting index from task id
    index = task_id.split('-')[-1]
    result = res['response']['body']['choices'][0]['message']['content']
    movie = df.iloc[int(index)]
    description = movie['Overview']
    title = movie['Series_Title']
    print(f"TITLE: {title}\nOVERVIEW: {description}\n\nRESULT: {result}")
    print("\n\n----------------------------\n\n")

KeyError: 'choices'