In [11]:
# %pip install jsonlines

In [1]:
import json

def clean_span(span):
    """
    Cleans the given span string by ensuring balanced quotes and removing unwanted commas.

    Args:
        span (str): The span string to clean.

    Returns:
        str: The cleaned span string.
    """
    span = span.strip()
    if span.endswith('"],') or span.endswith('"]'):
        span = span.rstrip('"],').rstrip('"]') + '"'
    return span

def parse_attribute_spans(response_str):
    """
    Parses the attribute spans from the given string and returns a dictionary.

    Args:
        response_str (str): The response string containing the attribute spans.

    Returns:
        dict: A dictionary with attributes as keys and lists of strings as values.
    """
    attributes = ["Cinematography", "Direction", "Story", "Characters", "Production Design", "Unique Concept", "Emotions"]
    spans = {attr: [] for attr in attributes}

    # Remove the surrounding curly braces and split the response into lines
    lines = response_str.strip()[1:-1].strip().splitlines()
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if ":" in line:
            parts = line.split(":", 1)
            attr = parts[0].strip()
            if attr in attributes:
                # Remove the surrounding square brackets and split by comma
                span_list_str = parts[1].strip().strip("[]").strip()
                if span_list_str:
                    # Split by comma but handle cases where there might be commas within strings
                    spans[attr] = [clean_span(item.strip().strip('"').strip()) for item in span_list_str.split('", ') if item.strip().strip('"').strip() and item != ',']
    
    return spans

In [2]:
## Labeling Dataset
import pandas as pd
from openai import OpenAI
import csv
from dotenv import load_dotenv
import os, json

load_dotenv()

apikey = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=apikey)

df = pd.read_csv("../data/IMDB Dataset.csv")
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df = df[9500:]
reviews = list(df["review"][9500:])
df

Unnamed: 0,review,sentiment
9516,"This isn't ""so bad it's good""--It's ""so bad, i...",negative
9517,"I'm a fan of the horror movie, regardless of w...",negative
9518,"maybe i need to have my head examined,but i th...",positive
9519,"Larry Buchanan. Yep, same guy who did ""Attack ...",negative
9520,"Ritchie's first two films were snappy, stylish...",negative
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Batch Request to GPT-4o-mini

In [3]:
import pandas as pd
from openai import OpenAI
import csv
from dotenv import load_dotenv
import os, json

load_dotenv()

apikey = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=apikey)

sys_prompt = """
You are an assistant who gives specific attributes. The attributes are Cinematography, Direction, Story, Characters, Production Design, Unique Concept, and Emotions. 

YOUR INPUT WOULD BE LIKE THIS:
Review: "The cinematography was stunning, but the story was weak. I loved the movie. There wasn't anything unique in the movie. characters could've been better tho."

YOU MUST FOLLOW THE OUTPUT FORMAT GIVEN BELOW. DON'T WRITE ANYTHING ELSE:
{
Cinematography: [list of strings with chunks where cinematography is discussed],
Direction: [list of strings with chunks where direction is discussed],
Story: [list of strings with chunks where story is discussed],
Characters: [list of strings with chunks where characters are discussed],
Production Design: [list of strings with chunks where production design is discussed],
Unique Concept: [list of strings with chunks where unique concept is discussed],
Other: [list of strings that mentions other things related to movie]
}

if something is not discussed, add empty list infront of it.

"""

# Define the attributes
attributes = ["Cinematography", "Direction", "Story", "Characters", "Production Design", "Unique Concept", "Emotions"]



def get_sentiment_spans(review):
    prompt = "Label the following review below:" + f"\nReview: {review}\n"
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": prompt}
        ]
    )

    response = completion.choices[0].message.content.strip()
    spans = parse_attribute_spans(response)
    return spans

def parse_attribute_spans(response):
    try:
        spans = json.loads(response)
    except json.JSONDecodeError:
        spans = {attr: [] for attr in attributes}
    return spans

def label_dataset_and_save(sentences, batch_size, output_file, checkpoint_file):
    start_index = 0

    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as file:
            start_index = int(file.read().strip())

    requests = []
    for i in range(start_index, len(sentences)):
        sentence = sentences[i]
        prompt = f"Label the following review below:\nReview: {sentence}\n"
        request = {
            "custom_id": f"request-{i+1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 1000
            }
        }
        requests.append(request)

        if (i + 1) % batch_size == 0 or (i + 1) == len(sentences):
            with open(output_file, 'a') as file:
                for req in requests:
                    json.dump(req, file)
                    file.write('\n')
            requests = []
            with open(checkpoint_file, 'w') as file:
                file.write(str(i + 1))
            print(f"Processed and saved {i + 1} sentences")


output_file = '../data/spans_batch_requests.jsonl'
checkpoint_file = 'checkpoint.txt'
batch_size = 40082  # Adjust batch size as needed

label_dataset_and_save(reviews, batch_size, output_file, checkpoint_file)


### Count Tokens (Approx) per request

In [5]:
import tiktoken
import jsonlines


# Initialize the encoding for GPT-4
encoding = tiktoken.get_encoding("cl100k_base")

# Open the .jsonl file
with jsonlines.open('../data/spans_batch_requests.jsonl') as reader:
    total_tokens = 0
    for obj in reader:
        # Assuming each object has 'review' and 'label' keys
        example_sentence = f"""{obj}"""

        # Combine the prompt and the example sentence
        full_prompt = example_sentence

        # Encode the text to get the number of tokens
        num_tokens = len(encoding.encode(full_prompt))
        total_tokens += num_tokens

print(f"Total number of tokens used in the file: {total_tokens}")


Total number of tokens used in the file: 18926661


### 2. Uploading Your Batch Input File
Similar to our Fine-tuning API, you must first upload your input file so that you can reference it correctly when kicking off batches. Upload your .jsonl file using the Files API.

### 3. Creating the Batch
Once you've successfully uploaded your input file, you can use the input File object's ID to create a batch. In this case, let's assume the file ID is file-abc123. For now, the completion window can only be set to 24h. You can also provide custom metadata via an optional metadata parameter.


In [1]:

from openai import OpenAI
client = OpenAI()

batch_input_file = client.files.create(
  file=open("../data/spans_batch_requests.jsonl", "rb"),
  purpose="batch"
)

batch_input_file_id = batch_input_file.id

client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "lm6 spans job"
    }
)

### Monitor and Retrieve

In [3]:
from openai import OpenAI
client = OpenAI()

client.batches.retrieve("batch_Bd91ong")

In [2]:
client.batches.cancel("batch_Bd91ong")

In [None]:
from openai import OpenAI
client = OpenAI()

file_response = client.files.content("file-xyz123")
print(file_response.text)