In [42]:
import json
from openai import OpenAI
from datasets import load_dataset
from config import OPEN_AI_TOKEN
from config import PROJECT_DIR
from general_utils.reader import JSONLineReader
from tqdm import tqdm
from general_utils.utils import parse_model_answer

In [2]:
# Shared OpenAI client used by every API call in this notebook; the API key is
# imported from the project's config module rather than hard-coded here.
client = OpenAI(api_key=OPEN_AI_TOKEN)

In [76]:
# Load the train split of the German claim-verification dataset from the Hugging Face Hub.
# Requesting the split explicitly returns the Dataset itself and raises if the split is
# missing, instead of `.get('train')` silently producing None and failing in a later cell.
dataset = load_dataset("lukasellinger/german_claim_verification_dissim-v1", split="train")

# Preview the first five rows (slicing yields a dict of column name -> list of values).
dataset[:5]

Downloading readme:   0%|          | 0.00/585 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 95.3k/95.3k [00:00<00:00, 165kB/s]


Generating train split:   0%|          | 0/710 [00:00<?, ? examples/s]

{'id': [1, 6, 7, 8, 9],
 'word': ['Liebe', 'Liebe', 'Liebe', 'Liebe', 'Liebe'],
 'english_word': ['Love', 'Love', 'Love', 'Love', 'Love'],
 'label': ['SUPPORTED', 'SUPPORTED', 'SUPPORTED', 'SUPPORTED', 'SUPPORTED'],
 'claim': ['inniges Gefühl der Zuneigung für jemanden oder für etwas',
  'sexuell oder erotisch motivierte Neigung zu jemandem oder zu einer Sache',
  'Geschlechtsakt; Akt der körperlichen Vereinigung in Folge von 2',
  'Liebschaft',
  'innige und gefühlsbetonte Beziehung zu einer Sache, einer Idee, einem Ziel oder Ähnlichem'],
 'english_claim': ['a deep feeling of affection for someone or for something',
  'sexually or erotically motivated inclination towards someone or something',
  'sexual act; act of physical union in succession of 2',
  'Love',
  'intimate and emotional relationship to a thing, an idea, a goal or similar'],
 'context_sentence': ['Die Liebe überwindet alle Grenzen.',
  'homosexuelle Liebe, lesbische Liebe',
  'Sie machten Liebe, ohne an die Konsequenzen

In [33]:
def get_prediction(claim, model="gpt-3.5-turbo"):
    """Ask a chat model whether ``claim`` is true or false.

    Sends a single user message of the form ``Input: <claim> True or False?``
    followed by an ``Output:`` cue on the next line, at temperature 0.1.

    Args:
        claim: Text of the statement to judge.
        model: Name of the chat-completions model to query.

    Returns:
        The message content of the first choice in the response, unparsed.
    """
    prompt = f'Input: {claim} True or False?\nOutput:'
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model=model,
        temperature=0.1,
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [77]:
# Smoke test: run the prompt on the first three rows and print each raw model answer
# before spending API calls on the full dataset.
for entry in dataset.select(range(3)):
    word = entry.get('word')
    claim = f"{word}: {entry['claim']}"
    result = get_prediction(claim)
    # One print call emits the report followed by the separator, each newline-terminated.
    print(
        f"WORD: {word}\nOVERVIEW: {claim}\n\nRESULT: {result}",
        "\n\n----------------------------\n\n",
        sep="\n",
    )

WORD: Liebe
OVERVIEW: Liebe: inniges Gefühl der Zuneigung für jemanden oder für etwas

RESULT: True


----------------------------


WORD: Liebe
OVERVIEW: Liebe: sexuell oder erotisch motivierte Neigung zu jemandem oder zu einer Sache

RESULT: True


----------------------------


WORD: Liebe
OVERVIEW: Liebe: Geschlechtsakt; Akt der körperlichen Vereinigung in Folge von 2

RESULT: False


----------------------------




In [34]:
# Synchronous run: query gpt-4o once per dataset row, parse the lower-cased answer,
# and write all records to a JSON Lines file after the loop finishes.
fh = JSONLineReader()

lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    # The prompt claim is the headword followed by its candidate definition.
    claim = f"{word}: {entry['claim']}"
    generated_answer = get_prediction(claim, model='gpt-4o').lower()
    predicted = parse_model_answer(generated_answer)

    record = dict(
        id=entry['id'],
        word=entry['word'],
        claim=claim,
        predicted=predicted,
        label=entry['label'],
    )
    lines.append(record)

fh.write('output_gpt4o.jsonl', lines)

100%|██████████| 168/168 [02:24<00:00,  1.17it/s]


# Creating batch tasks

In [78]:
# Build one Batch API request per dataset row. The row's position in `dataset`
# is embedded in `custom_id` ("task-<idx>") so results can be matched back later.
model = "gpt-3.5-turbo"
tasks = []

for idx, entry in enumerate(dataset):
    word = entry['word']
    claim = f"{word}: {entry['claim']}"

    # Same payload a direct Chat Completions API call would send.
    body = {
        "model": model,
        "temperature": 0.1,
        "messages": [
            {"role": "user", "content": f'Input: {claim} True or False?\nOutput:'},
        ],
    }

    task = {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }
    tasks.append(task)

In [79]:
# Serialize the batch requests as JSON Lines: one JSON object per line,
# written to the file that is uploaded in the next step.
file_name = PROJECT_DIR.joinpath('dataset/openai/batch_german_contextless-gpt3_5-turbo.jsonl')

with open(file_name, 'w') as file:
    file.writelines(json.dumps(obj) + '\n' for obj in tasks)

In [80]:
# Upload the JSONL request file with purpose "batch". The context manager closes
# the local file handle as soon as the upload call returns, so it is not leaked
# for the lifetime of the kernel.
with open(file_name, "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch"
    )

In [81]:
# Show the metadata returned for the upload (file id, size, status) as a sanity check.
print(batch_file)

FileObject(id='file-VWp7f9KvRBKSXmr5XLo6XG4C', bytes=205971, created_at=1720519670, filename='batch_german_contextless-gpt3_5-turbo.jsonl', object='file', purpose='batch', status='processed', status_details=None)


# Creating batch job

In [82]:
# Submit a batch job that runs every request in the uploaded file against the
# chat completions endpoint, with a requested completion window of "24h".
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [85]:
# Refresh the job's state from the API and print it.
# NOTE(review): this is a one-shot status check, not a poll — the results cell
# reads `batch_job.output_file_id`, so re-run this cell until the job has finished.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

Batch(id='batch_2Eg6lPJHvJq3dBlDPrzrfYuv', completion_window='24h', created_at=1720519726, endpoint='/v1/chat/completions', input_file_id='file-VWp7f9KvRBKSXmr5XLo6XG4C', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1720519855, error_file_id=None, errors=None, expired_at=None, expires_at=1720606126, failed_at=None, finalizing_at=1720519832, in_progress_at=1720519727, metadata=None, output_file_id='file-5YD9A0YKqQY84gixUTATBTvE', request_counts=BatchRequestCounts(completed=710, failed=0, total=710))


# Retrieving results

In [86]:
# Download the raw bytes of the batch output file.
result_file_id = batch_job.output_file_id

# The job object carries no output file id until the API has produced one; fail
# with an actionable message instead of passing None to the download call.
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file yet (status={batch_job.status!r}); "
        "re-run the retrieve cell once the job has completed."
    )

result = client.files.content(result_file_id).content

In [87]:
# Persist the downloaded bytes unchanged so later cells can re-read them without
# another API call. The path is relative, so it resolves against the working directory.
result_file_name = "batch_german_job_results.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [88]:
# Load the saved batch output back from disk using the project's JSON Lines reader.
# Each element is later indexed by 'custom_id' and 'response'.
results = JSONLineReader().read(result_file_name)

In [90]:
# Join each batch response back to its dataset row and record the parsed prediction.
lines = []
for res in results:
    # custom_id has the form "task-<idx>", where <idx> is the row's position in `dataset`.
    index = int(res['custom_id'].split('-')[-1])
    entry = dataset[index]

    generated_answer = res['response']['body']['choices'][0]['message']['content'].lower()
    predicted = parse_model_answer(generated_answer)

    # NOTE(review): 'claim' is stored as the bare claim text here, whereas the
    # synchronous gpt-4o cell stores it prefixed with "<word>: " — confirm which
    # form downstream evaluation expects.
    lines.append({
        'id': entry['id'],
        'word': entry['word'],
        'claim': entry['claim'],
        'predicted': predicted,
        'label': entry['label']
    })

# Use a fresh writer so this cell does not depend on `fh` left over from the
# gpt-4o cell, which would otherwise have to be re-run first.
JSONLineReader().write('output.jsonl', lines)