In [None]:
import os

import pandas as pd

# Load the test data.
# The location can be overridden through the PATENTMATCH_TEST_CSV environment
# variable so the notebook can run on machines other than the one it was
# written on; when the variable is unset, the original hard-coded path is used.
csv_file = os.environ.get(
    'PATENTMATCH_TEST_CSV',
    r'C:\Users\orgrd\workspace\data\patentmatch_test\patentmatch_test_no_claims.csv',
)
df = pd.read_csv(csv_file)

In [4]:
# Preview the first five rows of the loaded frame (rendered as the cell output)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,claim_id,patent_application_id,cited_document_id,text,text_b,label,date
0,5113165,5113165,111187_0,EP3157302A1,EP2903333,A network of handling a paging procedure in a ...,FIG.16 is a diagram illustrating an example of...,0,20170419
1,5658863,5658863,209068_1,EP3202314A1,EP2229880,A sensor information processing program for ca...,In a first step the fundamental movement frequ...,1,20170809
2,5584990,5584990,171472_0,EP3196007A1,EP2939828,A moulded trim part for a vehicle according to...,It was found that the thermoplastic polyuretha...,0,20170726
3,5137320,5137320,87572_0,EP3160147A1,EP1670252,A method for fast channel change characterized...,As to the issue of delivery modes the strategy...,0,20170426
4,5800528,5800528,204115_0,EP3217403A1,EP1855216,An audio asset information storage system comp...,Further it is assumed in the above circumstanc...,0,20170913


In [None]:
# Prepare the JSONL request files required by OpenAI
from functools import partial
import json
import logging
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
from tqdm import tqdm

# Configure logging
# Create this module's named logger, then set up the root logger at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Define the NegationResponse model.
# Its JSON schema (NegationResponse.model_json_schema()) is embedded in each
# request body built by create_jsonl_line as the response format.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated schema's
# "description", which would change the request payload.
class NegationResponse(BaseModel):
    # Presumably True when the analysed text contains a negation (inferred from the name).
    negation_present: bool
    # Optional list of strings, declared without a default; under pydantic v2
    # (implied by the model_json_schema() call in this notebook) the field is
    # therefore still required, but may be None.
    negation_types: Optional[List[str]]
    # Presumably a brief free-text justification (inferred from the name).
    short_explanation: str

# Create output directory (exist_ok: no error if it is already there)
output_dir = Path('output_jsonl')
output_dir.mkdir(exist_ok=True)

# Process in smaller batches
batch_size = 1000
# Ceiling division: number of batch_size-sized slices needed to cover every row.
# A plain `len(df) // batch_size + 1` would add a spurious, empty trailing batch
# whenever len(df) is an exact multiple of batch_size.
num_batches = (len(df) + batch_size - 1) // batch_size

def create_jsonl_line(row, column, model="gpt-4-turbo-preview", max_tokens=500):
    """Build one chat-completions request record for a single DataFrame row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) that provides ``column``
            as well as the ``patent_application_id`` and ``index`` fields,
            which are used to build the request id.
        column: Name of the field in ``row`` holding the text to analyse.
        model: Model name written into the request body.
        max_tokens: Completion token cap written into the request body.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys,
        ready to be serialised as one JSONL line.

    Raises:
        KeyError: If ``row`` lacks ``column``, ``patent_application_id`` or
            ``index``.
    """
    # NOTE(review): OpenAI documents "json_schema" structured outputs as
    # available from the GPT-4o generation onward; the default model predates
    # that. Confirm the default is accepted with this response_format, or pass
    # a newer model via the `model` argument.
    text = row[column]
    messages = [
        {"role": "system", "content": "Analyze the text for negations and identify their types."},
        {"role": "user", "content": f"Analyze the following text: {text}"}
    ]

    body = {
        "model": model,
        "messages": messages,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "negation_response",
                "schema": NegationResponse.model_json_schema()
            }
        },
        "max_tokens": max_tokens
    }

    return {
        # The id combines column, application id and the row's "index" field so
        # each response can be traced back to its source row.
        "custom_id": f'request_{column}_{row["patent_application_id"]}_{row["index"]}',
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body
    }

# Process batches: one JSONL file per slice of at most batch_size rows
batches_written = 0
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(df))
    batch_df = df.iloc[start_idx:end_idx]
    if batch_df.empty:
        # No rows in this slice (e.g. the batch count over-covers the frame,
        # or the frame itself is empty): do not write an empty request file.
        continue

    print(f"Processing batch {i + 1}/{num_batches}")
    lines = batch_df.apply(create_jsonl_line, axis=1, args=('text',))

    with open(output_dir / f"batch_{i}.jsonl", "w", encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file
        f.writelines(json.dumps(line, ensure_ascii=False) + "\n" for line in lines)
    batches_written += 1

# Show sample output
# Only read batch_0.jsonl if this run actually wrote it; otherwise the read
# would either fail or show a stale file left over from an earlier run.
if batches_written:
    print("\nSample output from first batch:")
    with open(output_dir / "batch_0.jsonl", "r", encoding='utf-8') as f:
        print(f.readline())
else:
    print("\nNo batches were written (the DataFrame is empty).")
