## Configure Azure Key Vault and OpenAI Credentials

Securely retrieve OpenAI API key from Azure Key Vault for authentication.
This ensures sensitive credentials are not hardcoded in the notebook.

In [1]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
import os

def get_openai_key():
    """Retrieve OpenAI API key from Azure Key Vault"""
    try:
        # Initialize the Azure credentials
        credential = DefaultAzureCredential()
        
        # Create a secret client
        vault_url = f"https://kvrunithesis.vault.azure.net/"
        secret_client = SecretClient(vault_url=vault_url, credential=credential)
        
        # Get the secret
        secret = secret_client.get_secret("alon-thesis-openai-key")
        
        # Set as environment variable
        os.environ["OPENAI_API_KEY"] = secret.value
        
        print("Successfully retrieved OpenAI API key from Azure Key Vault")
    except Exception as e:
        print(f"Error retrieving secret from Key Vault: {str(e)}")
        raise

# Retrieve and set the OpenAI API key
get_openai_key()

# Now you can initialize the OpenAI client which will automatically use the environment variable

Successfully retrieved OpenAI API key from Azure Key Vault


In [2]:
import pandas as pd

# Load the test data
csv_file = r'C:\Users\orgrd\workspace\data\patentmatch_test\patentmatch_test_no_claims.csv'
df = pd.read_csv(csv_file)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,claim_id,patent_application_id,cited_document_id,text,text_b,label,date
0,5113165,5113165,111187_0,EP3157302A1,EP2903333,A network of handling a paging procedure in a ...,FIG.16 is a diagram illustrating an example of...,0,20170419
1,5658863,5658863,209068_1,EP3202314A1,EP2229880,A sensor information processing program for ca...,In a first step the fundamental movement frequ...,1,20170809
2,5584990,5584990,171472_0,EP3196007A1,EP2939828,A moulded trim part for a vehicle according to...,It was found that the thermoplastic polyuretha...,0,20170726
3,5137320,5137320,87572_0,EP3160147A1,EP1670252,A method for fast channel change characterized...,As to the issue of delivery modes the strategy...,0,20170426
4,5800528,5800528,204115_0,EP3217403A1,EP1855216,An audio asset information storage system comp...,Further it is assumed in the above circumstanc...,0,20170913


## Prepare JSONL Files for OpenAI Processing

This section prepares the data for batch processing with OpenAI's API. Here's what we're doing:

1. **Setup**: Import required libraries and configure logging
2. **Data Model**: Define a Pydantic model `NegationResponse` to validate OpenAI's responses
3. **Batch Processing**: 
   - Split data into batches of 1000 rows each
   - Create JSONL files with proper OpenAI API format
   - Each line contains:
     - Custom ID for tracking
     - API endpoint
     - Request body with messages and response format
4. **Output**: Save batches as separate JSONL files in `output_jsonl` directory

The JSONL format is required for OpenAI's batch processing endpoint.

In [4]:
# Prepare the openai required jsonl files
from functools import partial
import json
import logging
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the NegationResponse model
class NegationResponse(BaseModel):
    negation_present: bool
    negation_types: Optional[List[str]]
    short_explanation: str

# Create output directory
output_dir = Path('output_jsonl')
output_dir.mkdir(exist_ok=True)

# Process in smaller batches
batch_size = 1000
num_batches = len(df) // batch_size + 1

def create_jsonl_line(row, column):
    text = row[column]
    messages = [
        {"role": "system", "content": "Analyze the text for negations and identify their types."},
        {"role": "user", "content": f"Analyze the following text: {text}"}
    ]
    
    body = {
        "model": "gpt-4-turbo-preview",
        "messages": messages,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "negation_response",
                "schema": NegationResponse.model_json_schema()
            }
        },
        "max_tokens": 500
    }
    
    return {
        "custom_id": f'request_{column}_{row["patent_application_id"]}_{row["index"]}',
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body
    }

# Process batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    batch_df = df.iloc[start_idx:end_idx]
    
    print(f"Processing batch {i + 1}/{num_batches}")
    lines = batch_df.apply(create_jsonl_line, axis=1, args=('text',))
    
    with open(output_dir / f"batch_{i}.jsonl", "w", encoding='utf-8') as f:
        for line in lines:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")

# Show sample output
print("\nSample output from first batch:")
with open(output_dir / "batch_0.jsonl", "r", encoding='utf-8') as f:
    print(f.readline())


Processing batch 1/373
Processing batch 2/373
Processing batch 3/373
Processing batch 4/373
Processing batch 5/373
Processing batch 6/373
Processing batch 7/373
Processing batch 8/373
Processing batch 9/373
Processing batch 10/373
Processing batch 11/373
Processing batch 12/373
Processing batch 13/373
Processing batch 14/373
Processing batch 15/373
Processing batch 16/373
Processing batch 17/373
Processing batch 18/373
Processing batch 19/373
Processing batch 20/373
Processing batch 21/373
Processing batch 22/373
Processing batch 23/373
Processing batch 24/373
Processing batch 25/373
Processing batch 26/373
Processing batch 27/373
Processing batch 28/373
Processing batch 29/373
Processing batch 30/373
Processing batch 31/373
Processing batch 32/373
Processing batch 33/373
Processing batch 34/373
Processing batch 35/373
Processing batch 36/373
Processing batch 37/373
Processing batch 38/373
Processing batch 39/373
Processing batch 40/373
Processing batch 41/373
Processing batch 42/373
P

In [None]:
import os
import json
import asyncio
import aiohttp
from pathlib import Path
from datetime import datetime
import openai

# Set your OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

# Define the directory containing your JSONL batch files
batch_files_dir = Path('path/to/your/batch_files')

# Create a directory for tracking OpenAI file metadata
tracking_dir = batch_files_dir / "tracking"
tracking_dir.mkdir(parents=True, exist_ok=True)

async def upload_and_track_file(session, jsonl_path):
    """Upload a JSONL file to OpenAI and track its metadata asynchronously."""
    url = 'https://api.openai.com/v1/files'
    headers = {
        'Authorization': f"Bearer {openai.api_key}",
    }
    form_data = aiohttp.FormData()
    form_data.add_field('purpose', 'batch')  # Set purpose to 'batch' for batch processing
    form_data.add_field('file', jsonl_path.open('rb'), filename=jsonl_path.name, content_type='application/jsonl')

    async with session.post(url, headers=headers, data=form_data) as response:
        if response.status == 200:
            response_json = await response.json()
            tracking_info = {
                "file_id": response_json['id'],
                "original_filename": jsonl_path.name,
                "status": response_json['status'],
                "created_at": response_json['created_at'],
                "upload_timestamp": datetime.now().isoformat(),
                "bytes": response_json['bytes'],
                "purpose": response_json['purpose']
            }
            # Save tracking info
            tracking_file = tracking_dir / f"{response_json['id']}_metadata.json"
            with open(tracking_file, 'w') as f:
                json.dump(tracking_info, f, indent=2)
            return tracking_info
        else:
            error_text = await response.text()
            print(f"Error uploading {jsonl_path.name}: {error_text}")
            return None

async def main_2():
    """Main function to upload new batch files asynchronously."""
    print("\nUploading new batch files to OpenAI...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for batch_file in sorted(batch_files_dir.glob("batch_*.jsonl")):
            print(f"Uploading {batch_file.name}...")
            tasks.append(upload_and_track_file(session, batch_file))
            await asyncio.sleep(1)  # Brief delay to avoid rate limits

        uploaded_files = await asyncio.gather(*tasks)

        # Filter out None values in case of upload failures
        uploaded_files = [file for file in uploaded_files if file]

        # Save summary of all uploads
        summary_file = tracking_dir / "upload_summary.json"
        with open(summary_file, 'w') as f:
            json.dump({
                "upload_timestamp": datetime.now().isoformat(),
                "total_files": len(uploaded_files),
                "files": uploaded_files
            }, f, indent=2)

        print(f"\nUploaded {len(uploaded_files)} files to OpenAI")
        print(f"Tracking information saved to {tracking_dir}")

await main_2()

Uploading batch_0.jsonl...
Uploading batch_1.jsonl...
Uploading batch_10.jsonl...
Uploading batch_100.jsonl...
Uploading batch_101.jsonl...
Uploading batch_102.jsonl...
Uploading batch_103.jsonl...
Uploading batch_104.jsonl...
Uploading batch_105.jsonl...
Uploading batch_106.jsonl...
Uploading batch_107.jsonl...
Uploading batch_108.jsonl...
Uploading batch_109.jsonl...
Uploading batch_11.jsonl...
Uploading batch_110.jsonl...
Uploading batch_111.jsonl...
Uploading batch_112.jsonl...
Uploading batch_113.jsonl...
Uploading batch_114.jsonl...
Uploading batch_115.jsonl...
Uploading batch_116.jsonl...
Uploading batch_117.jsonl...
Uploading batch_118.jsonl...
Uploading batch_119.jsonl...
Uploading batch_12.jsonl...
Uploading batch_120.jsonl...
Uploading batch_121.jsonl...
Uploading batch_122.jsonl...
Uploading batch_123.jsonl...
Uploading batch_124.jsonl...
Uploading batch_125.jsonl...
Uploading batch_126.jsonl...
Uploading batch_127.jsonl...
Uploading batch_128.jsonl...
Uploading batch_129.j

TypeError: ClientSession._request() got an unexpected keyword argument 'files'

## Process OpenAI Batch Requests

This section handles batch processing with OpenAI, including:
1. Reading file IDs from tracking directory
2. Managing batch submissions (max 50 concurrent batches)
3. Tracking progress and handling errors
4. Retrying failed requests
5. Saving results as they arrive

In [None]:
import os
import json
import asyncio
import aiohttp
from pathlib import Path
from datetime import datetime

# Define the output directory
output_dir = Path('output_jsonl')

# Create a directory for tracking OpenAI file metadata
tracking_dir = output_dir / "tracking"
tracking_dir.mkdir(parents=True, exist_ok=True)

async def upload_and_track_file(session, jsonl_path):
    """Upload a JSONL file to OpenAI and track its metadata asynchronously."""
    url = 'https://api.openai.com/v1/files'
    headers = {
        'Authorization': f"Bearer {os.getenv('OPENAI_API_KEY')}",
    }
    data = aiohttp.FormData()
    data.add_field('purpose', 'fine-tune')  # 'fine-tune' is the current valid purpose
    data.add_field('file', open(jsonl_path, 'rb'), filename=jsonl_path.name, content_type='application/jsonl')

    async with session.post(url, headers=headers, data=data) as response:
        if response.status == 200:
            response_json = await response.json()
            tracking_info = {
                "file_id": response_json['id'],
                "original_filename": jsonl_path.name,
                "status": response_json['status'],
                "created_at": response_json['created_at'],
                "upload_timestamp": datetime.now().isoformat(),
                "bytes": response_json['bytes'],
                "purpose": response_json['purpose']
            }
            # Save tracking info
            tracking_file = tracking_dir / f"{response_json['id']}_metadata.json"
            with open(tracking_file, 'w') as f:
                json.dump(tracking_info, f, indent=2)
            return tracking_info
        else:
            error_text = await response.text()
            print(f"Error uploading {jsonl_path.name}: {error_text}")
            return None

async def main_2():
    """Main function to upload all batch files asynchronously."""
    async with aiohttp.ClientSession() as session:
        tasks = []
        for batch_file in sorted(output_dir.glob("batch_*.jsonl")):
            print(f"Uploading {batch_file.name}...")
            tasks.append(upload_and_track_file(session, batch_file))
            await asyncio.sleep(1)  # Brief delay to avoid rate limits

        uploaded_files = await asyncio.gather(*tasks)

        # Filter out None values in case of upload failures
        uploaded_files = [file for file in uploaded_files if file]

        # Save summary of all uploads
        summary_file = tracking_dir / "upload_summary.json"
        with open(summary_file, 'w') as f:
            json.dump({
                "upload_timestamp": datetime.now().isoformat(),
                "total_files": len(uploaded_files),
                "files": uploaded_files
            }, f, indent=2)

        print(f"\nUploaded {len(uploaded_files)} files to OpenAI")
        print(f"Tracking information saved to {tracking_dir}")

# Run the main function within the existing event loop
await main_2()
