## Configure Azure Key Vault and OpenAI Credentials

Securely retrieve OpenAI API key from Azure Key Vault for authentication.
This ensures sensitive credentials are not hardcoded in the notebook.

In [None]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
import os

def get_openai_key():
    """Retrieve OpenAI API key from Azure Key Vault"""
    try:
        # Initialize the Azure credentials
        credential = DefaultAzureCredential()
        
        # Create a secret client
        vault_url = f"https://kvrunithesis.vault.azure.net/"
        secret_client = SecretClient(vault_url=vault_url, credential=credential)
        
        # Get the secret
        secret = secret_client.get_secret("alon-thesis-openai-key")
        
        # Set as environment variable
        os.environ["OPENAI_API_KEY"] = secret.value
        
        print("Successfully retrieved OpenAI API key from Azure Key Vault")
    except Exception as e:
        print(f"Error retrieving secret from Key Vault: {str(e)}")
        raise

# Retrieve and set the OpenAI API key
get_openai_key()

# Now you can initialize the OpenAI client which will automatically use the environment variable

Error retrieving secret from Key Vault: (Forbidden) Caller is not authorized to perform action on resource.
If role assignments, deny assignments or role definitions were changed recently, please observe propagation time.
Caller: appid=04b07795-8ddb-461a-bbee-02f9e1bf7b46;oid=ff0986ac-24f6-4899-99ab-1c4912044461;iss=https://sts.windows.net/ccf0ebdc-92f9-40ea-af1c-5b37d32cf8a8/
Action: 'Microsoft.KeyVault/vaults/secrets/getSecret/action'
Resource: '/subscriptions/8592e500-3312-4991-9d2a-2b97e43b1810/resourcegroups/rgrunithesis/providers/microsoft.keyvault/vaults/kvorgr/secrets/alon-thesis-openai-key'
Assignment: (not found)
DenyAssignmentId: null
DecisionReason: null 
Vault: kvorgr;location=eastus

Code: Forbidden
Message: Caller is not authorized to perform action on resource.
If role assignments, deny assignments or role definitions were changed recently, please observe propagation time.
Caller: appid=04b07795-8ddb-461a-bbee-02f9e1bf7b46;oid=ff0986ac-24f6-4899-99ab-1c4912044461;iss=ht

HttpResponseError: (Forbidden) Caller is not authorized to perform action on resource.
If role assignments, deny assignments or role definitions were changed recently, please observe propagation time.
Caller: appid=04b07795-8ddb-461a-bbee-02f9e1bf7b46;oid=ff0986ac-24f6-4899-99ab-1c4912044461;iss=https://sts.windows.net/ccf0ebdc-92f9-40ea-af1c-5b37d32cf8a8/
Action: 'Microsoft.KeyVault/vaults/secrets/getSecret/action'
Resource: '/subscriptions/8592e500-3312-4991-9d2a-2b97e43b1810/resourcegroups/rgrunithesis/providers/microsoft.keyvault/vaults/kvorgr/secrets/alon-thesis-openai-key'
Assignment: (not found)
DenyAssignmentId: null
DecisionReason: null 
Vault: kvorgr;location=eastus

Code: Forbidden
Message: Caller is not authorized to perform action on resource.
If role assignments, deny assignments or role definitions were changed recently, please observe propagation time.
Caller: appid=04b07795-8ddb-461a-bbee-02f9e1bf7b46;oid=ff0986ac-24f6-4899-99ab-1c4912044461;iss=https://sts.windows.net/ccf0ebdc-92f9-40ea-af1c-5b37d32cf8a8/
Action: 'Microsoft.KeyVault/vaults/secrets/getSecret/action'
Resource: '/subscriptions/8592e500-3312-4991-9d2a-2b97e43b1810/resourcegroups/rgrunithesis/providers/microsoft.keyvault/vaults/kvorgr/secrets/alon-thesis-openai-key'
Assignment: (not found)
DenyAssignmentId: null
DecisionReason: null 
Vault: kvorgr;location=eastus

Inner error: {
    "code": "ForbiddenByRbac"
}

In [None]:
import pandas as pd

# Load the test data
csv_file = r'C:\Users\orgrd\workspace\data\patentmatch_test\patentmatch_test_no_claims.csv'
df = pd.read_csv(csv_file)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,claim_id,patent_application_id,cited_document_id,text,text_b,label,date
0,5113165,5113165,111187_0,EP3157302A1,EP2903333,A network of handling a paging procedure in a ...,FIG.16 is a diagram illustrating an example of...,0,20170419
1,5658863,5658863,209068_1,EP3202314A1,EP2229880,A sensor information processing program for ca...,In a first step the fundamental movement frequ...,1,20170809
2,5584990,5584990,171472_0,EP3196007A1,EP2939828,A moulded trim part for a vehicle according to...,It was found that the thermoplastic polyuretha...,0,20170726
3,5137320,5137320,87572_0,EP3160147A1,EP1670252,A method for fast channel change characterized...,As to the issue of delivery modes the strategy...,0,20170426
4,5800528,5800528,204115_0,EP3217403A1,EP1855216,An audio asset information storage system comp...,Further it is assumed in the above circumstanc...,0,20170913


## Prepare JSONL Files for OpenAI Processing

This section prepares the data for batch processing with OpenAI's API. Here's what we're doing:

1. **Setup**: Import required libraries and configure logging
2. **Data Model**: Define a Pydantic model `NegationResponse` to validate OpenAI's responses
3. **Batch Processing**: 
   - Split data into batches of 1000 rows each
   - Create JSONL files with proper OpenAI API format
   - Each line contains:
     - Custom ID for tracking
     - API endpoint
     - Request body with messages and response format
4. **Output**: Save batches as separate JSONL files in `output_jsonl` directory

The JSONL format is required for OpenAI's batch processing endpoint.

In [None]:
# Prepare the openai required jsonl files
from functools import partial
import json
import logging
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the NegationResponse model
class NegationResponse(BaseModel):
    negation_present: bool
    negation_types: Optional[List[str]]
    short_explanation: str

# Create output directory
output_dir = Path('output_jsonl')
output_dir.mkdir(exist_ok=True)

# Process in smaller batches
batch_size = 1000
num_batches = len(df) // batch_size + 1

def create_jsonl_line(row, column):
    text = row[column]
    messages = [
        {"role": "system", "content": "Analyze the text for negations and identify their types."},
        {"role": "user", "content": f"Analyze the following text: {text}"}
    ]
    
    body = {
        "model": "gpt-4-turbo-preview",
        "messages": messages,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "negation_response",
                "schema": NegationResponse.model_json_schema()
            }
        },
        "max_tokens": 500
    }
    
    return {
        "custom_id": f'request_{column}_{row["patent_application_id"]}_{row["index"]}',
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body
    }

# Process batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    batch_df = df.iloc[start_idx:end_idx]
    
    print(f"Processing batch {i + 1}/{num_batches}")
    lines = batch_df.apply(create_jsonl_line, axis=1, args=('text',))
    
    with open(output_dir / f"batch_{i}.jsonl", "w", encoding='utf-8') as f:
        for line in lines:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")

# Show sample output
print("\nSample output from first batch:")
with open(output_dir / "batch_0.jsonl", "r", encoding='utf-8') as f:
    print(f.readline())


Processing batch 1/373
Processing batch 2/373
Processing batch 3/373
Processing batch 4/373
Processing batch 5/373
Processing batch 6/373
Processing batch 7/373
Processing batch 8/373
Processing batch 9/373
Processing batch 10/373
Processing batch 11/373
Processing batch 12/373
Processing batch 13/373
Processing batch 14/373
Processing batch 15/373
Processing batch 16/373
Processing batch 17/373
Processing batch 18/373
Processing batch 19/373
Processing batch 20/373
Processing batch 21/373
Processing batch 22/373
Processing batch 23/373
Processing batch 24/373
Processing batch 25/373
Processing batch 26/373
Processing batch 27/373
Processing batch 28/373
Processing batch 29/373
Processing batch 30/373
Processing batch 31/373
Processing batch 32/373
Processing batch 33/373
Processing batch 34/373
Processing batch 35/373
Processing batch 36/373
Processing batch 37/373
Processing batch 38/373
Processing batch 39/373
Processing batch 40/373
Processing batch 41/373
Processing batch 42/373
P

In [None]:
import os
import json
import time
from openai import OpenAI
from datetime import datetime

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Create a directory for tracking OpenAI file metadata
tracking_dir = output_dir / "tracking"
tracking_dir.mkdir(exist_ok=True)

def upload_and_track_file(jsonl_path):
    """Upload a JSONL file to OpenAI and track its metadata"""
    # Upload file
    with open(jsonl_path, 'rb') as file:
        response = client.files.create(
            file=file,
            purpose='fine-tune'
        )
        
    # Create tracking info
    tracking_info = {
        "file_id": response.id,
        "original_filename": jsonl_path.name,
        "status": response.status,
        "created_at": response.created_at,
        "upload_timestamp": datetime.now().isoformat(),
        "bytes": response.bytes,
        "purpose": response.purpose
    }
    
    # Save tracking info
    tracking_file = tracking_dir / f"{response.id}_metadata.json"
    with open(tracking_file, 'w') as f:
        json.dump(tracking_info, f, indent=2)
        
    return tracking_info

# Upload all batch files and track their metadata
print("Uploading files to OpenAI...")
uploaded_files = []

for batch_file in sorted(output_dir.glob("batch_*.jsonl")):
    print(f"Uploading {batch_file.name}...")
    try:
        tracking_info = upload_and_track_file(batch_file)
        uploaded_files.append(tracking_info)
        print(f"Successfully uploaded {batch_file.name} - File ID: {tracking_info['file_id']}")
        
        # Wait briefly to avoid rate limits
        time.sleep(1)
    except Exception as e:
        print(f"Error uploading {batch_file.name}: {str(e)}")

# Save summary of all uploads
summary_file = tracking_dir / "upload_summary.json"
with open(summary_file, 'w') as f:
    json.dump({
        "upload_timestamp": datetime.now().isoformat(),
        "total_files": len(uploaded_files),
        "files": uploaded_files
    }, f, indent=2)

print(f"\nUploaded {len(uploaded_files)} files to OpenAI")
print(f"Tracking information saved to {tracking_dir}")


Uploading files to OpenAI...
Uploading batch_0.jsonl...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 401 Unauthorized"


Error uploading batch_0.jsonl: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-12345***********************cdef. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Uploading batch_1.jsonl...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 401 Unauthorized"


Error uploading batch_1.jsonl: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-12345***********************cdef. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Uploading batch_10.jsonl...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 401 Unauthorized"


Error uploading batch_10.jsonl: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-12345***********************cdef. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Uploading batch_100.jsonl...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 401 Unauthorized"


Error uploading batch_100.jsonl: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-12345***********************cdef. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Uploading batch_101.jsonl...


KeyboardInterrupt: 

## Process OpenAI Batch Requests

This section handles batch processing with OpenAI, including:
1. Reading file IDs from tracking directory
2. Managing batch submissions (max 50 concurrent batches)
3. Tracking progress and handling errors
4. Retrying failed requests
5. Saving results as they arrive

In [None]:
import json
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential

# Constants
MAX_CONCURRENT_BATCHES = 50
MAX_RETRIES = 3
BATCH_CHECK_INTERVAL = 300  # 5 minutes
BATCH_TIMEOUT = timedelta(hours=24)

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

class BatchProcessor:
    def __init__(self, tracking_dir: Path):
        self.tracking_dir = tracking_dir
        self.results_dir = tracking_dir / "results"
        self.results_dir.mkdir(exist_ok=True)
        self.state_file = tracking_dir / "batch_state.json"
        self.load_state()

    def load_state(self):
        """Load or initialize batch processing state"""
        if self.state_file.exists():
            with open(self.state_file, 'r') as f:
                self.state = json.load(f)
        else:
            self.state = {
                "completed": {},
                "in_progress": {},
                "failed": {},
                "not_started": {},
                "last_updated": datetime.now().isoformat()
            }
            self.save_state()

    def save_state(self):
        """Save current batch processing state"""
        with open(self.state_file, 'w') as f:
            json.dump(self.state, f, indent=2)

    @retry(stop=stop_after_attempt(MAX_RETRIES), wait=wait_exponential(multiplier=1, min=4, max=60))
    def submit_batch(self, file_id: str) -> str:
        """Submit a batch request with retry logic"""
        try:
            response = client.completions.create(
                model="gpt-4-turbo-preview",
                file=file_id,
                purpose="negation-detection"
            )
            batch_id = response.id
            self.state["in_progress"][batch_id] = {
                "file_id": file_id,
                "start_time": datetime.now().isoformat(),
                "status": "submitted"
            }
            self.save_state()
            return batch_id
        except Exception as e:
            print(f"Error submitting batch for file {file_id}: {str(e)}")
            raise

    def check_batch_status(self, batch_id: str) -> dict:
        """Check status of a batch request"""
        try:
            response = client.completions.retrieve(batch_id)
            return {
                "status": response.status,
                "completed_at": response.completed_at if hasattr(response, 'completed_at') else None,
                "error": response.error if hasattr(response, 'error') else None
            }
        except Exception as e:
            print(f"Error checking batch {batch_id}: {str(e)}")
            return {"status": "error", "error": str(e)}

    def process_completed_batch(self, batch_id: str):
        """Process and save results from a completed batch"""
        try:
            results = client.completions.retrieve(batch_id)
            
            # Save results
            result_file = self.results_dir / f"batch_{batch_id}_results.json"
            with open(result_file, 'w') as f:
                json.dump(results.dict(), f, indent=2)
            
            # Update state
            self.state["completed"][batch_id] = self.state["in_progress"].pop(batch_id)
            self.state["completed"][batch_id]["completed_at"] = datetime.now().isoformat()
            self.save_state()
            
        except Exception as e:
            print(f"Error processing results for batch {batch_id}: {str(e)}")
            self.state["failed"][batch_id] = self.state["in_progress"].pop(batch_id)
            self.state["failed"][batch_id]["error"] = str(e)
            self.save_state()

    def run(self):
        """Main processing loop"""
        print("Starting batch processing...")
        
        # Load file IDs if not_started is empty
        if not self.state["not_started"]:
            summary_file = self.tracking_dir / "upload_summary.json"
            with open(summary_file, 'r') as f:
                summary = json.load(f)
                for file_info in summary["files"]:
                    file_id = file_info["file_id"]
                    if file_id not in (self.state["completed"] | self.state["in_progress"] | self.state["failed"]):
                        self.state["not_started"][file_id] = {"status": "pending"}
        
        while True:
            try:
                # Check in-progress batches
                for batch_id in list(self.state["in_progress"].keys()):
                    status = self.check_batch_status(batch_id)
                    
                    if status["status"] == "completed":
                        self.process_completed_batch(batch_id)
                    elif status["status"] == "failed" or status["error"]:
                        file_id = self.state["in_progress"][batch_id]["file_id"]
                        self.state["failed"][batch_id] = self.state["in_progress"].pop(batch_id)
                        self.state["not_started"][file_id] = {"status": "pending"}
                        self.save_state()
                
                # Submit new batches if under limit
                while len(self.state["in_progress"]) < MAX_CONCURRENT_BATCHES and self.state["not_started"]:
                    file_id, info = self.state["not_started"].popitem()
                    try:
                        batch_id = self.submit_batch(file_id)
                        print(f"Submitted batch for file {file_id} -> batch {batch_id}")
                    except Exception as e:
                        print(f"Failed to submit batch for file {file_id}: {str(e)}")
                        self.state["not_started"][file_id] = info
                
                # Check if we're done
                if not self.state["in_progress"] and not self.state["not_started"]:
                    print("All batches completed!")
                    break
                
                # Save state and wait
                self.save_state()
                time.sleep(BATCH_CHECK_INTERVAL)
                
            except KeyboardInterrupt:
                print("\nProcessing paused. Progress saved.")
                break
            except Exception as e:
                print(f"Unexpected error: {str(e)}")
                time.sleep(BATCH_CHECK_INTERVAL)

# Create and run the batch processor
processor = BatchProcessor(tracking_dir)
processor.run()

# Print summary
print("\nProcessing Summary:")
print(f"Completed: {len(processor.state['completed'])} batches")
print(f"Failed: {len(processor.state['failed'])} batches")
print(f"Remaining: {len(processor.state['not_started'])} batches")
