# Contract Clause Classification System using DSPy

This notebook implements an advanced contract clause classification system using DSPy and Azure OpenAI. The system employs chain-of-thought reasoning and zero-shot learning to identify specific clauses in legal contracts.

## Zero-Shot Learning with Chain-of-Thought
This implementation uses:
1. Chain-of-thought reasoning for detailed analysis
2. Zero-shot learning without requiring training examples
3. Legal-specific prompting with weighted metrics
4. MiPro optimization for improved performance

In [64]:
import os 
import dspy
import pandas as pd
from dotenv import dotenv_values, load_dotenv
from openai import AzureOpenAI

# Load environment variables
load_dotenv()

# Azure OpenAI Configuration
azure_endpoint = os.getenv("AZURE_OPENAI_API_EASTUS_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_EASTUS_API_KEY")
deployment = 'gpt-4o-mini-eastus-0718'

# Initialize DSPy with Azure OpenAI
turbo = dspy.AzureOpenAI(
    api_key=api_key,
    api_version="2024-06-01",
    api_base=azure_endpoint,
    model=deployment
)

# Configure DSPy
dspy.configure(lm=turbo)

In [65]:
class ContractAnalyzer(dspy.Signature):
    context = dspy.InputField(desc="The contract text to analyze")
    analysis = dspy.OutputField(desc="Step-by-step analysis of contract structure")
    conclusion = dspy.OutputField(desc="Final summary of identified sections")

class ChainOfThoughtClassifier(dspy.Signature):
    context = dspy.InputField(desc="The contract text to classify")
    clause_type = dspy.InputField(desc="The type of clause to identify")
    reasoning = dspy.OutputField(desc="Step-by-step legal analysis of the clause presence")
    decision = dspy.OutputField(desc="Final classification (Present/Absent) with justification")

In [66]:
class ContractPipeline(dspy.Module):
    def __init__(self):
        super().__init__()
        self.analyzer = dspy.Predict(ContractAnalyzer)
        self.classifier = dspy.Predict(ChainOfThoughtClassifier)
    
    def forward(self, contract_text, clause_type):
        # First, analyze contract structure
        analysis = self.analyzer(context=contract_text)
        
        # Then perform classification with reasoning
        result = self.classifier(
            context=contract_text,
            clause_type=clause_type
        )
        
        return result.decision, result.reasoning

In [77]:
from dspy import Example
from dspy.teleprompt import MIPROv2

class MIPROv2Optimizer:
    def __init__(self, model):
        self.model = model

    def optimize(self, pipeline):
        # Define metric function BEFORE using it
        def metric_fn(example, pred, trace=None):
            correct = int(pred['decision'].lower() == example.outputs['gold'].lower())
            reasoning_quality = len(pred['reasoning'].split('.')) / 5.0  # Normalize by expected length
            reasoning_quality = min(1.0, reasoning_quality)
            return 0.7 * correct + 0.3 * reasoning_quality

        # Initialize MIPROv2 optimizer for zero-shot optimization
        optimizer = MIPROv2(
            metric=metric_fn,
            auto="light",            # Choose between "light", "medium", or "heavy"
            max_bootstrapped_demos=0,
            max_labeled_demos=0,     # Zero-shot learning
        )

        # Wrapper function to convert dictionaries to Example instances
        def create_examples(valset_dicts):
            examples = []
            for example_dict in valset_dicts:
                example = Example().with_inputs({
                    'context': example_dict['context'],
                    'clause_type': example_dict['clause_type']
                }).with_outputs({
                    'gold': example_dict['gold']
                })
                examples.append(example)
            return examples

        # Load and prepare validation data
        import pandas as pd
        df = pd.read_csv('contracts_advanced/contract_labels.csv')
        df.columns = [col.lower() for col in df.columns]  # If lowercase is necessary
        valset = []
        for _, row in df.iterrows():
            with open(f"{row['filename']}", 'r') as f:
                contract_text = f.read()
            for clause, column in {
                "Non-Disclosure Agreement (NDA) clause": "contains_nda",
                "Termination Clause": "contains_termination",
                "Indemnity Clause": "contains_indemnity",
                "Force Majeure Clause": "contains_force_majeure",
                "Data Protection Clause": "contains_data_protection"
            }.items():
                if row[column] != "Unknown":
                    valset.append({
                        'context': contract_text,
                        'clause_type': clause,
                        'gold': row[column]
                    })

        # Use a small subset of valset as trainset
        trainset = valset[:5]  # Use the first 5 examples as the training set

        # Convert the valset and trainset to DSPy Example objects
        valset = create_examples(valset)
        trainset = create_examples(trainset)

        # Perform optimization with MIPROv2
        optimized_pipeline = optimizer.compile(
            student=pipeline,
            trainset=trainset,
            valset=valset,
            requires_permission_to_run=False
        )

        return optimized_pipeline


In [78]:
# Instantiate and optimize the pipeline
pipeline = ContractPipeline()
optimizer = MIPROv2Optimizer(deployment)
optimized_pipeline = optimizer.optimize(pipeline)

# Test with model deployments
deployments = {
    "gpt-4o-mini": "gpt-4o-mini-eastus-0718",
    "gpt-4o": "gpt-4o-eastus-0806"
}

for model_name, deployment_name in deployments.items():
    print(f"\nTesting with {model_name}...")

    # Configure DSPy model
    turbo = dspy.AzureOpenAI(
        api_key=api_key,
        api_version="2024-06-01",
        api_base=azure_endpoint,
        model=deployment_name
    )
    dspy.configure(lm=turbo)

    # Test classification
    with open(df['filename'].iloc[0], 'r') as f:
        test_contract = f.read()

    for clause in ["Non-Disclosure Agreement (NDA) clause", "Termination Clause"]:
        result = optimized_pipeline(test_contract, clause)
        print(f"\n{clause}")
        print(f"Classification: {result['decision']}")
        print(f"Reasoning: {result['reasoning']}")


TypeError: Example.with_inputs() got an unexpected keyword argument 'input1'