In [1]:
#pip install --upgrade huggingface_hub

In [39]:
# !pip install -U accelerate
# !pip install -U transformers
# !pip install openai==0.28
# !pip install datasets

In [1]:
import os
# import subprocess
# from huggingface_hub import hf_hub_download

from datasets import load_dataset, Dataset
#from google.colab import drive
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
import openai
import numpy as np
import pandas as pd

In [2]:
os.environ["HF_TOKEN"] = ""
os.environ["OPENAI_API_KEY"] = ""
# env_vars = os.environ
# for key, value in env_vars.items():
#     print(f"{key}: {value}")

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cpu


In [4]:
openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
dataset = load_dataset("nguha/legalbench", 'cuad_audit_rights', trust_remote_code=True)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'index', 'text', 'document_name'],
        num_rows: 6
    })
    test: Dataset({
        features: ['answer', 'index', 'text', 'document_name'],
        num_rows: 1216
    })
})

In [7]:
# Convert to DataFrame for easier handling
df_train = pd.DataFrame(dataset['test'])
df_test = pd.DataFrame(dataset['train'])

# Preprocess text
df_train['cleaned_text'] = df_train['text'].apply(lambda text: text.strip().lower())
df_test['cleaned_text'] = df_test['text'].apply(lambda text: text.strip().lower())

#To split the data better
df_combined = pd.concat([df_train, df_test])
df_combined.drop(columns=['index'])

# Shuffle the data
df_combined_shuffled = df_combined.sample(frac=1).reset_index(drop=True)
df_combined_shuffled.drop(columns=['index'])

# Split the data into training, validation, and test sets
train_data, test_data = train_test_split(df_combined_shuffled, test_size=0.2, stratify = df_combined_shuffled['answer'])
val_data, test_data = train_test_split(test_data, test_size=0.4, stratify = test_data['answer'])

print(f"Training set size: {train_data.shape}")
print(f"Validation set size: {val_data.shape}")
print(f"Test set size: {test_data.shape}")

Training set size: (977, 5)
Validation set size: (147, 5)
Test set size: (98, 5)


In [10]:
# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=2)
model.to(device)

# Tokenize the inputs
train_encodings = tokenizer(train_data['cleaned_text'].tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_data['cleaned_text'].tolist(), truncation=True, padding=True, max_length=512)

# Convert labels to tensor
train_labels = torch.tensor(train_data['answer'].apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())
val_labels = torch.tensor(val_data['answer'].apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Create dataset class
class LegalDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = LegalDataset(train_encodings, train_labels)
val_dataset = LegalDataset(val_encodings, val_labels)

In [12]:
def train_and_evaluate(train_texts, train_labels, val_texts, val_labels, model, tokenizer):

    # Tokenize the inputs
    train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
    val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512)

    # Convert labels to tensor
    train_labels = torch.tensor(train_labels.apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())
    val_labels = torch.tensor(val_labels.apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())

    # Create datasets
    train_dataset = LegalDataset(train_encodings, train_labels)
    val_dataset = LegalDataset(val_encodings, val_labels)

    training_args = TrainingArguments(
        output_dir='/results',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch"
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        eval_dataset = val_dataset,
    )

    trainer.train()
    predictions = trainer.predict(val_dataset)
    preds = predictions.predictions.argmax(-1)
    labels = predictions.label_ids

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)

    return accuracy, precision, recall, f1

In [11]:
# Note:
# We used k-fold cross-validation to ensure that the model's performance is robust and generalizable.
# K-fold cross-validation splits the data into 'k' subsets, trains the model on 'k-1' subsets, and validates it on the remaining subset.
# This process is repeated 'k' times with different subsets as the validation set each time.
# By averaging the results, we get a more reliable estimate of the model's performance and reduce the risk of overfitting,
# as the model is validated on different data segments in each fold.

kf = StratifiedKFold(n_splits=5)
accuracies, precisions, recalls, f1s = [], [], [], []
texts = train_data['cleaned_text']
labels = train_data['answer']
i=0
for train_index, val_index in kf.split(texts, labels):
    train_texts = texts[texts.index.isin(train_index)]
    val_texts = texts[texts.index.isin(val_index)]
    train_labels = labels[labels.index.isin(train_index)]
    val_labels = labels[labels.index.isin(val_index)]

    #Initializing a new model
    model = BertForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=2)
    tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
    model.to(device)

    accuracy, precision, recall, f1 = train_and_evaluate(train_texts, train_labels, val_texts, val_labels, model, tokenizer)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

    model.save_pretrained('fine-tuned-legal-bert-fold'+str(i))
    tokenizer.save_pretrained('fine-tuned-legal-bert-fold'+str(i))
    i+=1

# Print average metrics
print(f"Average Accuracy: {sum(accuracies) / len(accuracies)}")
print(f"Average Precision: {sum(precisions) / len(precisions)}")
print(f"Average Recall: {sum(recalls) / len(recalls)}")
print(f"Average F1 Score: {sum(f1s) / len(f1s)}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1129,0.115321
2,0.2319,0.081328
3,0.0643,0.030517


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0771,0.023682
2,0.0011,0.001537
3,0.0002,0.00159


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1084,0.013612
2,0.0008,0.112874
3,0.1696,0.091538


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1033,0.069218
2,0.013,0.071755
3,0.0004,0.145785


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1801,0.046897
2,0.0007,0.16164
3,0.0054,0.227348


Average Accuracy: 0.9832269069518975
Average Precision: 0.9796253273141206
Average Recall: 0.9872941176470589
Average F1 Score: 0.9832096174486324


In [13]:
# Define the test function
def test_model(test_texts, test_labels, model_path, tokenizer_path):
    # Load the fine-tuned model and tokenizer
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    model.to(device)

    # Tokenize the test texts
    test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

    # Convert labels to tensor
    test_labels_tensor = torch.tensor(test_labels.apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())

    # Create a test dataset
    test_dataset = LegalDataset(test_encodings, test_labels_tensor)

    # Create a DataLoader for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    # Evaluate the model
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['labels'].cpu().numpy())
            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)

    return accuracy, precision, recall, f1

In [14]:
# Load your test data
test_texts = test_data['cleaned_text']
test_labels = test_data['answer']

# Iterate over the saved models and evaluate them
for i in range(5):  # Assuming you have 5 models
    model_path = f'fine-tuned-legal-bert-fold{i}'
    tokenizer_path = f'fine-tuned-legal-bert-fold{i}'

    accuracy, precision, recall, f1 = test_model(test_texts, test_labels, model_path, tokenizer_path)
    print(f"Model {i} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

accuracy, precision, recall, f1 = test_model(test_texts, test_labels,'nlpaueb/legal-bert-base-uncased', 'nlpaueb/legal-bert-base-uncased')
print(f"Model (untrained Legal-BERT) - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
     

Model 0 - Accuracy: 0.9897959183673469, Precision: 0.98, Recall: 1.0, F1 Score: 0.98989898989899
Model 1 - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0
Model 2 - Accuracy: 0.9183673469387755, Precision: 1.0, Recall: 0.8367346938775511, F1 Score: 0.9111111111111111
Model 3 - Accuracy: 0.9591836734693877, Precision: 0.9591836734693877, Recall: 0.9591836734693877, F1 Score: 0.9591836734693877
Model 4 - Accuracy: 0.9693877551020408, Precision: 0.9423076923076923, Recall: 1.0, F1 Score: 0.9702970297029703


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model (untrained Legal-BERT) - Accuracy: 0.5, Precision: 0.5, Recall: 0.02040816326530612, F1 Score: 0.0392156862745098


In [19]:
# Load model
model = BertForSequenceClassification.from_pretrained("C:\\Users\\soria\\Desktop\\fine-tuned-legal-bert-fold1")
tokenizer = BertTokenizer.from_pretrained("C:\\Users\\soria\\Desktop\\fine-tuned-legal-bert-fold1")
model.to(device)
     

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [20]:
# Function for classification using Legal-BERT
def classify_clause_legal_bert(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions.item()

# Define a test clause
test_clause = "Each party shall cause any subsidiary or other affiliate (including, without limitation, a subsidiary or other affiliate of the online group or skype group, as applicable) to grant to the other party the audit rights granted hereunder with respect to such other party."

# Get the combined result
response = classify_clause_legal_bert(test_clause)

# Print the combined result
print(response)

1


In [56]:
# Set up OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# Get the list of models available
models_available = openai.Model.list()

# Loop through and print the model IDs
for model in models_available['data']:
    print(model['id'])

gpt-4-turbo
gpt-4-turbo-2024-04-09
tts-1
tts-1-1106
chatgpt-4o-latest
dall-e-2
gpt-4-turbo-preview
gpt-4o-mini
gpt-4o-mini-2024-07-18
gpt-3.5-turbo-instruct
gpt-4-0125-preview
gpt-3.5-turbo-0125
gpt-3.5-turbo
babbage-002
davinci-002
gpt-4o-realtime-preview-2024-10-01
dall-e-3
gpt-4o-realtime-preview
gpt-4o-2024-05-13
tts-1-hd
gpt-4o
tts-1-hd-1106
gpt-4-1106-preview
text-embedding-ada-002
gpt-3.5-turbo-16k
text-embedding-3-small
gpt-4o-2024-08-06
text-embedding-3-large
whisper-1
gpt-3.5-turbo-1106
gpt-4-0613
gpt-4
gpt-3.5-turbo-instruct-0914


In [69]:
def run_riskAnalysis(clause):
    # Risk Analysis using GPT-4o
    risk_template = "You are a legal advisor. Identify any potential risks in the clauses given to you."
    prompt = clause
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": risk_template},
            {"role": "user", "content": prompt}
        ]
    )
    return response['choices'][0]['message']['content']

# Define a test clause
test_clause = "Each party shall cause any subsidiary or other affiliate (including, without limitation, a subsidiary or other affiliate of the online group or skype group, as applicable) to grant to the other party the audit rights granted hereunder with respect to such other party."

# Get the combined result
response = run_riskAnalysis(test_clause)

# Print the combined result
print(response)
     

The clause presented contains several potential risks that could arise during its execution:

1. **Scope of Affiliates**: The clause seems to broadly include subsidiaries and other affiliates, potentially even those that are not directly relevant to the contract. This broad scope could lead to practical difficulties and disagreements about which entities are included. Further clarification and limitation might be necessary to avoid confusion and ensure compliance.

2. **Control Over Affiliates**: Parties may not have sufficient control over their affiliates to enforce such obligations effectively. Not all affiliates may be willing or able to comply with the audit requirements, which could lead to breaches of the agreement and potential legal disputes.

3. **Confidentiality and Privacy Concerns**: Auditing rights might lead to exposure of sensitive, confidential, or proprietary information of subsidiaries or affiliates. There must be safeguards such as Non-Disclosure Agreements (NDAs) t

In [70]:
# Function to run the prompt using the ChatCompletion endpoint
def run_gpt_integration(classification_label, risk_analysis, clause):
    prompt = (
        f"Here is a contract clause that has been classified as '{classification_label}':\n\n"
        f"'{clause}'\n\n"
        f"The potential risks identified in this clause are:\n{risk_analysis}\n\n"
    )
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a legal advisor. Please provide an integrated, cohesive explanation of this clause, its classification, and the identified risks. Provide the respone in the following template:"},
            {"role": "user", "content": prompt}
        ]
    )
    return response['choices'][0]['message']['content']

In [71]:
# Define a combined function
def classify_and_analyze_clause(clause):
    classification_result = classify_clause_legal_bert(clause)
    classification_label = "Audit Clause" if classification_result == 1 else "Not an Audit Clause"
    risk_analysis = run_riskAnalysis(clause)
    integrated_response = run_gpt_integration(classification_label, risk_analysis, clause)
    return integrated_response

# Define a test clause
test_clause = "Each party shall cause any subsidiary or other affiliate (including, without limitation, a subsidiary or other affiliate of the online group or skype group, as applicable) to grant to the other party the audit rights granted hereunder with respect to such other party."

# Get the combined result
response = classify_and_analyze_clause(test_clause)

# Print the combined result
print(response)

**Clause Explanation:**

The presented clause is an "Audit Clause," focusing on granting audit rights between contracting parties and their respective subsidiaries or affiliates. The clause mandates that each party ensure their subsidiaries or affiliates provide similar audit rights as those stated in the contract, without limiting those rights to specific terms or conditions.

**Clause Classification:**

The clause is classified as an 'Audit Clause' because it pertains to the right to examine, verify, and ensure compliance with contractual obligations by one party upon another's subsidiaries or affiliates. It potentially involves financial, procedural, or operational inspections, as allowed by the broader scope of the agreement.

**Identified Risks and Explanations:**

1. **Ambiguity and Broad Scope**: This clause lacks specificity in defining the exact audit rights, which could result in differing interpretations and conflicts over its implementation. A dispute might arise if one par

In [72]:
# Combined function to classify and analyze a clause
def classify_and_analyze_clause(clause):
    try:
        # Classify the clause
        classification_result = classify_clause_legal_bert(clause)
        classification_label = "Audit Clause" if classification_result == 1 else "Not an Audit Clause"
        
        # Perform risk analysis
        risk_analysis = run_riskAnalysis(clause)
        
        # Generate an integrated response
        integrated_response = run_gpt_integration(classification_label, risk_analysis, clause)
        
        return integrated_response
    
    except Exception as e:
        return f"An error occurred: {e}"

# Test clause to classify and analyze
test_clause = "Each party shall cause any subsidiary or other affiliate (including, without limitation, a subsidiary or other affiliate of the online group or skype group, as applicable) to grant to the other party the audit rights granted hereunder with respect to such other party."

# Get and print the combined result
response = classify_and_analyze_clause(test_clause)
print(response)


### Explanation of the Clause:

The given contract clause is classified as an "Audit Clause". It requires each party involved in the contract to ensure that their subsidiaries or affiliates grant the other party the same audit rights as those outlined in the contract. Essentially, this clause aims to extend the audit rights from the primary parties involved to include their respective subsidiaries and affiliates, ensuring a wider scope for auditing.

### Classification:

This is an "Audit Clause," which typically outlines the rights and procedures by which one party can inspect, verify, or review the accounts, operations, processes, or records of the other party. Such clauses are used in contracts to ensure transparency, compliance, and accountability, often in financial, data handling, or compliance contexts.

### Identified Risks:

1. **Lack of Specificity**: The clause does not clearly define the scope and limitations of the audit rights. Without clear boundaries, this could lead to