In [3]:
#!/usr/bin/env python
# coding: utf-8

# In[5]:


from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
import torch
import pandas as pd
import re
from tqdm import tqdm
from torch.cuda.amp import autocast
import numpy as np
import os
import torch
import torch.nn as nn
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split





# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")

base_model_name = "meta-llama/Llama-3.1-8B"



model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16", 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id


df = pd.read_csv('filtered_labeled.csv')

def clean_text(text):
    # Remove newline-separated letters
    cleaned_text = re.sub(r'(\n[a-zA-Z])', '', text)
    
    # Remove single newlines or extraneous spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'11.', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r"\n\'", ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'~', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\n1', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'~{2}', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\({2}', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\){2}', ' ', cleaned_text).strip()
    return cleaned_text

df['text'] = df['text'].apply(clean_text)

df = df.sample(frac=1, random_state=85).reset_index(drop=True)

system_prompt = "You are a helpful annotator for a dataset of legal contracts between entities in Iowa state."
pre_condition = "The class name is "

train_end = int(0.8 * len(df))
eval_end = train_end + int(0.1 * len(df))

# Split the data
X_train = df[:train_end]
print(len(X_train))
X_eval = df[train_end:eval_end]
print(len(X_eval))
X_test = df[eval_end:]
print(len(X_test))

# Categories and prompts
class_names = [
    'joint operations', 'new joint entities', 'resource sharing', 'service contract']

# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
Given this contract :
{data_point["text"]}

"You are a contract classification assistant. Your task is to classify the contract text "
        "into one of the predefined categories. Here are the criteria for each category:\n"
        "- Joint Operations: Partnership arrangements to jointly produce services with one or more organizations.\n"
        "- New Joint Entities: Two or more organizations creating a separate new entity to manage or govern a shared asset or service.\n"
        "- Resource Sharing: Sharing of information, personnel, equipment, etc., between governments or community organizations to provide services.\n"
        "- Service Contracts: Agreements with outside entities, public or private, for provision or support services.\n"
        "Analyze the given text carefully and respond with the appropriate category."

"""
#            Classify the text into {class_names} and return the answer as the govermental contract label. Give me your best guess.
#text: {data_point["text"]}
#label: {data_point["Institutional_Form"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
Given this contract :
{data_point["text"]}

"You are a contract classification assistant. Your task is to classify the contract text "
        "into one of the predefined categories. Here are the criteria for each category:\n"
        "- Joint Operations: Partnership arrangements to jointly produce services with one or more organizations.\n"
        "- New Joint Entities: Two or more organizations creating a separate new entity to manage or govern a shared asset or service.\n"
        "- Resource Sharing: Sharing of information, personnel, equipment, etc., between governments or community organizations to provide services.\n"
        "- Service Contracts: Agreements with outside entities, public or private, for provision or support services.\n"
        "Analyze the given text carefully and respond with the appropriate category."

"""
#            Classify the text into {class_names} and return the answer as the govermental contract label. Give me your best guess.
#text: {data_point["text"]}
#label: """.strip()




# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'Institutional_Form']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])



# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=50, 
                        temperature=0.1)

def predict(test, model, tokenizer):
    y_pred = []
    categories = ['joint operations', 'new joint entities', 'resource sharing', 'service contract']
    
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        #print(prompt)
        
        
        
        result = pipe(prompt)
        #print(result)
        input_length = len(prompt)
        #answer = result#[input_length:]
        answer = result[0]['generated_text'][len(prompt):].strip() #.split("label:")[-1].strip()
        #print(answer)
        
        # Determine the predicted category
        for category in categories:
            if category.lower() in str(answer).lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred

y_pred = predict(X_test, model, tokenizer)



def evaluate(y_true, y_pred):
    labels = ['joint operations', 'new joint entities', 'resource sharing', 'service contract']

    mapping = {label: idx for idx, label in enumerate(labels)}
    
    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data
    
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

evaluate(y_true, y_pred)



Running on cuda


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.38s/it]


498
62
63


 16%|█▌        | 10/63 [00:16<01:16,  1.44s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 63/63 [01:31<00:00,  1.45s/it]

Accuracy: 0.063
Accuracy for label joint operations: 0.200
Accuracy for label new joint entities: 0.000
Accuracy for label resource sharing: 0.000
Accuracy for label service contract: 0.026

Classification Report:
                    precision    recall  f1-score   support

  joint operations       0.20      0.20      0.20        15
new joint entities       0.00      0.00      0.00         6
  resource sharing       0.00      0.00      0.00         4
  service contract       1.00      0.03      0.05        38

         micro avg       0.25      0.06      0.10        63
         macro avg       0.30      0.06      0.06        63
      weighted avg       0.65      0.06      0.08        63


Confusion Matrix:
[[3 0 0 0]
 [2 0 0 0]
 [1 0 0 0]
 [9 0 0 1]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
import gc
import torch


# Clear GPU memory (if applicable)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


In [2]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate
import re
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


# Load and preprocess dataset
df = pd.read_csv('filtered_labeled.csv')

def clean_text(text):
    cleaned_text = re.sub(r'(\n[a-zA-Z])', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

df['text'] = df['text'].apply(clean_text)

df['Institutional_Form'] = df['Institutional_Form'].astype('category')
df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

category_map = {code: category for code, category in enumerate(df['Institutional_Form'].cat.categories)}

df_train, df_val = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)

def generate_features_with_prompt(df):
    instruction = (
        "You are a contract classification assistant. Your task is to classify the contract text "
        "into one of the predefined categories. Here are the criteria for each category:\n"
        "- Joint Operations: Partnership arrangements to jointly produce services with one or more organizations.\n"
        "- New Joint Entities: Two or more organizations creating a separate new entity to manage or govern a shared asset or service.\n"
        "- Resource Sharing: Sharing of information, personnel, equipment, etc., between governments or community organizations to provide services.\n"
        "- Service Contracts: Agreements with outside entities, public or private, for provision or support services.\n"
        "Analyze the given text carefully and respond with the appropriate category."
    )
    df['Institutional_Form'] = df['Institutional_Form'].astype(str)
    df['input'] = (
        instruction
        + "\n\nContract Text: "
        + df['text']
        + "\n\nInstitutional Form Category: "
        + df['Institutional_Form']
    )
    return df

generate_features_with_prompt(df_train)
generate_features_with_prompt(df_val)

dataset_train = Dataset.from_pandas(df_train.reset_index(drop=True))
dataset_val = Dataset.from_pandas(df_val.reset_index(drop=True))

dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
})

# Load pre-trained model
model_name = "meta-llama/Llama-3.1-8B"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16, 
    bnb_4bit_use_double_quant=True
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(category_map),
    device_map="auto"  # Automatically distribute model layers across GPUs
)
# Update the model configuration
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False  # Ensure compatibility with fine-tuning

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

def llama_preprocessing_function(examples):
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("Institutional_Form_category", "label")
tokenized_datasets.set_format("torch")

# Collate function
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

# Prediction and evaluation
def make_predictions(model, df):
    # Ensure padding token is defined
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    sentences = df.input.tolist()

    batch_size = 32  # Adjust based on system capacity
    all_outputs = []

    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])

    final_outputs = torch.cat(all_outputs, dim=0)
    df['predictions'] = final_outputs.argmax(axis=1).cpu().numpy()

    return df


def get_performance_metrics(df):
    y_test = df.Institutional_Form_category
    y_pred = df.predictions
    print(f"comparing test {y_test} and pred {y_pred}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

df_val = make_predictions(model, df_val)
get_performance_metrics(df_val)


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.26s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 498/498 [00:00<00:00, 1606.78 examples/s]
Map: 100%|██████████| 125/125 [00:00<00:00, 1601.25 examples/s]


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    1
558    3
174    0
280    1
110    1
      ..
6      1
104    3
114    1
355    1
132    1
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[ 2  9  5  5]
 [ 1 13  5  2]
 [ 0  5  2  3]
 [ 5 38  6 24]]

Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.10      0.14        21
           1       0.20      0.62      0.30        21
           2       0.11      0.20      0.14        10
           3       0.71      0.33      0.45        73

    accuracy                           0.33       125
   macro avg       0.32      0.31      0.26       125
weighted avg       0.50      0.33      0.35       125

Balanced Accuracy Score: 0.3107632093933464
Accuracy Score: 0.328


In [4]:
import pandas as pd
df = pd.read_csv('filtered_labeled.csv')


df['Institutional_Form'] = df['Institutional_Form'].astype('category')
df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes
category_map = {code: category for code, category in enumerate(df['Institutional_Form'].cat.categories)}

In [5]:
category_map

{0: 'joint operations',
 1: 'new joint entities',
 2: 'resource sharing',
 3: 'service contract'}