In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
from pathlib import Path
import pandas as pd
import itertools
import tqdm
from huggingface_hub import HfApi, HfFolder
HfFolder.save_token("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")
from mlx_lm import load, generate
import re

if torch.backends.mps.is_available():
     device = torch.device("mps")
     print("Running on MPS")

Running on MPS


In [2]:
model, tokenizer = load("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [3]:
file = open('unlabeled_dataframe_with_examples.csv', 'r')
df = pd.read_csv(file)

In [4]:
def clean_text(text):
    # Remove newline-separated letters
    cleaned_text = re.sub(r'(\n[a-zA-Z])', '', text)
    
    # Remove single newlines or extraneous spaces
    cleaned_text = re.sub(r'\n+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'11.', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r"\n\'", ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\n', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'~', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\n1', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'~{2}', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\({2}', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'\){2}', ' ', cleaned_text).strip()




    
    return cleaned_text
df['OCR_Text'] = df['OCR_Text'].apply(clean_text)

In [5]:
class_names = [ 
    'Joint Operations', 
    'New Joint Entities', 
    'Resource Sharing', 
    'Grants/Other Subsidies', 
    'Joint Facilities/Co-location', 
    'Transfer of Authority', 
    'Transfer of Responsibility', 
    'Cooperative Purchasing Arrangements', 
    'Joint Facilities', 
    'Joint Production', 
    'Other', 
    'Cooperative Purchasing Agreement'
]

In [6]:
system_prompt = "You are a helpful annotator for a dataset of legal contracts between entities in Iowa state."
pre_condition = "The class name is "

In [31]:
results = []

# Limit to the first 100 rows
for index, row in tqdm.tqdm(df.iloc[12:15].iterrows(), total=38):
    unlabeled_text = row['OCR_Text']
    pdf_id = row['PDF_ID']

    # Define the input prompt for LLaMA
    input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different categories {class_names}, classify the following contract :


{unlabeled_text}


Please respond with only the predicted label from the categories.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""

    # Generate the output with the model
    outputs = generate(model, tokenizer, prompt=input_text)
    print(outputs)
    
    results.append({
                    'PDF_ID': pdf_id,
                    #'OCR_Text': unlabeled_text,
                    'Predicted_Category': outputs})

# Convert results to DataFrame if needed
results_df = pd.DataFrame(results)
print(results_df)


  3%|█▏                                          | 1/38 [00:10<06:44, 10.94s/it]

 'Joint Operations'


  5%|██▎                                         | 2/38 [00:30<09:29, 15.83s/it]

28E Master Agreement, which is a type of agreement under the Iowa Code Chapter 28E. Therefore, the predicted label is:

"28E Master Agreement"


  8%|███▍                                        | 3/38 [00:40<07:56, 13.60s/it]

28E Agreement for a Youth at Risk High School Program, which falls under the category "28E Agreement" or more specifically "Joint Operations".
    PDF_ID                                 Predicted_Category
0  L006807                                 'Joint Operations'
1  M508110  28E Master Agreement, which is a type of agree...
2  M501807  28E Agreement for a Youth at Risk High School ...





In [34]:
results

[{'PDF_ID': 'L006807', 'Predicted_Category': " 'Joint Operations'"},
 {'PDF_ID': 'M508110',
  'Predicted_Category': '28E Master Agreement, which is a type of agreement under the Iowa Code Chapter 28E. Therefore, the predicted label is:\n\n"28E Master Agreement"'},
 {'PDF_ID': 'M501807',
  'Predicted_Category': '28E Agreement for a Youth at Risk High School Program, which falls under the category "28E Agreement" or more specifically "Joint Operations".'}]

In [11]:
batch_size = 64
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[12:1000]), batch_size)):
    batch_df = df.iloc[12:1000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df = pd.DataFrame(results)
print(results_df)

  0%|                                                    | 0/16 [00:00<?, ?it/s]

 'Joint Operations'
 "Cooperative Purchasing Arrangements".
 "Joint Operations"
28E Agreement between the IOWA VALLEY COMMUNITY COLLEGE DISTRICT and the ELLSWORTH COMMUNITY COLLEGE AND ONTEZUMA COMMUNITY SCHOOL DISTRICT. 

The predicted label is: 28E Agreement
28E Agreement.
 "Joint Operations"
 "Joint Production"
28Egreement508120.
 "Cooperative Purchasing Arrangements".
 "Grants/Other Subsidies"
 "Cooperative Purchasing Arrangements".
28 E Agreement between the IOWA VALLEY COMMUNITY COLLEGE DISTRICT and the HUBBARD-RADCLIFFE COMMUNITY SCHOOL DISTRICT.
 'Joint Facilities/Co-location'.
 'Joint Facilities/Co-location'
 "Joint Facilities/Co-location".
15.8 Choice of law and forum.
10.
 "Cooperative Purchasing Arrangements".
 "Joint Facilities/Co-location".
 "Education"
28E Agreement for School-Based Supervision Programs. 

Predicted label: 28E Agreement
1.
 'Resource Sharing'.
 "Joint Operations"
 'Joint Operations'.
 'Shared Personnel Agreement'
 "Consultant/Facilitator".
 'Grants/Other

  6%|██▌                                     | 1/16 [17:20<4:20:13, 1040.93s/it]

 "Information Systems Management Academy".
 "Joint Facilities/Co-location".
28E Agreement.
 "High School Completion Program" which is a subset of "Education" under the category "Public Service". However, the most specific class name from the list that matches the content of the contract is "Joint Operations".
 "Cooperative Purchasing Arrangements"
8.1.
 "School-Based Supervision Program" and the predicted label is "School-Based Supervision Program".
 'Cooperative Purchasing Arrangements'
28E Cooperative Agreement, which is a type of Cooperative Agreement.
 "Joint Operations"
 "Grants/Other Subsidies"
15.15 Supersedes former agreements.
 'Grants/Other Subsidies'.
28E Agreement, which is a type of Cooperative Agreement. Therefore, the predicted label is "Cooperative Agreement".
 "28E Agreement" which is a type of Cooperative Purchasing Arrangement.
 "Joint Operations"
 'Joint Facilities/Co-location'.
 "Medical Laboratory Technology Program" which is a subset of "New Joint Entities".
 'Jo

 12%|█████▏                                   | 2/16 [33:05<3:49:37, 984.08s/it]

28E Agreement for School-Based Supervision Programs. 

Predicted label: 28E Agreement
 "Cooperative Purchasing Arrangements"
 "Joint Production".
 'Resource Sharing'.
 "Joint Facilities/Co-location"
 "Joint Operations"
5.5ederal Funds.
 "Joint Operations"
 "Joint Operations"
 "Joint Operations"
28E Agreement, which is a type of Cooperative Agreement. Therefore, the predicted label is "Cooperative Agreement".
 "Joint Operations"
 "Cooperative Purchasing Arrangements".
 "Cooperative Purchasing Arrangements"
 "Cooperative Purchasing Arrangements".
 "Preschool agreement" which corresponds to the class name "Preschool" or more specifically "Other" since it's a preschool program.
 'Resource Sharing'.
28E AGREEMENT FOR DEVELOPMENTAL SERVICES
 'Grants/Other Subsidies'.
 "Joint Facilities/Co-location"
 'Joint Operations'.
15.24 Additional provisions.
28E Clinical Training Agreement.
15.2 Review of J CS, or Department action.
 'Grants/Other Subsidies'.
 "Cooperative Purchasing Arrangements".
 'J

 19%|███████▋                                 | 3/16 [46:57<3:18:10, 914.67s/it]

 "Joint Operations"
28E Agreement.
 "Other"
 'Joint Operations'
 '28E Agreement'.
 "Joint Production".
 'Joint Operations'
 'Grants/Other Subsidies'.
 'Joint Operations'
 "Joint Operations".
 "Cooperative Purchasing Arrangements".
 "Cooperative Purchasing Arrangements".
28 E, which is a type of public service agreement. Based on the context, the predicted label is "Public Service Agreement".
 "High School Completion Program" which is a subset of "Education" which is a subset of "Any Education" which is a Service Code and Description in the dataset. Therefore, the predicted label is "910 Any Education (Enter only one Service Code and Description)".
28E Agreement.
 "Joint Production".
 "Assignment and Assumption Agreement" which is a type of "Transfer of Responsibility" or more specifically "Transfer of Authority" but most closely matches the class name "Transfer of Responsibility".
 "Other"
 "Joint Production".
 "Grants/Other Subsidies"
 "Cooperative Purchasing Arrangements"
 "Joint Ope

 25%|██████████▎                              | 4/16 [58:47<2:46:49, 834.09s/it]

 "Career Advantage Programs"
28E Agreement between the CITY OF WEST DES MOINES and the WEST DES MOINES SCHOOL DISTRICT. 

The predicted label is: 28E Agreement
 "Cooperative Purchasing Arrangements".
 "Cooperative Purchasing Arrangements".
 "Joint Operations"
 'Joint Production'.
 "Member".
 "Cooperative Purchasing Arrangements".
 "School Resource Officer Program" which is a subset of "Joint Operations".
 "Joint Production".
 'Joint Production'.
15.5 Drug free workplace.
 "Cooperative Purchasing Arrangements"
28 E FLEESE RONI FILED, which is not in the list. However, the predicted label is likely to be "Public Service" or "Education", as the agreement is related to a preschool program.
15.3 estrictions on use of funds.
15.13 Not a joint venture.
 "Joint Facilities/Co-location"
 "Joint Operations"
 'Joint Operations'.
28E Agreement. 

The predicted label is: Cooperative Agreement
 "Cooperative Purchasing Arrangements".
 'Joint Operations'.
 "Newborn Hearing Screening Program" which is a

 31%|████████████▏                          | 5/16 [1:13:00<2:34:07, 840.72s/it]

28E Agreement, which is a type of Cooperative Purchasing Arrangement.
 "Joint Operations" or more specifically "Joint Production" is not the correct answer, but the closest one is "Joint Operations" or "Joint Production" is not the correct answer, but the closest one is "Joint Operations" or "Joint Production" is not the correct answer, but the closest one is "Joint Operations" or "Joint Production" is not the correct answer, but the closest one is "Joint Operations" or "Joint Production" is not the correct answer, but the closest
 'KONSORTIUM AGREEMENT'.
 "Cooperative Purchasing Arrangements"
 "NORTH FAYETTE COMMUNITY SCHOOL DISTRICT BY President of Board of Directors Secretary NORTH FAYETTE COMMUNITY SCHOOL DISTRICT STATE OF IOWA FAYETTE COUNTY On this 2 l st day of September 1999, before me, the undersigned. a Notary Public in and for said State, personally appeared Thomas W. Carlson and Ann Same k to me personally known, who being by me duly sworn, did say that they are the Preside

 38%|██████████████▋                        | 6/16 [1:26:21<2:17:52, 827.29s/it]

 "Cooperative Purchasing Arrangements"
28E Agreement, which is a type of Cooperative Agreement. Therefore, the predicted label is "Cooperative Agreement".
 "Transfer of Responsibility".
28 E Agreement.
 "Cooperative Purchasing Arrangements"
 "Greement"
28E Agreement or Statewide Preschool Program for Four-Year-Old Children.
28E Agreement, which is a type of Cooperative Agreement. Therefore, the predicted label is "Cooperative Agreement".
 'Joint Production'.
 "Cooperative Purchasing Arrangements".
28E Agreement, which is a type of Cooperative Agreement. Therefore, the predicted label is: Cooperative Agreement
 "Cooperative Purchasing Arrangements"
 "Joint Production"
 'Joint Operations'
 "Joint Operations"
 "OPTIONS FOR ENROLLING IN ADJOINING STATES."
 "Article II"
 "Joint Operations".
28E Agreement, which is a type of Cooperative Purchasing Arrangement. 

The predicted label is: Cooperative Purchasing Arrangement.
 'Grants/Other Subsidies'.
15.5 Drug free workplace.
 'Joint Operations

 44%|█████████████████                      | 7/16 [1:38:54<2:00:26, 803.00s/it]

 "Cooperative Purchasing Arrangements".
 'Grants/Other Subsidies'.
 'Joint Operations'.
 'Joint Operations'
28E AGREEMENTS TO BE RECORDED
 "Joint Facilities/Co-location"
 "Cooperative Purchasing Arrangements"
 'Joint Facilities/Co-location'.
15.25 Further assurances and corrective instruments.
 "Joint Production"
 "Joint Operations"
 "Joint Production".
 "School-Based Supervision Program" and the predicted label is "School-Based Supervision Program" or "School-Based Supervision".
28 E Agreement, which is a type of Public Service agreement. Therefore, the predicted label is: 910 Any Education.
 "Cooperative Purchasing Arrangements".
 Cooperative Purchasing Arrangements.
 "28E Agreement"
 "Information Technology Program" which is a subset of "Joint Production" or more specifically "Joint Production" is a class name that includes "Information Technology Program" as a type of joint production.
10.1.
 "Joint Operations"
15.20 Time is of the essence. Time is of the essence with respect to th

 50%|███████████████████▌                   | 8/16 [1:52:31<1:47:38, 807.33s/it]

 "Cooperative Purchasing Arrangements".
28EAGREEMENT.
 'Joint Production'.
 'Joint Operations'
 "Joint Operations"
 "Cooperative Purchasing Arrangements".
 "Management Services"
 "Joint Facilities/Co-location"
15.15 Supersedes former agreements.
 'Joint Facilities/Co-location'.
 "Joint Production"
28 E Agreement for Developmental Services.
 'Grants/Other Subsidies'.
 "Joint Facilities/Co-location"
 "Interagency 28E Agreement"
 "Grants/Other Subsidies".
 "Cooperative Purchasing Arrangements".
28E Agreement, which is a type of Cooperative Purchasing Arrangements.
 "Joint Operations".
 "Cooperative Purchasing Arrangements".
 "Joint Facilities/Co-location"
28-E Agreement.
13. Cumulative Rights.
 "Joint Purchasing Arrangements"
 "Joint Operations"
9.7.
28E Agreement.
 'Whole-Grade Sharing'
 'Joint Production'.
 'Inter-Agency Sharing Agreement'
 "Joint Facilities/Co-location"
 "Joint Production".
 'Cooperative Purchasing Arrangements'.
 "Cooperative Purchasing Arrangements".
 'Joint Producti

 56%|█████████████████████▉                 | 9/16 [2:03:38<1:29:04, 763.48s/it]

 'Joint Operations'
 'Joint Operations'.
15.14 Obligations beyond the term of this contract.
 "Cooperative Purchasing Arrangements".
 "Cooperative Purchasing Arrangements".
 "Joint Operations"
 "Joint Operations".
28 E Agreement.
 "Cooperative Purchasing Arrangements"
 'Joint Operations'.
 'Grants/Other Subsidies'.
 'Joint Production'.
28E Agreement.
 "Grants/Other Subsidies"
 'Cooperative Purchasing Arrangements'
 "Joint Operations"
 "Career Advantage Program"
 "Solid Waste and Recycling Education Position"
 "Cooperative Purchasing Arrangements".
 'Grants/Other Subsidies'.
 'Grants/Other Subsidies'.
 "Education"
 "Cooperative Purchasing Arrangements".
28E Agreement, which is a type of Public Service included in this agreement is: 910 Any Education ( Enter only one Service Code and Description) Code Number Service Description. Therefore, the predicted label is 910.
28E Agreement.
 "Joint Facilities/Co-location"
15.8 Choice of law and forum.
28E Agreement between the Iowa Valley Communi

 62%|███████████████████████▊              | 10/16 [2:16:40<1:16:56, 769.38s/it]

 'Joint Operations'
 "Joint Facilities/Co-location"
 "Bridges for Learning in Applied Science & Technology (B.L.A.S.T.)" which is a subset of "Secondary Education Programs" and more specifically "Any Education" under the Public Service code 910. Therefore, the predicted label is: 910
28 E AGREEMENT.
 "Cooperative Purchasing Arrangements".
 "Cooperative Purchasing Arrangements".
 "Other"
 "Cooperative Purchasing Arrangements".
 "Joint Operations"
 "Cooperative Purchasing Arrangements".
 "Joint Production".
 "Joint Operations"
 'Joint Operations'.
28E Agreement, which is a type of Cooperative Agreement. Therefore, the predicted label is "Cooperative Agreement".
 'Joint Operations'
 "Cooperative Purchasing Arrangements"
 "Cooperative Purchasing Arrangements"
28E AGREEMENT FOR SCHOOL-BASED SUPERVISION PROGRAMS.
 "Joint Employment and Sharing Agreement"
 "Cooperative Purchasing Arrangements".
 "Cooperative Purchasing Arrangements"
 'Joint Operations'
 "Cooperative Purchasing Arrangements".


 69%|██████████████████████████▏           | 11/16 [2:32:00<1:07:57, 815.49s/it]

15.15 Supersedes former agreements.
 "Transfer of Responsibility".
 "Joint Operations"
9.7.1.
 "Joint Operations"
 "Agreement for an Alternative HighSchool Program with Iowa Falls Community School District" is not the correct answer, the correct answer is  "Agreement for an Alternative HighSchool Program with Iowa Falls Community School District" is one of the 28E agreements, the correct answer is  "28E Agreement"
 'Grants/Other Subsidies'.
 "Cooperative Purchasing Arrangements".
 'Joint Operations'.
 "Renewal Of Agreement"
 "Joint Production".
15.24 Additional provisions.
 "28 E Agreement for developmental services" which is a subcategory of "Grants/Other Subsidies".
 "Grants/Other Subsidies".
 "Joint Production".
 "Inter government al Supervision Program Agreement"
 "School Attendance Task Force Agreement" which is a type of "Joint Operations" or more specifically "Cooperative Arrangements" or "Joint Agreement" but the closest match is "Joint Operations"
 "First Amendment to 28 E Agr

 75%|██████████████████████████████          | 12/16 [2:46:06<54:58, 824.69s/it]

 "Cooperative Purchasing Arrangements"
 "Joint Operations"
 "Resource Sharing".
 "Joint Operations"
15.18 Cumulative rights.
 "Cooperative Purchasing Arrangements"
 'Joint Operations'
 "Cooperative Purchasing Arrangements"
9. Set off.
 "Joint Production".
 "Joint Operations"
 "Grants/Other Subsidies".
 'Grants/Other Subsidies'.
 "Cooperative Purchasing Arrangements"
 "Cooperative Purchasing Arrangements".
 'Shared Personnel Agreement'
 'Joint Facilities/Co-location'.
28E Agreement, which is a type of public service agreement. 

The predicted label is: 910 Any Education
 "Joint Production"
 'Joint Facilities/Co-location'.
 "Joint Operations"
 'Cooperative Purchasing Arrangements'.
 'Joint Production'.
 'Joint Production'.
 'Joint Operations'.
 "28E Agreement for Educational Programming for Suspended and Expelled Students"
28E Agreement.
 "Cooperative Purchasing Arrangements".
 "Joint Facilities/Co-location".
 "Joint Facilities/Co-location"
 "Cooperative Purchasing Arrangements".
 'Joint

 81%|████████████████████████████████▌       | 13/16 [2:58:54<40:22, 807.44s/it]

28E Agreement.
 "Joint Operations"
28E MASTER AGREEMENT. 

The predicted label is: 28E MASTER AGREEMENT.
 "Joint Operations"
 "Joint Operations"
 'Grants/Other Subsidies'.
28E Agreement between the City of Des Moines and the Des Moines Independent Community School District.
15.21 Authorization.
 'Joint Production'.
 'Joint Operations'.
15.5 Drug free workplace.
 "Whole Grade Sharing"
 '28E Agreement'
 'Cooperative Purchasing Arrangements'.
 "Cooperative Purchasing Arrangements".
 "Cooperative Purchasing Arrangements".
 "Information Technology Program & Auto Tech"
 "Agreement for an Alternative HighSchool Program with Iowa Falls Community School District"
28 E AGREEMENT CAREER ADVANTAGE PROGRAMS.
 "Joint Production".
 "28E Agreement"
 "Cooperative Purchasing Arrangements".
 'Joint Operations'
 "Joint Production".
 "Grants/Other Subsidies"
 "Cooperative Purchasing Arrangements".
 "Renewal/Modification of an Existing Agreement"
 "Cooperative Purchasing Arrangements".
28E Agreement between

 88%|███████████████████████████████████     | 14/16 [3:10:31<25:48, 774.20s/it]

 "Joint Production".
4.4 otice to District about outcomes.
 'Grants/Other Subsidies'.
28E AGREEMENT WITH CHARLES CITY COMMUNITY SCHOOL DISTRICT AND THE BUILDING BRIDGES INTERVENTION CENTER.
 "Cooperative Purchasing Arrangements".
668.25age 2 (ii) False claims of independent student status; (iii) False claims of citizenship; (iv) Use of false identities; (v) Forgery of signatures or certifications; and (vi) False statements of income; (3) Be jointly and severally liable with the institution to the Secretary for any violation by the servicer of any statutory provision of or applicable to Title IV of the HEA, any regulatory provision prescribed under that statutory authority, and any applicable
28 E AGREEMENT YOUTH AT RISK HIGH SCHOOL PROGRAM.
 "Cooperative Purchasing Arrangements".
 'Joint Facilities/Co-location'.
28 E Agreement. 

The predicted label is: 28 E Agreement.
 'Joint Production'.
28E Agreement, which is a type of Cooperative Agreement. Therefore, the predicted label is "Coope

 94%|█████████████████████████████████████▌  | 15/16 [3:23:37<12:57, 777.72s/it]

 'Corporate Line Agreement and Extensions'.
 "Joint Operations"
 "School-Based Supervision Program" or more specifically, "Joint Operations" or "Cooperative Purchasing Arrangements" but the most fitting class name is "School-Based Supervision Program" or more specifically "Joint Operations" but the most fitting class name is "Joint Operations"
 "Alternative Educational Program for High School Students and Dropouts" which is a subset of "Other" or more specifically "Alternative Educational Program" which is a subset of "Other" but can also be classified as "Alternative Educational Program" which is a subset of "Other" but can also be classified as "Alternative Educational Program" which is a subset of "Other" but can also be classified as "Alternative Educational Program" which is a subset of "Other" but can also be classified as "
 "Cooperative Purchasing Arrangements"
 "Joint Operations"
 "Cooperative Purchasing Arrangements".
15.15 Supersedes former agreements.
28E Agreement, which i

100%|████████████████████████████████████████| 16/16 [3:30:13<00:00, 788.35s/it]

15.5 Drug free workplace.
      PDF_ID                                 Predicted_Category
0    L006807                                 'Joint Operations'
1    M508110             "Cooperative Purchasing Arrangements".
2    M501807                                 "Joint Operations"
3    M035242  28E Agreement between the IOWA VALLEY COMMUNIT...
4    M016858                                     28E Agreement.
..       ...                                                ...
983  L006008              'Cooperative Purchasing Arrangements'
984  M502589                              9.4 otice of default.
985  M013472  28E Agreement between the IOWA SCHOOL CASH ANT...
986  M504984                                                 9.
987  M034463                          15.5 Drug free workplace.

[988 rows x 2 columns]





In [12]:
results_df.to_csv('first_batch.csv')

In [7]:
batch_size = 32
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[1000:2000]), batch_size)):
    batch_df = df.iloc[1000:9000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df2 = pd.DataFrame(results)
#print(results_df)
results_df2.to_csv('batch_2.csv')

100%|████████████████████████████████████████| 32/32 [3:39:54<00:00, 412.32s/it]


In [12]:
batch_size = 32
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[2000:3000]), batch_size)):
    batch_df = df.iloc[2000:3000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df3 = pd.DataFrame(results)
#print(results_df)
results_df3.to_csv('batch_3.csv')

100%|████████████████████████████████████████| 32/32 [4:03:33<00:00, 456.67s/it]


In [None]:
batch_size = 28
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[3000:4000]), batch_size)):
    batch_df = df.iloc[3000:4000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df4 = pd.DataFrame(results)
#print(results_df)
results_df4.to_csv('batch_4.csv')

 97%|██████████████████████████████████████▉ | 35/36 [4:07:42<07:00, 420.39s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (301547 > 131072). Running this sequence through the model will result in indexing errors


In [None]:
batch_size = 64
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[4000:5000]), batch_size)):
    batch_df = df.iloc[4000:5000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df5 = pd.DataFrame(results)
#print(results_df)
results_df5.to_csv('batch_5.csv')

In [11]:
batch_size = 28
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[5000:6000]), batch_size)):
    batch_df = df.iloc[1000:9000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df3 = pd.DataFrame(results)
#print(results_df)
results_df2.to_csv('batch_6.csv')

100%|████████████████████████████████████████| 36/36 [3:38:18<00:00, 363.85s/it]


In [12]:
batch_size = 28
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[6000:7000]), batch_size)):
    batch_df = df.iloc[1000:9000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df3 = pd.DataFrame(results)
#print(results_df)
results_df2.to_csv('batch_7.csv')

100%|████████████████████████████████████████| 36/36 [3:37:11<00:00, 361.99s/it]


In [7]:
batch_size = 32
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[7000:8000]), batch_size)):
    batch_df = df.iloc[7000:8000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df3 = pd.DataFrame(results)
#print(results_df)
results_df2.to_csv('batch_8.csv')

100%|████████████████████████████████████████| 32/32 [1:39:55<00:00, 187.35s/it]


NameError: name 'results_df2' is not defined

In [11]:
results_df3.to_csv('batch_8.csv')

In [10]:
batch_size = 32
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[8000:9000]), batch_size)):
    batch_df = df.iloc[8000:9000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df4 = pd.DataFrame(results)
#print(results_df)
results_df4.to_csv('batch_9.csv')

100%|████████████████████████████████████████| 32/32 [2:40:29<00:00, 300.92s/it]


In [8]:
batch_size = 32
results = []


# Limit to the first 100 rows
for i in tqdm.tqdm(range(0, len(df.iloc[9000:10000]), batch_size)):
    batch_df = df.iloc[9000:10000].iloc[i:i+batch_size]
    
    # Create input texts for the batch
    input_texts = []
    pdf_ids = []
    
    for index, row in batch_df.iterrows():
        unlabeled_text = row['OCR_Text']
        pdf_id = row['PDF_ID']
        
        # Define the input prompt for LLaMA
        input_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

Given different class names =  {class_names}, classify the following contract:


{unlabeled_text}


Please respond with only the predicted label from the class names list.


<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{pre_condition}"""
        
        input_texts.append(input_text)
        pdf_ids.append(pdf_id)


    # Generate outputs for each prompt individually
    for input_text, pdf_id in zip(input_texts, pdf_ids):
        output = generate(model, tokenizer, prompt=input_text)
        #print(output)
        results.append({
            'PDF_ID': pdf_id,
            #'OCR_Text': unlabeled_text,
            'Predicted_Category': output
        })


# Convert results to DataFrame if needed
results_df4 = pd.DataFrame(results)
#print(results_df)
results_df4.to_csv('batch_10.csv')

100%|████████████████████████████████████████| 32/32 [3:18:22<00:00, 371.95s/it]
