# Evaluating Legal Models
LegalEase: Nick, Ben, Lisa, Devin

# Setup
### Install and import necessary packages:


In [None]:
# Installs
!pip install google-generativeai
!pip install transformers
!pip install torch
!pip install peft
!pip install PyPDF2
!pip install bitsandbytes
!pip install langchain-community
!pip install langchain-Chroma
!pip install accelerate
!pip install optimum
!pip install gptqmodel
!pip install auto-gptq
!pip install gpt4all

Collecting google-generativeai
  Downloading google_generativeai-0.8.4-py3-none-any.whl.metadata (4.2 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.161.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client->google-generativeai)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client->google-generativeai)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client->google-generativeai)
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting protobuf (from google-generativeai)
  Downloading protobuf-4.25.6-cp310-abi3-win_amd

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub>=0.25.0 (from peft)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
   ---------------------------------------- 0.0/374.8 kB ? eta -:--:--
   --------------------------------------  368.6/374.8 kB 11.6 MB/s eta 0:00:01
   --------------------------------------- 374.8/374.8 kB 11.8 MB/s eta 0:00:00
Downloading accelerate-1.4.0-py3-none-any.whl (342 kB)
   ---------------------------------------- 0.0/342.1 kB ? eta -:--:--
   --------------------------------------- 342.1/342.1 kB 20.7 MB/s eta 0:00:00
Downloading huggingface_hub-0.29.1-py3-none-any.whl (468 kB)
   ---------------------------------------- 0.0/468.0 kB ? eta -:--:--
   --------------------------------------- 468.0/468.0 kB 30.5 MB/s eta 0:00:00


Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.37 (from langchain-community)
  Downloading langchain_core-0.3.37-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.19 (from langchain-community)
  Downloading langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.0-py3-none-any.whl.metadata (3.5 kB)
Collecting langsmith<0.4,>=0.1.125 (from langchain-community)
  Downloading langsmith-0.3.10-py3-none-any.whl.metadata (14 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-commun

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
googletrans 4.0.0rc1 requires httpx==0.13.3, but you have httpx 0.28.1 which is incompatible.


In [None]:
# Imports
import pandas as pd
import time
import numpy as np
import PyPDF2
import json
import random
import os
import google.generativeai as genai
import warnings
warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import login

In [None]:
# Our modules
from model import generate_response, init_model
import rag
import chat_history

### Gather resources:

In [None]:
# Gathering Resources
import pdfplumber

# Read the API key from a local secrets file and hand it to the Gemini client.
with open('secrets.json', 'r') as file:
    secrets = json.load(file)

GOOGLE_API_KEY = secrets['GOOGLE_API_KEY']
genai.configure(api_key=GOOGLE_API_KEY)

# Concatenate the text of every page of the Q/A PDF into one string.
# `or ''` guards against extract_text() yielding None for a page without
# extractable text (behaviour depends on the pdfplumber version), which
# would otherwise raise a TypeError during concatenation.
with pdfplumber.open('full_qa_pairs.pdf') as pdf:
    qa_pairs = "".join(page.extract_text() or '' for page in pdf.pages)


In [None]:
# Normalise the markers so upper-case and title-case variants split the same way.
qa_pairs = qa_pairs.replace('QUESTION', 'Question').replace('ANSWER', 'Answer')

# Build [question, answer] lists with surrounding white space removed.
# The text before the first 'Question:' is not a pair, hence the [1:].
# maxsplit=1 keeps an answer intact even if it contains 'Answer:' itself.
qa_split = [
    [segment.strip() for segment in chunk.split('Answer:', 1)]
    for chunk in qa_pairs.split('Question:')[1:]
]

print('Pairs:', qa_split)

# Example Q_A

In [None]:
# Only uncomment if you want to see how the Q_A script currently runs.
#import Q_A

# Load Gemini

State-of-the-art (SOTA) model that serves as the benchmark against which we evaluate general response performance.

In [None]:
def query_gemini(user_input, use_rag, prompt, model_name='gemini-1.5-flash'):
    """
    Ask Gemini a single question and return its reply as text.

    user_input is the question sent to the llm.
    use_rag is a boolean: when true, rag.query_rag(user_input) is called and
        its context is inserted into the prompt; otherwise the context is ''.
    prompt is a template string formatted with the user_input, rag_context
        and chat_history fields; the result is used as the system instruction.
    model_name is the Gemini model identifier (defaults to the previously
        hard-coded 'gemini-1.5-flash').
    returns the response text with surrounding white space stripped.
    """
    rag_context = ''
    if use_rag:
        # query_rag also returns the sources, which are not needed here.
        rag_context, _sources = rag.query_rag(user_input)

    # Every call starts a fresh chat, so there is no history to include.
    # The empty string is passed inline to avoid shadowing the imported
    # chat_history module with a local variable.
    system_instruction = prompt.format(
        user_input=user_input,
        rag_context=rag_context,
        chat_history='',
    )

    gemini = genai.GenerativeModel(model_name, system_instruction=system_instruction)
    chat = gemini.start_chat()
    response = chat.send_message(user_input)
    return response.text.strip()

# Load Our Model
Mistral 7B model fine-tuned with QLoRA on Q/A pairs.

In [None]:
# Full prompt template (includes the few-shot examples).
with open("prompt.txt", "r") as prompt_file:
    prompt = prompt_file.read()

# Model and tokenizer used by query_model below.
model, tokenizer = init_model()

# Shortened prompt template.
with open('prompt_short.txt', 'r') as short_prompt_file:
    prompt_short = short_prompt_file.read()


def query_model(user_input, use_rag, prompt):
    """
    Ask our own model a single question and return its raw output.

    user_input is the question sent to the llm.
    use_rag is a boolean: when true, rag.query_rag(user_input) is called and
        its context is inserted into the prompt; otherwise the context is ''.
    prompt is a template string formatted with the user_input, rag_context
        and chat_history fields.
    returns whatever generate_response produces for the formatted prompt,
        using the module-level model and tokenizer.
    """
    rag_context = ''
    if use_rag:
        # query_rag also returns the sources, which are not needed here.
        rag_context, _sources = rag.query_rag(user_input)

    # Each call is independent, so the chat history is always empty.
    user_input_with_context = prompt.format(
        user_input=user_input,
        rag_context=rag_context,
        chat_history='',
    )
    return generate_response(user_input_with_context, model, tokenizer)

# Compare Gemini's Response to our Model's Response:

## On an Open Ended Question: 'How do i start a business?'
Gemini without RAG:

In [None]:
# Gemini, open-ended question, without RAG: time one call and show the reply.
open_question = 'How do i start a business?'
started_at = time.time()
response = query_gemini(open_question, False, prompt)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

Gemini with RAG:

In [None]:
# Gemini, open-ended question, with RAG: time one call and show the reply.
started_at = time.time()
response = query_gemini(open_question, True, prompt)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

In [None]:
# Our model, open-ended question, without RAG: time one call and show the reply.
started_at = time.time()
response = query_model(open_question, False, prompt)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

In [None]:
# Our model, open-ended question, with RAG: time one call and show the reply.
started_at = time.time()
response = query_model(open_question, True, prompt)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

## Compare on a closed Question: "Can I open a non-profit in Washington State if I am a registered felon?"

Gemini Without Rag:

In [None]:
# Gemini, closed question, without RAG: time one call and show the reply.
# The question text previously contained accidental typos ("mon-profit",
# "Washin gton", "fellon") that did not match the question being evaluated.
closed_question = "Can I open a non-profit in Washington State if I am a registered felon?"
start_time = time.time()
response = query_gemini(closed_question, False, prompt)
end_time = time.time()

print('Time elapsed:', end_time - start_time)
print('Response:', response)

In [None]:
# Gemini, closed question, with RAG: time one call and show the reply.
started_at = time.time()
response = query_gemini(closed_question, True, prompt)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

In [None]:
# Our model, closed question, without RAG: time one call and show the reply.
started_at = time.time()
response = query_model(closed_question, False, prompt)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

In [None]:
# Our model, closed question, with RAG: time one call and show the reply.
started_at = time.time()
response = query_model(closed_question, True, prompt)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

## Our Responses are looking Odd sometimes:

Our model is responding to the few-shot examples in our prompt for some reason. We are passing the instructions, the RAG context, and the few-shot examples all at once, which may be too much. We will remove the few-shot examples and see how this changes performance.

In [None]:
# Our model, closed question, short prompt, without RAG: time one call.
started_at = time.time()
response = query_model(closed_question, False, prompt_short)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

In [None]:
# Our model, closed question, short prompt, with RAG: time one call.
started_at = time.time()
response = query_model(closed_question, True, prompt_short)
elapsed = time.time() - started_at

print('Time elapsed:', elapsed)
print('Response:', response)

In [None]:
def _parse_similarity_verdict(raw_text):
    """Turn the evaluator's raw reply into 1 (similar) or 0 (not similar)."""
    try:
        return 1 if int(raw_text.strip()) == 1 else 0
    except ValueError:
        # Anything that is not a bare integer counts as "not similar".
        return 0


def run_through_questions(model_type, use_rag, prompt, num_questions=100):
    """
    Runs the given model through the first num_questions pairs of qa_split.

    model_type is 'g' to query Gemini; any other value queries our model.
    use_rag is a boolean passed through to the query function.
    prompt is the prompt template passed through to the query function.
    num_questions is how many pairs to evaluate (defaults to 100).

    For every pair two scores are collected:
      * the TF-IDF cosine similarity between the query and the response;
      * a 0/1 verdict from a Gemini evaluator asked whether the reference
        answer and the response are similar in content.

    returns (average cosine similarity, average content similarity); each
    average is 0 when no pair was evaluated.
    """
    scores = []
    content_sim = []
    # The evaluator does not depend on the pair, so build it once.
    evaluator = genai.GenerativeModel('gemini-1.5-flash')

    for pair in qa_split[:num_questions]:
        if len(pair) < 2:
            # A question without an answer cannot be scored; skip it.
            continue
        query, answer = pair[0], pair[1]

        if model_type == 'g':
            response = query_gemini(query, use_rag, prompt)
            # Gemini is queried twice per pair (response + evaluation) and we
            # cannot exceed 15 requests a minute.
            time.sleep(8.01)
        else:
            response = query_model(query, use_rag, prompt)
            # Keep only the text after the '**Response**' marker, if present.
            parts = response.split('**Response**')
            if len(parts) > 1:
                response = parts[1]

        # Vectorize the query and response, then compare them.
        tfidf_matrix = TfidfVectorizer().fit_transform([query, response])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        scores.append(similarity[0][0])

        # Ask the evaluator whether answer and response are similar in content.
        directions = (
            'You are an evaluator. You need to determine if the following '
            'paragraphs are similar in content.\n'
            'Paragraph 1: ' + answer + '\n'
            'Paragraph 2: ' + response + '\n'
            'Return 1 if they are similar, or 0 if they are not. '
            'Only return a 1 or 0.'
        )
        reply = evaluator.start_chat().send_message(directions)
        try:
            verdict_text = reply.text
        except ValueError:
            # NOTE(review): reply.text is expected to raise ValueError when
            # the reply carries no text (e.g. a blocked response) - confirm
            # against the SDK version in use.
            verdict_text = ''
        content_sim.append(_parse_similarity_verdict(verdict_text))

    # Calculate the averages
    avg_similarity = sum(scores) / len(scores) if scores else 0
    avg_similarity_content = sum(content_sim) / len(content_sim) if content_sim else 0
    return avg_similarity, avg_similarity_content



In [None]:
# Run both models with and without RAG. Each run returns a tuple of
# (average cosine similarity, average content similarity).
gem_wo = run_through_questions('g', False, prompt_short)
print('Gemini W/O Rag:')
print('Cosine Similarity:', gem_wo[0], 'Content_similarity:', gem_wo[1])

gem_w = run_through_questions('g', True, prompt_short)
print('Gemini W Rag:')
print('Cosine Similarity:', gem_w[0], 'Content_similarity:', gem_w[1])

m_wo = run_through_questions('m', False, prompt_short)
print('Model W/O Rag:')
print('Cosine Similarity:', m_wo[0], 'Content_similarity:', m_wo[1])

# Report the with-RAG run under its own label and with its own results
# (previously this repeated the W/O Rag label and the m_wo values).
m_w = run_through_questions('m', True, prompt_short)
print('Model W Rag:')
print('Cosine Similarity:', m_w[0], 'Content_similarity:', m_w[1])
             
