In [None]:
# 1. Set up Colab Environment
#    - Open a Colab notebook.
#    - Select a GPU runtime (Runtime > Change runtime type > T4 GPU).

# 2. Install Dependencies
#    - Install necessary libraries:
#        !pip install ollama

# 3. Install Ollama (within Colab)
#    - The Ollama installer may print warnings in Colab; installing these packages first should resolve them:
#        !sudo apt update && sudo apt install pciutils lshw
#    - Install Ollama:
#        !curl -fsSL https://ollama.com/install.sh | sh

# 4. Start Ollama Server (within Colab)
#    - Start the Ollama server in the background:
#        !nohup ollama serve > ollama.log 2>&1 &
#    -  Wait for the server to initialize (about 10-20 seconds).

In [None]:
# Install the Ollama Python client into the kernel's own environment.
# Pinned to the version this notebook was last run with so re-runs are reproducible.
%pip install ollama==0.4.7

Collecting ollama
  Downloading ollama-0.4.7-py3-none-any.whl.metadata (4.7 kB)
Downloading ollama-0.4.7-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.4.7


In [None]:
# Install the hardware-detection tools that the Ollama installer looks for.
# apt-get (stable scripting interface) with -y so the install never stops at a
# confirmation prompt in this non-interactive cell.
!sudo apt-get update && sudo apt-get install -y pciutils lshw

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [73.0 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,693 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3

In [None]:
# Download the install script from ollama.com and pipe it straight into sh.
# -f: fail on HTTP errors, -sS: quiet but still show errors, -L: follow redirects.
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> NVIDIA GPU installed.
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
# Start the Ollama server in the background; stdout and stderr go to ollama.log.
# Give it roughly 10-20 seconds to initialize before running the cells that use it.
!nohup ollama serve > ollama.log 2>&1 &

In [None]:
import ollama
import re
import subprocess
import csv
from typing import List, Tuple



In [None]:
def analyze_messages_with_model(conversation: str, ground_truth: str, model_name: str) -> Tuple[dict, list, list]:
    """
    Ask the model which messages show Mens Rea and which show Actus Reus,
    then score both selections against the ground truth.

    Two separate prompts are sent (Actus Reus first, then Mens Rea). Lines of
    the form ``[Message <n> - <Person>]: <text>`` are pulled out of each reply
    and passed to ``evaluate_model_output``.

    Args:
        conversation: The conversation text embedded into both prompts.
        ground_truth: The ground truth string the predictions are scored against.
        model_name: Name of the language model to query.

    Returns:
        A tuple ``(metrics, mens_rea_messages, actus_reus_messages)``:
        - metrics: whatever ``evaluate_model_output`` returns for the predictions.
        - mens_rea_messages: the stripped Mens Rea lines extracted from the reply.
        - actus_reus_messages: the stripped Actus Reus lines extracted from the reply.
        If anything raises along the way, the error is printed and
        ``({}, [], [])`` is returned instead.
    """
    try:
        guilty_mind_prompt = f"""
            You are an expert in criminal law and forensic analysis. Given a conversation related to a potential crime, your task is to extract messages based on the legal concept of "mens rea" (guilty mind).

            **Definitions:**

            * **Mens Rea (Guilty Mind):** This refers to the mental state of the perpetrator at the time the crime was committed. It encompasses the intention, knowledge, or recklessness that the person had when performing the act. In essence, it's about proving that the person knew what they were doing was wrong.

            **Input:**

            The input will be a conversation in the following format:

            {conversation}

            **Task:**

            1.  Analyze each message in the provided conversation.
            2.  Identify messages that indicate a guilty mind (mens rea) – messages that reveal intent, knowledge, or planning related to the crime.
            3.  Output the results in the following format.  Do not include any introductory or explanatory text, only the list of messages:

            Mens Rea:
            [Message xx - Person]: <Message text>
            [Message yy - Person]: <Message text>
            ...etc
            """

        guilty_act_prompt = f"""
            You are an expert in criminal law and forensic analysis. Given a conversation related to a potential crime, your task is to extract messages based on the legal concept of "actus reus" (guilty act).

            **Definitions:**

            * **Actus Reus (Guilty Act):** This refers to the physical act of committing a crime. It's the tangible, observable action that constitutes the criminal offense.

            **Input:**

            The input will be a conversation in the following format:

            {conversation}

            **Task:**

            1.  Analyze each message in the provided conversation.
            2.  Identify messages that describe the guilty act itself (actus reus) – messages that detail the actions taken to commit the crime or cover it up.
             3.  Output the results in the following format. Do not include any introductory or explanatory text, only the list of messages:

            Actus Reus:
            [Message xx - Person]: <Message text>
            [Message yy - Person]: <Message text>
            ...etc
            """

        # The Actus Reus query is issued before the Mens Rea query.
        actus_reus_reply = predict(model_name, guilty_act_prompt)
        mens_rea_reply = predict(model_name, guilty_mind_prompt)

        def extract_lines(pattern: str, reply: str) -> list:
            # Keep only the lines that look like "[Message <n> - <Person>]: <text>".
            return [hit.strip() for hit in re.findall(pattern, reply)]

        predicted_mens_rea = extract_lines(r"\[Message \d+ - \w+\]: [^\n]+", mens_rea_reply)
        predicted_actus_reus = extract_lines(r"\[Message \d+ - \w+]: [^\n]+", actus_reus_reply)

        metrics = evaluate_model_output(predicted_mens_rea, predicted_actus_reus, ground_truth)
        return metrics, predicted_mens_rea, predicted_actus_reus

    except Exception as e:
        # Best-effort: report the problem and hand back empty results.
        print(f"Error during analysis: {e}")
        return {}, [], []

In [None]:
def predict(model_name: str, prompt: str) -> str:
    """
    Send one user prompt to an Ollama model and return the reply text.

    Args:
        model_name: Name of the model to query.
        prompt: Text sent as the only (user-role) message of the chat.

    Returns:
        The ``content`` field of the ``message`` entry in the chat response.
    """
    user_turn = {"role": "user", "content": prompt}
    reply = ollama.chat(model=model_name, messages=[user_turn])
    message = reply["message"]
    return message["content"]


In [None]:
def load_dataset(dataset_path: str) -> Tuple[List[str], List[str]]:
    """
    Read the evaluation dataset from a CSV file with a header row.

    Args:
        dataset_path: Path of the CSV file. It must provide the columns
            "conversation" and "ground truth".

    Returns:
        A pair of equally long lists, in file order:
        - the values of the "conversation" column,
        - the values of the "ground truth" column.
    """
    print(f"Loading dataset from {dataset_path}")
    with open(dataset_path, mode="r", encoding="utf-8") as handle:
        records = list(csv.DictReader(handle))

    conversations = [record["conversation"] for record in records]
    references = [record["ground truth"] for record in records]
    return conversations, references

In [None]:
def load_model(model_name: str) -> None:
    """
    Download a model by running ``ollama pull <model_name>``.

    Args:
        model_name: Name of the model to pull (e.g., "gemma3:12b").

    Raises:
        subprocess.CalledProcessError: If the pull command exits with a
            non-zero status (the command is run with ``check=True``).
    """
    pull_command = ["ollama", "pull", model_name]
    print(f"Pulling model: {model_name}")
    subprocess.run(pull_command, check=True)
    print(f"Model '{model_name}' pulled successfully.")

In [None]:
def evaluate_model_output(predicted_mens_rea: list, predicted_actus_reus: list, ground_truth: str) -> dict:
    """
    Score the predicted Mens Rea and Actus Reus messages against the ground truth.

    The ground truth string is split into its two message lists with
    ``preprocess_ground_truth``; each prediction list is then compared with
    its counterpart via ``calculate_precision_recall_f1``.

    Args:
        predicted_mens_rea: Messages the model selected for Mens Rea.
        predicted_actus_reus: Messages the model selected for Actus Reus.
        ground_truth: The ground truth string.

    Returns:
        A dictionary with the keys "mens_rea" and "actus_reus", each holding
        the metrics computed for that category.
    """
    reference = preprocess_ground_truth(ground_truth)
    predictions = {
        "mens_rea": predicted_mens_rea,
        "actus_reus": predicted_actus_reus,
    }
    return {
        category: calculate_precision_recall_f1(predicted, reference[category])
        for category, predicted in predictions.items()
    }

In [None]:

def calculate_precision_recall_f1(predicted: list, ground_truth: list) -> dict:
    """
    Calculates precision, recall, and F1-score over unique messages.

    Messages are compared by exact string equality. Both lists are
    de-duplicated first, so a message the model repeats is counted once;
    counting repeats separately would push recall above 1.0.

    Args:
        predicted: A list of messages predicted by the model (hashable items).
        ground_truth: A list of messages from the ground truth (hashable items).

    Returns:
        A dictionary with the keys "precision", "recall" and "f1_score",
        each a float in [0.0, 1.0]. When both lists are empty all three are
        1.0; when exactly one of them is empty all three are 0.0.
    """
    if not predicted and not ground_truth:
        # Nothing to find and nothing reported: a perfect (vacuous) match.
        return {"precision": 1.0, "recall": 1.0, "f1_score": 1.0}

    if not predicted or not ground_truth:
        return {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}

    predicted_unique = set(predicted)
    truth_unique = set(ground_truth)
    true_positives = len(predicted_unique & truth_unique)

    # Both sets are non-empty here, so neither division can hit zero.
    precision = true_positives / len(predicted_unique)
    recall = true_positives / len(truth_unique)
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return {"precision": precision, "recall": recall, "f1_score": f1_score}

In [None]:
def preprocess_ground_truth(ground_truth: str) -> dict:
    """
    Preprocesses the ground truth string to extract Mens Rea and Actus Reus messages.

    Each section starts at a "Mens Rea:" / "Actus Reus:" header that is
    followed by a newline, and ends where the other header begins a line (or
    at the end of the text). The sections may therefore appear in either
    order, and an empty section never absorbs the messages of the section
    after it. Within a section, every ``[Message <n> - <Person>]: <text>``
    entry is collected and stripped of surrounding whitespace, so it compares
    equal to the stripped model predictions.

    Args:
        ground_truth: A string containing the ground truth data in the specified format.

    Returns:
        A dictionary with keys "mens_rea" and "actus_reus", each containing a list of messages.
        Returns empty lists if the sections are not found.
    """
    message_pattern = r"\[Message \d+ - \w+\]: [^\n]+"
    # (key in the result, header of this section, header that terminates it)
    sections = (
        ("mens_rea", "Mens Rea", "Actus Reus"),
        ("actus_reus", "Actus Reus", "Mens Rea"),
    )

    extracted = {"mens_rea": [], "actus_reus": []}
    for key, header, other_header in sections:
        # With MULTILINE, ^ also matches right after a newline, so the
        # lookahead stops at the other header even when this section is empty.
        section_match = re.search(
            rf"{header}:\n([\s\S]*?)(?=^{other_header}:|\Z)",
            ground_truth,
            re.MULTILINE,
        )
        if not section_match:
            continue
        section_text = section_match.group(1).strip()
        if section_text:
            extracted[key] = [
                message.strip() for message in re.findall(message_pattern, section_text)
            ]

    return extracted

In [None]:
def main(
    model_name: str = "gemma3:12b",
    dataset_path: str = "/content/true_positives_conversations (2).csv",
    output_csv_path: str = "gemma3_predictions.csv",
    start_index: int = 195,
) -> dict:
    """
    Main function to run the message analysis experiment.

    Pulls the model, loads the dataset, analyzes every conversation from
    ``start_index`` onwards, writes one CSV row per evaluated conversation,
    and prints the metrics averaged over the conversations that were
    actually evaluated.

    Args:
        model_name: Name of the Ollama model to pull and query.
        dataset_path: CSV file with the conversations and their ground truth.
        output_csv_path: CSV file the per-conversation predictions and
            metrics are written to. It is opened in write mode, so an
            existing file is overwritten.
        start_index: 0-based position of the first conversation to analyze.
            NOTE(review): the default of 195 preserves the previous
            hard-coded resume offset; pass 0 to process the whole dataset.

    Returns:
        A dictionary with the keys "mens_rea" and "actus_reus", each mapping
        "precision", "recall" and "f1_score" to the average over the
        evaluated conversations (all 0.0 if none was evaluated).
    """
    load_model(model_name)

    conversations, ground_truths = load_dataset(dataset_path)
    print("Dataset loaded successfully.")

    categories = ("mens_rea", "actus_reus")
    metric_names = ("precision", "recall", "f1_score")
    overall_results = {
        category: {metric: 0.0 for metric in metric_names} for category in categories
    }
    num_conversations = len(conversations)
    # A negative offset would silently select from the end of the lists.
    start_index = max(start_index, 0)
    evaluated = 0

    fieldnames = [
        "conversation_index",
        "conversation",
        "ground_truth",
        "mens_rea_prediction",
        "actus_reus_prediction",
        "mens_rea_precision",
        "mens_rea_recall",
        "mens_rea_f1",
        "actus_reus_precision",
        "actus_reus_recall",
        "actus_reus_f1",
    ]
    with open(output_csv_path, mode="w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        remaining = zip(conversations[start_index:], ground_truths[start_index:])
        # conversation_index is the 1-based position within the full dataset and
        # is used for the progress messages and the CSV column alike.
        for conversation_index, (conversation, ground_truth) in enumerate(remaining, start=start_index + 1):
            print(f"Analyzing conversation {conversation_index} of {num_conversations}:")
            results, mens_rea_output, actus_reus_output = analyze_messages_with_model(
                conversation, ground_truth, model_name
            )
            if not results:
                # The analysis reports failures as an empty dict; leave such a
                # conversation out of the CSV and the averages instead of crashing.
                print(f"Skipping conversation {conversation_index}: analysis returned no results.")
                continue

            print(f"Evaluation results for conversation {conversation_index}:")
            print(results)

            writer.writerow(
                {
                    "conversation_index": conversation_index,
                    "conversation": conversation,
                    "ground_truth": ground_truth,
                    "mens_rea_prediction": mens_rea_output,
                    "actus_reus_prediction": actus_reus_output,
                    "mens_rea_precision": results["mens_rea"]["precision"],
                    "mens_rea_recall": results["mens_rea"]["recall"],
                    "mens_rea_f1": results["mens_rea"]["f1_score"],
                    "actus_reus_precision": results["actus_reus"]["precision"],
                    "actus_reus_recall": results["actus_reus"]["recall"],
                    "actus_reus_f1": results["actus_reus"]["f1_score"],
                }
            )

            for category in categories:
                for metric in metric_names:
                    overall_results[category][metric] += results[category][metric]
            evaluated += 1

    # Average over what was actually evaluated, not over the whole dataset.
    if evaluated:
        for category in categories:
            for metric in metric_names:
                overall_results[category][metric] /= evaluated

    print("\nOverall Evaluation Results:")
    print(overall_results)
    return overall_results


if __name__ == "__main__":
    main()


Pulling model: gemma3:12b
Model 'gemma3:12b' pulled successfully.
Loading dataset from /content/true_positives_conversations (2).csv
Dataset loaded successfully.
Analyzing conversation 195 of 200:
Evaluation results for conversation 1:
{'mens_rea': {'precision': 0.6111111111111112, 'recall': 0.9166666666666666, 'f1_score': 0.7333333333333334}, 'actus_reus': {'precision': 0.2857142857142857, 'recall': 0.2222222222222222, 'f1_score': 0.25}}
Analyzing conversation 196 of 200:
Evaluation results for conversation 2:
{'mens_rea': {'precision': 0.1111111111111111, 'recall': 0.375, 'f1_score': 0.17142857142857143}, 'actus_reus': {'precision': 0.25, 'recall': 0.42857142857142855, 'f1_score': 0.3157894736842105}}
Analyzing conversation 197 of 200:
Evaluation results for conversation 3:
{'mens_rea': {'precision': 0.5833333333333334, 'recall': 0.30434782608695654, 'f1_score': 0.4}, 'actus_reus': {'precision': 0.5714285714285714, 'recall': 0.8, 'f1_score': 0.6666666666666666}}
Analyzing conversatio

In [None]:
def evalmetric_from_prestored_data(file_name):
  """
  Average the stored per-conversation metrics of a predictions CSV.

  Args:
      file_name: Path of a CSV file with the columns
          "<element>_precision", "<element>_recall" and "<element>_f1"
          for the elements "mens_rea" and "actus_reus".

  Returns:
      A dictionary with the keys "mens_rea" and "actus_reus", each mapping
      "precision", "recall" and "f1_score" to the column sum divided by
      the number of rows in the file.
  """
  import pandas as pd

  frame = pd.read_csv(file_name)
  row_count = len(frame)

  def column_average(column):
    # Plain Python sum over the column values, divided by the row count.
    return sum(frame[column].tolist()) / row_count

  return {
    element: {
      "precision": column_average(f"{element}_precision"),
      "recall": column_average(f"{element}_recall"),
      "f1_score": column_average(f"{element}_f1"),
    }
    for element in ("mens_rea", "actus_reus")
  }


In [None]:
# Average the per-conversation metrics stored in a previously exported predictions CSV.
# NOTE(review): this path is not the "gemma3_predictions.csv" that main() writes;
# presumably a manually uploaded/merged export — confirm it is the intended file.
results = evalmetric_from_prestored_data("/content/evaluations_gemma3 - gemma3_predictions.csv")

198


In [None]:
# Print the averaged metrics, one crime element at a time.
for crime_element, metrics in results.items():
  print("Crime element: {}\n Evaluation metrics:{}".format(crime_element, metrics))

Crime element: mens_rea
 Evaluation metrics:{'precision': 0.47789640839893954, 'recall': 0.588145181184343, 'f1_score': 0.5003866544429296}
Crime element: actus_reus
 Evaluation metrics:{'precision': 0.3813628229732829, 'recall': 0.5300570097368688, 'f1_score': 0.41793140176060606}
