<a href="https://colab.research.google.com/github/ninenine-9/legaldetainment/blob/main/v2_fewshot_COT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🛜 GitHub connection

## For the 1st time

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Navigate to your desired projects folder in Google Drive
# (You can change 'Colab_Projects' to whatever you like)
%cd /content/drive/MyDrive/Colab_Projects/

# Clone the repository directly into this folder
!git clone https://github.com/ninenine-9/legaldetainment.git

## Every other time

In [None]:
# Subsequent sessions: re-mount Google Drive, enter the project folder and sync it.
from google.colab import drive
drive.mount('/content/drive')

# Navigate directly to your project folder inside Google Drive
%cd /content/drive/MyDrive/Colab_Projects/legaldetainment/

# Pull the latest changes from your GitHub repo
# NOTE(review): the data and results cells further down use absolute paths under
# /content/legaldetainment/, not this Drive folder — confirm which copy is intended.
!git pull

# Imports

In [None]:
# Install required libraries
!pip install -q -U transformers bitsandbytes accelerate

import pandas as pd
import time
import os
from google.colab import drive
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import json

print("✔️ Imports completed")

# Model loading
_(approx 4 mins)_

In [None]:
# --- Load the tokenizer and the 4-bit quantized LLM ---

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# The tokenizer does not depend on the quantization settings, so fetch it first.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit NF4 weights with double quantization; computation runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# device_map="auto" requests automatic placement of the model on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

print("✔️ LLM and Tokenizer loaded successfully.")

# Data

In [None]:
# # Data retrieval
# from google.colab import drive

# %cd /content

# !git clone https://github.com/ninenine-9/legaldetainment.git

# drive.mount('/content/drive', force_remount=True)

# !cp -r /content/legaldetainment /content/drive/MyDrive/

# print("✔️ Git Repository successfully cloned")

# # with open("/content/legaldetainment/INPUTS/legaldetainment_story.md", "r", encoding="utf-8") as f:
# #     study_context = f.read()

# # with open("/content/legaldetainment/INPUTS/legaldetainment_codinginstructions.md", "r", encoding="utf-8") as f:
# #     coding_manual = f.read()

# Data loading
#
# Reads the blank (to-be-coded) sheet and the human-coded sheet, normalises their
# column names, and keeps only the participant id, the free text and the five codes.
#
# NOTE(review): both workbooks are read from /content/legaldetainment/, whereas the
# GitHub-connection cells clone into /content/drive/MyDrive/Colab_Projects/ — confirm
# that a copy of the repository really exists at this path.

CODING_COLUMNS = ["GuiltMoreLikely",
        "GuiltLessLikely",
        "NoInformation_Evidence",
        "InnocentUntilProvenGuilty",
        "Confound"]
KEPT_COLUMNS = ["Pno", "Qual"] + CODING_COLUMNS

# Header spellings that differ between the workbook and the codebook.
# `rename` ignores keys that are absent, so this is safe for both sheets.
COLUMN_RENAMES = {
    "NoInformation/Evidence": "NoInformation_Evidence",
    "Unclassified/Other": "Unclassified_Other"
}

# Number of participants kept from each sheet (1 = quick smoke test).
N_PARTICIPANTS = 1

blank_df = pd.read_excel("/content/legaldetainment/DATA/legaldetainment_blankdata.xlsx", sheet_name = '30participants') # 📋 for cell H & G

# Strip header whitespace BEFORE any column is addressed by name.
blank_df.columns = blank_df.columns.str.strip()
blank_df = blank_df.rename(columns=COLUMN_RENAMES)
blank_df = blank_df.dropna(subset=["Pno"])
blank_df = blank_df[KEPT_COLUMNS].head(N_PARTICIPANTS)

print(f"Dataset size: {blank_df.shape}")

humandata = pd.read_excel("/content/legaldetainment/DATA/legaldetainment_humandata.xlsx", sheet_name = '30participants') # 📋 for cell I

humandata.columns = humandata.columns.str.strip()
humandata = humandata.rename(columns=COLUMN_RENAMES)
humandata = humandata.dropna(subset=["Pno"])
humandata = humandata[KEPT_COLUMNS].head(N_PARTICIPANTS).copy()

# Recode 0/1 to "No"/"Yes" in the code columns ONLY. A frame-wide replace would
# also turn participant number 1 into "Yes" and rewrite "0"/"1" free-text answers.
humandata[CODING_COLUMNS] = humandata[CODING_COLUMNS].replace(
    {0: "No", 1: "Yes", "0": "No", "1": "Yes"}
)

print("✔️ Data loading done")

# Codebook

In [None]:
# LLM-Adapted Codebook
# This structured list of dictionaries translates the human-centric coding instructions into a
# machine-readable format. It is the foundation for building high-performance prompts.
#
# Every entry has the same eight keys:
#   Code_Name, Simple_Definition,
#   Inclusion_Criteria / Exclusion_Criteria  (one multi-line string, already bulleted with "- "),
#   Positive_Exemplar_Text / Positive_Exemplar_Explanation,
#   Negative_Exemplar_Text / Negative_Exemplar_Explanation.

llm_codebook = [
    {
        "Code_Name": "GuiltMoreLikely",
        "Simple_Definition": "This code applies when the participant concludes the defendant is more likely to be guilty because they infer the judge's decision to *deny bail* was based on secret, negative information (like a criminal history).",
        "Inclusion_Criteria": """- The text must connect the judge's decision to **deny bail** (or keep the defendant detained) to a higher likelihood of guilt.
- The text must state or imply that the judge's decision is based on extra information that the participant does not have (e.g., 'the judge must know something,' 'he probably has a record').""",
        "Exclusion_Criteria": """- DO NOT apply if the participant mentions the judge *granting* bail.
- DO NOT apply if the participant simply says the defendant is guilty without linking it to the judge's bail decision.
- DO NOT apply if the participant says there is not enough information to make a judgment (this would be NoInformation_Evidence).
- DO NOT apply if the participant's reasoning is about the judge being biased (this would be Confound).""",
        "Positive_Exemplar_Text": "The judge wouldn't have kept him in jail if there wasn't something else going on. He probably has a criminal history, so I think he's more likely to be guilty.",
        "Positive_Exemplar_Explanation": "This meets all criteria: it mentions detainment, infers the judge knows about a 'criminal history,' and concludes guilt is more likely.",
        "Negative_Exemplar_Text": "He was denied bail, so the judge is probably biased against him now.",
        "Negative_Exemplar_Explanation": "This is a hard negative. It mentions bail denial but focuses on future judge bias, which fits the `Confound` code, not the inference of secret evidence required for `GuiltMoreLikely`."
    },
    {
        "Code_Name": "GuiltLessLikely",
        "Simple_Definition": "This code applies when the participant concludes the defendant is less likely to be guilty because they infer the judge's decision to *grant bail* was based on secret, positive information (like a solid alibi).",
        "Inclusion_Criteria": """- The text must connect the judge's decision to **grant bail** (or release the defendant) to a lower likelihood of guilt.
- The text must state or imply that the judge's decision is based on extra information that the participant does not have (e.g., 'the judge must have seen the alibi').""",
        "Exclusion_Criteria": """- DO NOT apply if the participant mentions the judge *denying* bail.
- DO NOT apply if the participant simply says the defendant is innocent without linking it to the judge's bail decision.
- DO NOT apply if the participant says there is not enough information to make a judgment (this would be NoInformation_Evidence).""",
        "Positive_Exemplar_Text": "If the judge let him go, he must have a solid alibi or something. I'd say he's less likely to be guilty.",
        "Positive_Exemplar_Explanation": "This meets all criteria: it mentions release, infers the judge knows about a 'solid alibi,' and concludes guilt is less likely.",
        "Negative_Exemplar_Text": "It's good he got bail, but we should remember he is innocent until proven guilty anyway.",
        "Negative_Exemplar_Explanation": "This is a hard negative. Although it mentions bail, the core reasoning invokes the legal principle of `InnocentUntilProvenGuilty`, which is a different code."
    },
    {
        "Code_Name": "NoInformation_Evidence",
        "Simple_Definition": "This code applies when the participant explicitly states that they cannot make a judgment because there is not enough information or evidence provided.",
        "Inclusion_Criteria": """- The text must contain phrases that explicitly state a lack of information or evidence, such as 'not enough information,' 'no evidence,' 'can't decide based on this,' or 'need more facts.'""",
        "Exclusion_Criteria": """- DO NOT apply if the participant makes a judgment about guilt or innocence, even if they express some uncertainty (e.g., 'I guess he's guilty, but it's hard to say').
- DO NOT apply if the participant is simply confused about the question itself; the focus must be on the lack of case evidence.""",
        "Positive_Exemplar_Text": "Based on this, I can't say. There's not enough evidence here to make a call.",
        "Positive_Exemplar_Explanation": "This is a direct statement about the lack of evidence preventing a judgment.",
        "Negative_Exemplar_Text": "I lean towards him being guilty, but I don't really have enough information.",
        "Negative_Exemplar_Explanation": "This is a hard negative because even though the participant mentions a lack of information, they still make a judgment ('lean towards him being guilty'), so this code does not apply."
    },
    {
        "Code_Name": "InnocentUntilProvenGuilty",
        "Simple_Definition": "This code applies when the participant references the legal principle that a person is considered innocent until their guilt is proven in court, regardless of other factors.",
        "Inclusion_Criteria": """- The text must explicitly use the phrase 'innocent until proven guilty' or a very close variation (e.g., 'innocent till guilty').
- OR, the text must clearly express the same sentiment, such as 'we shouldn't prejudge him,' or 'none of this matters until the trial proves he did it.'""",
        "Exclusion_Criteria": """- DO NOT apply if the participant simply says the defendant is 'innocent' without referencing the legal principle or the idea of waiting for proof at trial.""",
        "Positive_Exemplar_Text": "It doesn't matter what the judge decided about bail, he is innocent until proven guilty.",
        "Positive_Exemplar_Explanation": "This is a perfect example as it uses the exact key phrase to state the legal principle.",
        "Negative_Exemplar_Text": "I think he's innocent.",
        "Negative_Exemplar_Explanation": "This is a hard negative because it states an opinion on innocence but does not invoke the specific legal principle of 'innocent until proven guilty'."
    },
    {
        "Code_Name": "Confound",
        "Simple_Definition": "This code applies when the participant expresses concern about the fairness of the main trial because the same judge will be involved and may now be biased due to the bail decision.",
        "Inclusion_Criteria": """- The text must state that the **same judge** from the bail hearing will also preside over the main trial.
- AND/OR the text must state that the judge might now be **biased** (either for or against the defendant) as a direct result of the bail decision.""",
        "Exclusion_Criteria": """- DO NOT apply for general statements about the legal system being unfair. The comment must be specific to the judge's potential bias from this specific situation.
- DO NOT apply if the participant thinks the judge has secret information but doesn't mention bias (this would be `GuiltMoreLikely` or `GuiltLessLikely`).""",
        "Positive_Exemplar_Text": "Since it's the same judge for the trial, he might be biased against the defendant now that he's denied him bail.",
        "Positive_Exemplar_Explanation": "This meets both potential criteria: it mentions the 'same judge' and explicitly states the risk of bias.",
        "Negative_Exemplar_Text": "Because each judge is new to the case they will have fresh eyes",
        # Explanation written for THIS exemplar; it previously repeated the
        # InnocentUntilProvenGuilty explanation, which does not describe this text.
        "Negative_Exemplar_Explanation": "This is a hard negative. It talks about judges and impartiality, but it says each judge is new to the case, so it raises neither the same-judge concern nor bias caused by the bail decision that `Confound` requires."
    }
]

# 🚨 Prompt generation

In [None]:
# PROBABLY MAIN PROBLEM

def construct_high_performance_prompt(text_to_analyze, code_entry):
    """
    Constructs a detailed, few-shot prompt with Chain-of-Thought examples
    for a single code classification task.

    Args:
        text_to_analyze (str): The participant's text to be coded.
        code_entry (dict): A dictionary from the llm_codebook for a single code.
            Must provide 'Code_Name', 'Simple_Definition', 'Inclusion_Criteria',
            'Exclusion_Criteria', and the positive/negative exemplar text and
            explanation keys. The criteria may be a ready-made (already bulleted)
            string or a list of individual criteria.

    Returns:
        str: A fully formatted prompt ready for the LLM.

    Raises:
        KeyError: If code_entry lacks one of the required keys.
    """

    def _bulleted(criteria):
        """Return criteria as a bullet block; a string is used exactly as written."""
        # Iterating a string yields single characters, which would produce one
        # bullet per character — so only real sequences are joined.
        if isinstance(criteria, str):
            return criteria.strip()
        return "\n".join(f"- {item}" for item in criteria)

    code_name = code_entry['Code_Name']
    definition = code_entry['Simple_Definition']
    positive_text = code_entry['Positive_Exemplar_Text']
    negative_text = code_entry['Negative_Exemplar_Text']

    inclusion_criteria_str = _bulleted(code_entry['Inclusion_Criteria'])
    exclusion_criteria_str = _bulleted(code_entry['Exclusion_Criteria'])

    # --- Construct the Chain-of-Thought reasoning for the few-shot examples ---
    # This is where we "teach" the model how to think by showing our work.
    def _worked_example(exemplar_text, explanation, meets, conclusion, decision):
        """Build the example answer as a valid JSON string for one exemplar."""
        reasoning = (
            f"Step 1: The code '{code_name}' is about '{definition}'. "
            f"Step 2: The provided text is '{exemplar_text}'. "
            # The explanations already end with a period; strip it to avoid "..".
            f"Step 3: I will evaluate this against the criteria. The text {meets} the inclusion criteria because {explanation.rstrip('.')}. "
            f"Step 4: Therefore, the code {conclusion}."
        )
        # json.dumps escapes quotes/newlines, so the example the model imitates
        # is always parseable JSON.
        return json.dumps({"reasoning": reasoning, "decision": decision},
                          indent=2, ensure_ascii=False)

    positive_output = _worked_example(
        positive_text, code_entry['Positive_Exemplar_Explanation'],
        "meets", "applies", "APPLIES",
    )
    negative_output = _worked_example(
        negative_text, code_entry['Negative_Exemplar_Explanation'],
        "does not meet", "does not apply", "DOES NOT APPLY",
    )

    # --- Assemble the final prompt using an f-string and XML tags for structure ---
    # XML tags help the model differentiate between instructions, examples, and the task.
    prompt = f"""
<Role_and_Task_Definition>
You are an expert qualitative research analyst. Your task is to determine if the code '{code_name}' should be applied to the text provided in the <Text_to_Analyze> section.
You must first provide a step-by-step reasoning for your decision within the 'reasoning' field of a JSON object. After your reasoning, you must make a final binary decision ('APPLIES' or 'DOES NOT APPLY') in the 'decision' field.
</Role_and_Task_Definition>

<Codebook_Information>
<Code_Name>{code_name}</Code_Name>
<Simple_Definition>{definition}</Simple_Definition>
<Inclusion_Criteria>
{inclusion_criteria_str}
</Inclusion_Criteria>
<Exclusion_Criteria>
{exclusion_criteria_str}
</Exclusion_Criteria>
</Codebook_Information>

<Few-Shot_Exemplars>
<Positive_Example>
<Input_Text>{positive_text}</Input_Text>
<Output>
{positive_output}
</Output>
</Positive_Example>

<Negative_Example>
<Input_Text>{negative_text}</Input_Text>
<Output>
{negative_output}
</Output>
</Negative_Example>
</Few-Shot_Exemplars>

<Input_Data_and_Output_Schema>
<Text_to_Analyze>
{text_to_analyze}
</Text_to_Analyze>
<Final_Output>
Provide your final answer in a single, valid JSON object using the exact format shown in the examples above.
</Final_Output>
</Input_Data_and_Output_Schema>
"""
    return prompt

In [None]:
# alternative 1
def construct_simple_prompt(text_to_analyze, code_entry):
    """
    Constructs a simplified, zero-shot prompt that asks for a direct JSON output
    without complex examples or pre-defined reasoning steps.

    Args:
        text_to_analyze (str): The participant's text to be coded.
        code_entry (dict): A dictionary from the llm_codebook for a single code.
            Must provide 'Code_Name', 'Simple_Definition', 'Inclusion_Criteria'
            and 'Exclusion_Criteria'. The criteria may be a ready-made (already
            bulleted) string or a list of individual criteria.

    Returns:
        str: A simplified, formatted prompt.

    Raises:
        KeyError: If code_entry lacks one of the required keys.
    """

    def _bulleted(criteria):
        """Return criteria as a bullet block; a string is used exactly as written."""
        # Iterating a string yields single characters, which would produce one
        # bullet per character — so only real sequences are joined.
        if isinstance(criteria, str):
            return criteria.strip()
        return "\n".join(f"- {item}" for item in criteria)

    code_name = code_entry['Code_Name']
    # Insert the definition text itself, not the repr of the whole codebook entry.
    definition = code_entry['Simple_Definition']
    inclusion_criteria_str = _bulleted(code_entry['Inclusion_Criteria'])
    exclusion_criteria_str = _bulleted(code_entry['Exclusion_Criteria'])

    prompt = f"""
You are an expert qualitative research analyst. Your task is to determine if the code '{code_name}' should be applied to the text provided below.

<Codebook_Information>
<Code_Name>{code_name}</Code_Name>
<Simple_Definition>{definition}</Simple_Definition>
<Inclusion_Criteria>
{inclusion_criteria_str}
</Inclusion_Criteria>
<Exclusion_Criteria>
{exclusion_criteria_str}
</Exclusion_Criteria>
</Codebook_Information>

<Text_to_Analyze>
{text_to_analyze}
</Text_to_Analyze>

Based on the codebook and the text, provide your final answer in a single, valid JSON object with one key, "decision", which must have a value of either "APPLIES" or "DOES NOT APPLY". Do not add any other text before or after the JSON object.

Your output must be only the JSON object.
"""
    return prompt

# LLM inferencing

In [None]:
# PROBLEM
def get_llm_classification(prompt, model, tokenizer):
    """
    Sends a prompt to the LLM and gets a classification decision.

    Args:
        prompt (str): The detailed prompt for the model.
        model: The loaded Hugging Face model.
        tokenizer: The loaded Hugging Face tokenizer.

    Returns:
        dict: The first JSON object found in the model's reply (expected to hold
            'decision' and optionally 'reasoning'). If the reply contains no '{'
            at all, returns {'decision': 'PARSE_ERROR', 'reasoning': <reply>};
            if it contains '{' but nothing that parses, returns
            {'decision': 'JSON_DECODE_ERROR', 'reasoning': <reply>}.
    """
    # NOTE(review): the prompt is tokenized as-is. Instruction-tuned checkpoints
    # usually expect their chat template (tokenizer.apply_chat_template) — confirm
    # whether passing the raw prompt is intended.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Number of prompt tokens, used below to cut the prompt off the output.
    input_length = inputs.input_ids.shape[-1]

    # Greedy decoding. `temperature` only takes effect when sampling is enabled,
    # so the former `temperature=0.1` (without do_sample=True) had no effect;
    # do_sample=False states the deterministic behaviour explicitly.
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,  # Limit the length of the generated response
        do_sample=False
    )

    # Decode only the newly generated tokens after the input prompt
    response_text = tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)

    # --- Robust JSON Parsing ---
    # Try every '{' as a possible start and accept the first span that decodes.
    # raw_decode stops at the end of the object, so commentary (or further braces)
    # after the JSON no longer breaks parsing the way a first-'{'-to-last-'}'
    # slice does.
    brace_index = response_text.find('{')
    if brace_index == -1:
        return {"decision": "PARSE_ERROR", "reasoning": response_text}

    decoder = json.JSONDecoder()
    while brace_index != -1:
        try:
            parsed, _ = decoder.raw_decode(response_text, brace_index)
        except json.JSONDecodeError:
            brace_index = response_text.find('{', brace_index + 1)
            continue
        return parsed

    return {"decision": "JSON_DECODE_ERROR", "reasoning": response_text}

In [None]:
# Run every codebook entry against every participant and save the results.
#
# Define your output path once at the beginning
# p_amount = int(len(blank_df))
# csv_name = (f"{p_amount}_{model_name[:4]}_v2")
output_path = "/content/legaldetainment/RESULTS/simpletest.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

results = []
total_participants = len(blank_df)
start_time = time.time()

# The main loop iterates through each participant (row) in your blank_df.
# `position` is a 1-based running counter; the DataFrame's own index label is not
# used for progress because it keeps gaps left by dropna() and need not be an int.
for position, (_, row) in enumerate(blank_df.iterrows(), start=1):
    participant_id = row['Pno']
    text_to_analyze = row['Qual']

    # The inner loop processes each code for the current participant
    for code_entry in llm_codebook:
        code_name = code_entry['Code_Name']

        # 1. Construct the prompt
        prompt = construct_simple_prompt(text_to_analyze, code_entry)

        # 2. Get the LLM's classification
        llm_output = get_llm_classification(prompt, model, tokenizer)

        # 3. Append the result dictionary to the list; 'ERROR' marks a missing key
        results.append({
            'Pno': participant_id,
            'code_name': code_name,
            'llm_decision': llm_output.get('decision', 'ERROR'),
            'llm_reasoning': llm_output.get('reasoning', 'ERROR')
            #... add other fields you want to save...
        })

    # --- INCREMENTAL SAVE STEP ---
    # After all codes for the current participant are processed, save all results so far.
    # This overwrites the file with the most up-to-date results.
    output_df = pd.DataFrame(results)
    output_df.to_csv(output_path, index=False)

    # Optional: Print a progress update
    print(f"✅ Progress saved. Completed participant {participant_id} ({position}/{total_participants}).")

end_time = time.time()
elapsed = end_time - start_time
print(f"\n✅ Full classification completed in {elapsed:.2f} seconds.")
print(f"✔️ Final results saved to {output_path}")

## Output cleaned

In [None]:
import pandas as pd

# Write a copy of the raw results CSV that holds only the four reporting columns.
csv_path = "/content/legaldetainment/RESULTS/simpletest.csv"

try:
    results_df = pd.read_csv(csv_path)

    # Diagnostic: list the columns the file really contains, so that a KeyError
    # raised by the selection below is easy to trace.
    print(
        "Actual columns found in your CSV file:",
        results_df.columns.tolist(),
        "\n" + "="*50 + "\n",
        sep="\n",
    )

    # Columns carried over into the clean table.
    columns_to_keep = ['Pno', 'code_name', 'llm_decision', 'llm_reasoning']
    clean_df = results_df[columns_to_keep]

    print("Created a clean DataFrame with the desired columns:")
    display(clean_df.head())

    # Save the trimmed table next to the raw one.
    clean_csv_path = "/content/legaldetainment/RESULTS/clean_simpletest.csv"
    clean_df.to_csv(clean_csv_path, index=False)

    print(f"\n✅ Successfully saved the clean data to: {clean_csv_path}")

except FileNotFoundError:
    # The source CSV does not exist at csv_path.
    print(f"ERROR: File not found at '{csv_path}'. Please double-check the file name and path.")
except KeyError as e:
    # At least one entry of columns_to_keep is missing from the file.
    print(f"ERROR: A column was not found. Please check the 'Actual columns' list above and ensure the names in 'columns_to_keep' are correct. Details: {e}")
except Exception as e:
    # Anything else is reported instead of stopping the notebook.
    print(f"An error occurred: {e}")

In [None]:
import pandas as pd
import re
import json
from sklearn.metrics import precision_recall_fscore_support

# --- Step 1: Define the Robust JSON Extraction Function ---
def extract_json_from_text(text):
    """
    Finds and parses the first valid JSON object within a string.

    Args:
        text (str): A string that may contain a JSON object. Non-string values
            (e.g. NaN read from an empty CSV field) are converted with str().

    Returns:
        dict: The parsed JSON object, or None if no valid JSON is found.
    """
    haystack = str(text)
    decoder = json.JSONDecoder()

    # Try each '{' as a candidate start. raw_decode parses exactly one JSON value
    # and ignores what follows, so nested objects are handled and later braces in
    # the text cannot invalidate an earlier, valid object (which a greedy
    # first-'{'-to-last-'}' match would do).
    start = haystack.find('{')
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(haystack, start)
            return parsed
        except json.JSONDecodeError:
            start = haystack.find('{', start + 1)

    return None # No parseable JSON object found


# --- Step 2: Load Your Saved Data and Apply the Salvage Operation ---
# Re-parses the raw model text stored in 'llm_reasoning', keeps the rows where a
# JSON object with a 'decision' key can be recovered, and scores those rows
# against the human coding held in the notebook-level `humandata` frame.

# ⚠️ UPDATE THIS PATH to your actual saved CSV file
csv_path = "/content/legaldetainment/RESULTS/salvaged.csv"
results_df = pd.read_csv(csv_path)

print("Re-processing saved results with robust JSON extractor...")

# Apply the new extractor to the 'llm_reasoning' column to find the JSON
salvaged_data = []
for index, row in results_df.iterrows():
    parsed_json = extract_json_from_text(row['llm_reasoning'])
    # Check if JSON was found and if it contains the 'decision' key
    if parsed_json and 'decision' in parsed_json:
        salvaged_data.append({
            'Pno': row['Pno'],
            'code_name': row['code_name'],
            'llm_decision_raw': parsed_json['decision'],
            # NOTE(review): the classification loop in this notebook writes only
            # Pno / code_name / llm_decision / llm_reasoning; this lookup raises
            # KeyError unless the CSV loaded above has a 'human_code' column — confirm.
            'human_code': row['human_code']
        })

if not salvaged_data:
    print("\nCould not salvage any valid JSON from the saved results. This indicates a deeper issue with the model's output format.")
else:
    salvaged_df = pd.DataFrame(salvaged_data)

    print(f"\nSuccessfully salvaged {len(salvaged_df)} classifications!")

    # --- Step 3: Run the Evaluation on the SALVAGED Data ---

    # Convert 'APPLIES'/'DOES NOT APPLY' to 1/0
    # (any value other than the exact string 'APPLIES' is counted as 0)
    salvaged_df['llm_decision_numeric'] = salvaged_df['llm_decision_raw'].apply(lambda x: 1 if x == 'APPLIES' else 0)

    # Reshape to "wide" format to match humandata: one row per Pno, one column per code.
    # NOTE(review): pivot fails if a (Pno, code_name) pair occurs more than once,
    # e.g. when the CSV holds several runs — confirm the file is de-duplicated.
    model_coded_wide_df = salvaged_df.pivot(index='Pno', columns='code_name', values='llm_decision_numeric')
    model_coded_wide_df.reset_index(inplace=True)
    model_coded_wide_df = model_coded_wide_df.rename_axis(None, axis=1)

    # Merge with Human Data (assuming 'humandata' is loaded and has "Yes"/"No")
    humandata_numeric = humandata.replace({"Yes": 1, "No": 0})
    # Columns present in both frames receive the '_human' / '_model' suffixes.
    eval_df = pd.merge(humandata_numeric, model_coded_wide_df, on='Pno', suffixes=('_human', '_model'))

    # --- Step 4: Calculate and Print Performance Metrics ---
    # Labels of all codes are pooled in these two lists for the overall scores.
    all_human_labels = []
    all_model_labels = []
    code_columns = [col for col in model_coded_wide_df.columns if col!= 'Pno']

    print("\n" + "="*60)
    print("Per-Code Performance Metrics (on SALVAGED results):")
    print("="*60)

    for code in code_columns:
        human_col = f"{code}_human"
        model_col = f"{code}_model"
        # Codes lacking either the human or the model column are skipped.
        if human_col in eval_df.columns and model_col in eval_df.columns:
            # Drop participants for whom either side has no value for this code.
            valid_evals = eval_df[[human_col, model_col]].dropna()
            all_human_labels.extend(valid_evals[human_col].tolist())
            all_model_labels.extend(valid_evals[model_col].tolist())

            precision, recall, f1, _ = precision_recall_fscore_support(
                valid_evals[human_col], valid_evals[model_col],
                average='binary', pos_label=1, zero_division=0
            )
            print(f"\nMetrics for code: '{code}'")
            print(f"  Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

    print("\n" + "="*60)
    print("Overall Performance Metrics (on SALVAGED results):")
    print("="*60)

    # Overall scores are computed on the labels pooled across all codes.
    if all_human_labels:
        overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(
            all_human_labels, all_model_labels,
            average='binary', pos_label=1, zero_division=0
        )
        print(f"Overall Precision: {overall_precision:.3f}")
        print(f"Overall Recall:    {overall_recall:.3f}")
        print(f"Overall F1-Score:  {overall_f1:.3f}")
    else:
        print("No valid data to calculate overall metrics.")
    print("="*60)

## Performance?

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# --- Load Your Saved Data ---
# Scores the saved model decisions against the human coding in `humandata`
# (precision / recall / F1 pooled over all codes).
# Make sure to update this path to your actual CSV file
csv_path = "/content/legaldetainment/RESULTS/latesttest.csv"
results_df = pd.read_csv(csv_path)

print(f"Loaded {len(results_df)} total classifications.")

# --- 1. Filter out the rows where the model failed to produce valid JSON ---
# Markers written when no usable decision was obtained: the two parser failures
# returned by get_llm_classification and the loop's 'ERROR' default.
# (`.isin()` without an argument raises TypeError, so the list is required.)
ERROR_DECISIONS = ["PARSE_ERROR", "JSON_DECODE_ERROR", "ERROR"]

# The classification loop saves decisions as 'llm_decision'; older/salvaged files
# use 'llm_decision_raw'. Prefer the latter when present, otherwise fall back.
decision_column = 'llm_decision_raw' if 'llm_decision_raw' in results_df.columns else 'llm_decision'

successful_results_df = results_df[~results_df[decision_column].isin(ERROR_DECISIONS)].copy()
error_count = len(results_df) - len(successful_results_df)
# Guard the percentage against an empty results file.
error_rate = error_count / len(results_df) if len(results_df) else 0.0
print(f"Found {error_count} errors ({error_rate:.1%}). Analyzing {len(successful_results_df)} successful classifications.\n")

# Convert 'APPLIES'/'DOES NOT APPLY' to 1/0 for the model's decision
successful_results_df['llm_decision_numeric'] = successful_results_df[decision_column].apply(lambda x: 1 if x == 'APPLIES' else 0)


# --- 2. Reshape the data from "long" to "wide" format ---
# This pivots the data so each row is one participant and each column is a code, matching your humandata.
model_coded_wide_df = successful_results_df.pivot(index='Pno', columns='code_name', values='llm_decision_numeric')
model_coded_wide_df.reset_index(inplace=True)
model_coded_wide_df = model_coded_wide_df.rename_axis(None, axis=1)

print("Model results reshaped into wide format:")
display(model_coded_wide_df.head())


# --- 3. Merge with Human Data and Evaluate ---
# 'humandata' holds "Yes"/"No"; convert to 1/0 so it is comparable with the model.
humandata_numeric = humandata.replace({"Yes": 1, "No": 0})

# Merge the dataframes on the participant number; shared code columns receive
# the '_human' / '_model' suffixes.
eval_df = pd.merge(
    humandata_numeric,
    model_coded_wide_df,
    on='Pno',
    suffixes=('_human', '_model')
)

print("\nMerged data for evaluation:")
display(eval_df.head())


# --- 4. Calculate and Print Performance Metrics ---
all_human_labels = []
all_model_labels = []

# Get a list of the code columns we are evaluating
code_columns = [col for col in model_coded_wide_df.columns if col != 'Pno']

for code in code_columns:
    human_col = f"{code}_human"
    model_col = f"{code}_model"

    # Ensure columns exist before trying to access them
    if human_col in eval_df.columns and model_col in eval_df.columns:
        # Drop rows where the model didn't have a prediction (NaN)
        valid_evals = eval_df[[human_col, model_col]].dropna()

        all_human_labels.extend(valid_evals[human_col].tolist())
        all_model_labels.extend(valid_evals[model_col].tolist())

print("\n" + "="*60)
print("Overall Performance Metrics (on successfully decoded results):")
print("="*60)

# Calculate overall metrics (skipped when nothing could be matched, so the
# scorer is never called on empty label lists).
if all_human_labels:
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_human_labels,
        all_model_labels,
        average='binary', # Use 'binary' for 1/0 labels
        pos_label=1,
        zero_division=0
    )

    print(f"Overall Precision: {precision:.3f}")
    print(f"Overall Recall:    {recall:.3f}")
    print(f"Overall F1-Score:  {f1:.3f}")
else:
    print("No valid data to calculate overall metrics.")
print("="*60)

# 🛟 SAVE HERE 🛟

In [None]:
# --- Step 2: Access Your GitHub Token from Secrets ---
from google.colab import userdata
import os

# Get your credentials from Colab's secret manager
github_token = userdata.get('GITHUB_TOKEN')
github_user = "ninenine-9" # Your GitHub username
repo_name = "legaldetainment"

# Construct the secure remote URL
remote_url = f"https://{github_user}:{github_token}@github.com/{github_user}/{repo_name}.git"

# Set the remote URL for this push operation
!git remote set-url origin {remote_url}


# --- Step 3: Configure Git, Add, Commit, and Push ---
!git config --global user.email "nine.adler.20@ucl.ac.uk"
!git config --global user.name {github_user}

# Add all changed files (with the corrected space)
print("\nStaging files...")
!git add .

# Commit the changes
print("\nCommitting files...")
# Using -m flag for the message. Use "git status" to see if there's anything to commit.
!git commit -m "Updated script and results from Colab session"

# Push the changes using the authenticated URL
print("\nPushing to GitHub...")
!git push origin main

## In case of emergency

In [None]:
# --- RESUMABLE VERSION OF THE CLASSIFICATION LOOP ---
# Skips participants that already appear in the results file and carries their
# saved rows forward, so an interrupted run can be continued without losing work.

# Same file the main classification loop writes to. (`csv_name` is only defined
# in commented-out code, so it cannot be used to build the path.)
output_path = "/content/legaldetainment/RESULTS/simpletest.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

results = []
processed_pnos = set()

# Check if the results file already exists and has content
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
    try:
        existing_results_df = pd.read_csv(output_path)
        if 'Pno' in existing_results_df.columns:
            processed_pnos = set(existing_results_df['Pno'].unique())
            # Keep the earlier rows: each save below rewrites the whole file.
            results = existing_results_df.to_dict('records')
            print(f"Found {len(processed_pnos)} participants already processed. Resuming from where we left off.")
    except pd.errors.EmptyDataError:
        print("Results file is empty. Starting from the beginning.")

# Filter blank_df to only include participants that have NOT been processed
unprocessed_df = blank_df[~blank_df['Pno'].isin(processed_pnos)]
print(f"Starting classification for {len(unprocessed_df)} remaining participants.")

# Now, iterate over the UNPROCESSED data
for _, row in unprocessed_df.iterrows():
    participant_id = row['Pno']
    text_to_analyze = row['Qual']

    # Classify the participant's text against every code in the codebook
    for code_entry in llm_codebook:
        prompt = construct_simple_prompt(text_to_analyze, code_entry)
        llm_output = get_llm_classification(prompt, model, tokenizer)
        results.append({
            'Pno': participant_id,
            'code_name': code_entry['Code_Name'],
            'llm_decision': llm_output.get('decision', 'ERROR'),
            'llm_reasoning': llm_output.get('reasoning', 'ERROR')
        })

    # Save after each participant; a participant is only written once all of its
    # codes are done, so the file never contains a half-processed participant.
    pd.DataFrame(results).to_csv(output_path, index=False)
    print(f"✅ Progress saved. Completed participant {participant_id}.")

print(f"✔️ Final results saved to {output_path}")