In [None]:
!pip install openai -q

import openai
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import time
from google.colab import drive
drive.mount('/content/drive')

client = OpenAI(api_key= #edited out to push to GitHub)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
from getpass import getpass

# Keep the API key out of the notebook source: reuse OPENAI_API_KEY when it is
# already set, otherwise ask for it interactively (getpass does not echo input,
# so the key is not stored in the cell output either).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [None]:
# Load the two datasets from the mounted Google Drive folder.
# The processing loop further down reads the 'Excerpt' column of the curated
# set and the 'TEXT' column of the pilot set, and adds one 'gpt_<condition>'
# column per prompting condition to each frame.
curated_df = pd.read_csv('/content/drive/MyDrive/Dissertation/curated_expanded.csv')
pilot_df = pd.read_csv('/content/drive/MyDrive/Dissertation/pilotdata.csv')

### Prompting Conditions

In [None]:
# Prompt templates for each experimental condition, keyed by condition id.
# Every entry holds:
#   'name'   - display label used in the progress messages
#   'system' - system-role message sent to the model
#   'user'   - user-role template; the '{text}' placeholder is filled with the
#              testimony via str.format() in generate_gpt_summary
prompting_conditions = {
    # Baseline: plain summarization request with no extra guidance.
    'default': {
        'name': 'Default',
        'system': "You are a helpful assistant that summarizes text.",
        'user': "Summarize the following deposition testimony:\n\n{text}"
    },

    # Explicitly asks the model to keep hedges, disfluencies, modals and temporal markers.
    'feature_preserving': {
        'name': 'Feature-Preserving',
        'system': "You are a summarization assistant that preserves linguistic nuance.",
        'user': """Summarize the following deposition testimony.
IMPORTANT: Preserve all hedging language (I think, maybe, possibly),
disfluencies (um, uh, repetitions), modal expressions (could, might, would),
and temporal markers. Do not strip or clean up uncertain language.

{text}"""
    },

    # Frames the task as legal case review and motivates preservation by legal significance.
    'legal_context': {
        'name': 'Legal-Context',
        'system': "You are a legal document summarization assistant.",
        'user': """Summarize this legal deposition excerpt for case review.
Maintain all uncertainty markers, hedges, and speech patterns as they may be
legally significant for assessing witness credibility and testimony accuracy.

{text}"""
    },

    # Asks the model not to normalize speech toward standard language norms.
    'bias_aware': {
        'name': 'Bias-Aware',
        'system': "You are a linguistically-aware summarization assistant.",
        'user': """Summarize this testimony without imposing standard language norms.
Preserve disfluencies, hedges, and uncertain language that may be characteristic
of the speaker's communication style, gender, or language background.

{text}"""
    }
}


### Summarization Function

In [None]:
def generate_gpt_summary(text, condition='default', model='gpt-3.5-turbo',
                         max_retries=5, client=None):
    """
    Generate a summary using the OpenAI chat completions API with the
    specified prompting condition.

    Args:
        text: Input text to summarize
        condition: One of 'default', 'feature_preserving', 'legal_context', 'bias_aware'
        model: 'gpt-3.5-turbo' (cheaper) or 'gpt-4' (better)
        max_retries: How many times to retry after a rate-limit error before
            giving up (each retry waits 20 seconds first)
        client: Optional OpenAI client to use. When omitted, a client is
            created with OpenAI(), which reads OPENAI_API_KEY from the
            environment, so no key has to appear in the notebook.

    Returns:
        The summary text, or "" when the input is missing / shorter than 10
        characters, when the API call fails, or when retries are exhausted.

    Raises:
        KeyError: if `condition` is not a key of `prompting_conditions`.
    """

    # Skip missing or near-empty inputs without spending an API call.
    if pd.isna(text) or not text or len(str(text).strip()) < 10:
        return ""

    prompt_config = prompting_conditions[condition]
    messages = [
        {"role": "system", "content": prompt_config['system']},
        {"role": "user", "content": prompt_config['user'].format(text=text)}
    ]

    # Bounded retry loop instead of unbounded recursion: a persistent rate
    # limit can no longer recurse until the interpreter's recursion limit.
    for attempt in range(max_retries + 1):
        try:
            if client is None:
                client = OpenAI()

            # New v1.0+ syntax
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=150,
                temperature=0.3,
                top_p=0.95
            )

            # The message content can be None; normalise to "" so callers
            # always receive a string.
            return response.choices[0].message.content or ""

        except openai.RateLimitError:
            if attempt == max_retries:
                print(f"Error: rate limit still hit after {max_retries} retries, giving up")
                return ""
            print("Rate limit hit, waiting 20 seconds...")
            time.sleep(20)

        except Exception as e:
            # Deliberate best-effort: report the failure and leave this row's
            # summary empty rather than aborting the whole batch.
            print(f"Error: {e}")
            return ""

    return ""


### Processing Datasets with all Conditions

In [None]:
MODEL = 'gpt-3.5-turbo'
CHECKPOINT_EVERY = 20        # write a temp CSV after this many completed rows
REQUEST_PAUSE_SECONDS = 0.5  # pause between API requests


def _summarize_dataset(df, text_column, condition_key, desc, checkpoint_path):
    """
    Summarize every row of `df[text_column]` under one prompting condition.

    The summaries are stored in `df` in a new column named 'gpt_<condition_key>'
    (the frame is modified in place). Every CHECKPOINT_EVERY completed rows the
    partially filled frame is written to `checkpoint_path`, with '' in the rows
    not processed yet, so an interrupted run does not lose all results.

    Args:
        df: DataFrame to read from and add the summary column to
        text_column: Name of the column holding the text to summarize
        condition_key: Key into `prompting_conditions`
        desc: Label shown on the tqdm progress bar
        checkpoint_path: CSV path for the intermediate checkpoints
    """
    column = f'gpt_{condition_key}'

    # Make sure the checkpoint folder exists before spending API calls;
    # otherwise the first to_csv would fail part-way through the run.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    summaries = []
    for text in tqdm(df[text_column], total=len(df), desc=desc):
        summaries.append(generate_gpt_summary(text, condition_key, MODEL))

        time.sleep(REQUEST_PAUSE_SECONDS)

        # Count completed rows rather than using the DataFrame index label,
        # so checkpointing also works when the index is not 0..n-1.
        if len(summaries) % CHECKPOINT_EVERY == 0:
            df[column] = summaries + [''] * (len(df) - len(summaries))
            df.to_csv(checkpoint_path, index=False)

    df[column] = summaries


print(f"\n Using model: {MODEL}")
print("="*60)

for condition_key, condition_info in prompting_conditions.items():

    print(f"\nProcessing with {condition_info['name']} Condition")
    print("-"*50)

    # CURATED DATASET
    print(f"\nProcessing curated dataset...")
    _summarize_dataset(
        curated_df,
        text_column='Excerpt',
        condition_key=condition_key,
        desc=f"Curated-{condition_key}",
        checkpoint_path=f'/content/drive/MyDrive/Dissertation/gpt/curated_gpt_{condition_key}_temp.csv',
    )

    # PILOT DATASET
    print(f"\nProcessing pilot dataset...")
    _summarize_dataset(
        pilot_df,
        text_column='TEXT',
        condition_key=condition_key,
        desc=f"Pilot-{condition_key}",
        checkpoint_path=f'/content/drive/MyDrive/Dissertation/gpt/pilot_gpt_{condition_key}_temp.csv',
    )

    print(f"✓ Completed {condition_info['name']} condition")



🤖 Using model: gpt-3.5-turbo

📊 Processing with Default Condition
--------------------------------------------------

Processing curated dataset...


Curated-default: 100%|██████████| 132/132 [03:07<00:00,  1.42s/it]



Processing pilot dataset...


Pilot-default: 100%|██████████| 351/351 [07:45<00:00,  1.33s/it]


✓ Completed Default condition

📊 Processing with Feature-Preserving Condition
--------------------------------------------------

Processing curated dataset...


Curated-feature_preserving: 100%|██████████| 132/132 [03:12<00:00,  1.46s/it]



Processing pilot dataset...


Pilot-feature_preserving: 100%|██████████| 351/351 [08:08<00:00,  1.39s/it]


✓ Completed Feature-Preserving condition

📊 Processing with Legal-Context Condition
--------------------------------------------------

Processing curated dataset...


Curated-legal_context: 100%|██████████| 132/132 [03:03<00:00,  1.39s/it]



Processing pilot dataset...


Pilot-legal_context: 100%|██████████| 351/351 [07:59<00:00,  1.36s/it]


✓ Completed Legal-Context condition

📊 Processing with Bias-Aware Condition
--------------------------------------------------

Processing curated dataset...


Curated-bias_aware: 100%|██████████| 132/132 [03:13<00:00,  1.47s/it]



Processing pilot dataset...


Pilot-bias_aware: 100%|██████████| 351/351 [08:05<00:00,  1.38s/it]

✓ Completed Bias-Aware condition



