## Calculating negotiation efficiency
This notebook calculates negotiation efficiency, measured by the number of turns and the number of tokens in each conversation. Efficient tokenisation helps reduce the amount of computing power required for training and inference.

In [1]:
pip install python-docx tiktoken pandas

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-macosx_10_13_x86_64.whl.metadata (6.7 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tiktoken-0.9.0-cp312-cp312-macosx_10_13_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: python-docx, tiktoken
Successfully installed python-docx-1.2.0 tiktoken-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from docx import Document
import tiktoken
import pandas as pd

# Initialize the tokenizer once at module level so count_tokens() can reuse it
# for every turn; "cl100k_base" is the tiktoken encoding name being loaded.
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Return the number of tokens the module-level ``tokenizer`` produces for ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        int: Length of the token sequence returned by ``tokenizer.encode``.
    """
    # With tiktoken's default (disallowed_special="all"), encode() raises a
    # ValueError when the text happens to contain a special-token string such
    # as "<|endoftext|>". Transcripts are plain text, so encode such strings as
    # ordinary characters instead of aborting the whole run.
    return len(tokenizer.encode(text, disallowed_special=()))

def analyze_negotiation(doc_path):
    """Count turns and tokens in one User/Model negotiation transcript.

    The document's paragraphs are joined with newlines and scanned line by
    line. A line that starts with ``User:`` or ``Model:`` (after trimming
    surrounding whitespace) is one turn for that speaker; every other line is
    ignored.

    Args:
        doc_path: Path to the ``.docx`` file, passed to ``Document``.

    Returns:
        dict: Keys "User Turns", "Model Turns", "Total Turns", "User Tokens",
        "Model Tokens" and "Total Tokens".
    """
    doc = Document(doc_path)
    full_text = "\n".join(para.text for para in doc.paragraphs)

    # One list of utterances per speaker label.
    turns_by_label = {"User:": [], "Model:": []}

    for raw_line in full_text.split("\n"):
        # Trim first so an indented "User:" line is still recognised.
        line = raw_line.strip()
        for label, utterances in turns_by_label.items():
            if line.startswith(label):
                # Remove only the leading label. str.replace() would also
                # delete the label wherever it reappears inside the utterance
                # and so undercount that turn's tokens.
                utterances.append(line[len(label):].strip())
                break

    user_turns = turns_by_label["User:"]
    model_turns = turns_by_label["Model:"]

    # Count turns
    user_turn_count = len(user_turns)
    model_turn_count = len(model_turns)

    # Count tokens
    user_tokens = sum(count_tokens(turn) for turn in user_turns)
    model_tokens = sum(count_tokens(turn) for turn in model_turns)

    return {
        "User Turns": user_turn_count,
        "Model Turns": model_turn_count,
        "Total Turns": user_turn_count + model_turn_count,
        "User Tokens": user_tokens,
        "Model Tokens": model_tokens,
        "Total Tokens": user_tokens + model_tokens,
    }

# Process all .docx files in the 'Replacement' folder
folder_path = "Replacement"
results = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the table and CSV stable from run to run.
for filename in sorted(os.listdir(folder_path)):
    # Skip anything that is not a Word document, and skip the "~$..." owner
    # files Word leaves next to an open document (they are not real documents).
    if not filename.endswith(".docx") or filename.startswith("~$"):
        continue
    doc_path = os.path.join(folder_path, filename)
    stats = analyze_negotiation(doc_path)
    stats["Document"] = filename
    results.append(stats)

# Fail with a clear message instead of a KeyError on the column selection below.
if not results:
    raise FileNotFoundError(f"No .docx files found in folder {folder_path!r}")

# Convert to df, with the document name as the first column
df = pd.DataFrame(results)
df = df[["Document", "User Turns", "Model Turns", "Total Turns", "User Tokens", "Model Tokens", "Total Tokens"]]

# Calculate averages: one mean per numeric column, labelled "AVERAGE"
averages = {"Document": "AVERAGE", **df.mean(numeric_only=True).to_dict()}

# Append averages to the df as a final row
df_averages = pd.DataFrame([averages])
df_final = pd.concat([df, df_averages], ignore_index=True)

# Save to CSV
output_path = "negotiation_stats.csv"
df_final.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

# Print the table
print("\nNegotiation Statistics:")
print(df_final.to_string(index=False))

Results saved to negotiation_stats.csv

Negotiation Statistics:
           Document  User Turns  Model Turns  Total Turns  User Tokens  Model Tokens  Total Tokens
 Participant 9.docx    5.000000     5.000000     10.00000    93.000000    236.000000    329.000000
Participant 20.docx   10.000000    10.000000     20.00000   609.000000    727.000000   1336.000000
Participant 16.docx   13.000000    13.000000     26.00000   624.000000   1094.000000   1718.000000
 Participant 5.docx    6.000000     6.000000     12.00000   235.000000    452.000000    687.000000
 Participant 4.docx    8.000000     8.000000     16.00000   329.000000    706.000000   1035.000000
Participant 17.docx    6.000000     6.000000     12.00000   268.000000    415.000000    683.000000
Participant 21.docx    6.000000     6.000000     12.00000   250.000000    348.000000    598.000000
 Participant 8.docx    8.000000     8.000000     16.00000   123.000000    131.000000    254.000000
Participant 10.docx    4.000000     4.000000 

In [23]:
import re

# Initialize tokenizer: loads the "cl100k_base" tiktoken encoding into the
# module-level name that count_tokens() below reads.
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Return the number of tokens the module-level ``tokenizer`` produces for ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        int: Length of the token sequence returned by ``tokenizer.encode``.
    """
    # tiktoken's default (disallowed_special="all") makes encode() raise a
    # ValueError if the text contains a special-token string such as
    # "<|endoftext|>". Treat such strings as ordinary text so a single
    # transcript cannot abort the analysis.
    return len(tokenizer.encode(text, disallowed_special=()))

def extract_conversation(text, role):
    """Extracts a specific conversation section from the document text.

    A section starts at the first header of the form ``<digits> and AI (<role>)``
    and runs until the next ``<digits> and AI (`` header or the end of ``text``.

    Args:
        text: Full document text to search.
        role: Label inside the header's parentheses (e.g. "Support"); it is
            matched literally.

    Returns:
        str: The section body with surrounding whitespace stripped, or "" when
        no header for ``role`` is present.
    """
    # re.escape() keeps any regex metacharacters in `role` from being read as
    # pattern syntax, so only the exact label can match.
    pattern = r"\d+ and AI \(" + re.escape(role) + r"\)(.*?)(?=\d+ and AI \(|$)"
    # DOTALL lets the lazy group span the newlines inside a section.
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""

def extract_participant_conversation(text, pair):
    """Return the participant-to-participant conversation found in ``text``.

    The section begins right after the header ``"<pair[0]> and <pair[1]>"`` and
    stops before the next ``"<digits> and AI "`` header, or at the end of the
    text when no such header follows.

    Args:
        text: Full document text to search.
        pair: Sequence whose first two items are the participant numbers.

    Returns:
        str: The stripped section body, or "" if the pair header is not found.
    """
    header = "{} and {}".format(pair[0], pair[1])
    # DOTALL so the lazy capture can run across line breaks.
    section_re = re.compile(header + r"(.*?)(?=\d+ and AI |$)", re.DOTALL)
    found = section_re.search(text)
    if found is None:
        return ""
    return found.group(1).strip()

def analyze_conversation(text, prefix1, prefix2):
    """Analyzes a conversation with two roles.

    ``text`` is split into lines; blank lines are dropped and the rest are
    stripped. A line starting with ``prefix1`` is a turn for the first role, a
    line starting with ``prefix2`` is a turn for the second role, and any other
    line is ignored.

    Args:
        text: Conversation text, one turn per line.
        prefix1: Speaker label of the first role (e.g. "User:").
        prefix2: Speaker label of the second role (e.g. "Model:").

    Returns:
        dict: Turn and token counts keyed by "<prefix1> Turns",
        "<prefix2> Turns", "Total Turns", "<prefix1> Tokens",
        "<prefix2> Tokens" and "Total Tokens".
    """
    turns = [line.strip() for line in text.split('\n') if line.strip()]
    role1_turns = []
    role2_turns = []

    for turn in turns:
        # Slice off only the leading label. str.replace() would also remove
        # the label wherever it reappears inside the utterance and so
        # undercount that turn's tokens.
        if turn.startswith(prefix1):
            role1_turns.append(turn[len(prefix1):].strip())
        elif turn.startswith(prefix2):
            role2_turns.append(turn[len(prefix2):].strip())

    role1_count = len(role1_turns)
    role2_count = len(role2_turns)
    total_turns = role1_count + role2_count

    role1_tokens = sum(count_tokens(turn) for turn in role1_turns)
    role2_tokens = sum(count_tokens(turn) for turn in role2_turns)
    total_tokens = role1_tokens + role2_tokens

    return {
        f"{prefix1} Turns": role1_count,
        f"{prefix2} Turns": role2_count,
        "Total Turns": total_turns,
        f"{prefix1} Tokens": role1_tokens,
        f"{prefix2} Tokens": role2_tokens,
        "Total Tokens": total_tokens
    }

def process_document(doc_path):
    """Processes a single document and extracts all conversation metrics.

    The two participant numbers are taken from the first two digit runs in the
    file name (e.g. "24_25.docx" -> ["24", "25"]). The document text is then
    split into the Support, Undermine and participant conversations, and each
    is analysed separately.

    Args:
        doc_path: Path to the ``.docx`` file.

    Returns:
        dict: "Document" (file name), "Pair" ("<a>_<b>"), plus every key of the
        three ``analyze_conversation`` results prefixed with "Support_",
        "Undermine_" and "Participant_" respectively.

    Raises:
        ValueError: If the file name does not contain two numbers.
    """
    # Extract participant numbers from filename (e.g., "24_25.docx" -> ["24", "25"])
    filename = os.path.basename(doc_path)
    pair = re.findall(r"\d+", filename)[:2]
    # Validate before opening the document: without this check a misnamed file
    # only surfaces later as a bare IndexError on pair[0] / pair[1].
    if len(pair) < 2:
        raise ValueError(
            f"Cannot find two participant numbers in file name {filename!r}"
        )

    doc = Document(doc_path)
    full_text = "\n".join([para.text for para in doc.paragraphs])

    # Extract and analyze each conversation type
    support_text = extract_conversation(full_text, "Support")
    undermine_text = extract_conversation(full_text, "Undermine")
    participant_text = extract_participant_conversation(full_text, pair)

    support_stats = analyze_conversation(support_text, "User:", "Model:")
    undermine_stats = analyze_conversation(undermine_text, "User:", "Model:")
    participant_stats = analyze_conversation(participant_text, "HR Representative:", "Employee:")

    return {
        "Document": filename,
        "Pair": f"{pair[0]}_{pair[1]}",
        **{f"Support_{k}": v for k, v in support_stats.items()},
        **{f"Undermine_{k}": v for k, v in undermine_stats.items()},
        **{f"Participant_{k}": v for k, v in participant_stats.items()},
    }

# Process all .docx files in the 'US data' folder
folder_path = "US data"
results = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the table and CSV stable from run to run.
for filename in sorted(os.listdir(folder_path)):
    # Skip anything that is not a Word document, and skip the "~$..." owner
    # files Word leaves next to an open document (they are not real documents).
    if not filename.endswith(".docx") or filename.startswith("~$"):
        continue
    doc_path = os.path.join(folder_path, filename)
    stats = process_document(doc_path)
    results.append(stats)

# Fail with a clear message instead of a KeyError further down.
if not results:
    raise FileNotFoundError(f"No .docx files found in folder {folder_path!r}")

# Convert to df
df = pd.DataFrame(results)

# Calculate averages: one mean per numeric column (every turn/token count).
# "Document" and "Pair" hold strings, so they get fixed labels instead.
averages = {
    "Document": "AVERAGE",
    "Pair": "",
    **df.mean(numeric_only=True).to_dict(),
}

# Append averages to the df as a final row
df_averages = pd.DataFrame([averages])
df_final = pd.concat([df, df_averages], ignore_index=True)

# Save to CSV
output_path = "data_us.csv"
df_final.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

# Print the total table (only the per-conversation totals, not the per-role columns)
summary_cols = [
    "Document", "Pair",
    "Support_Total Turns", "Support_Total Tokens",
    "Undermine_Total Turns", "Undermine_Total Tokens",
    "Participant_Total Turns", "Participant_Total Tokens"
]
print("\nSummary Statistics:")
print(df_final[summary_cols].to_string(index=False))

Results saved to data_us.csv

Summary Statistics:
      Document  Pair  Support_Total Turns  Support_Total Tokens  Undermine_Total Turns  Undermine_Total Tokens  Participant_Total Turns  Participant_Total Tokens
28 and 29.docx 28_29                 14.0                 562.0                    4.0                   231.0                     15.0                     517.0
40 and 41.docx 40_41                 20.0                 883.0                   12.0                   727.0                      7.0                     261.0
42 and 43.docx 42_43                 14.0                 807.0                   36.0                  2141.0                      7.0                     452.0
26 and 27.docx 26_27                 22.0                1157.0                   14.0                   882.0                     14.0                     938.0
24 and 25.docx 24_25                 10.0                 228.0                   10.0                   248.0                     12.0     

In [31]:
pip install python-docx


Note: you may need to restart the kernel to use updated packages.


In [43]:
import os
import re
import csv
from docx import Document
import tiktoken

# Tokenizer for GPT-4: the "cl100k_base" tiktoken encoding, loaded once and
# read by count_tokens() below.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Folder with Word documents; every .docx file directly inside it is processed.
folder_path = "US data"

# Output CSV file: one row per document, written after all files are processed.
output_csv = "participant_token_stats.csv"

# Helper: Count tokens
def count_tokens(text):
    """Return the number of tokens the module-level ``tokenizer`` produces for ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        int: Length of the token sequence returned by ``tokenizer.encode``.
    """
    # tiktoken's default (disallowed_special="all") makes encode() raise a
    # ValueError if the text contains a special-token string such as
    # "<|endoftext|>". Treat such strings as ordinary text so a single
    # utterance cannot abort the export.
    return len(tokenizer.encode(text, disallowed_special=()))

# Helper: Extract only Participant conversation
def extract_participant_conversation(doc_text):
    """Return the text that follows the last AI-conversation header in ``doc_text``.

    Headers are whole lines of the form ``<digits> and AI (Support)`` or
    ``<digits> and AI (Undermine)``, with a newline on both sides. The text
    after the last such header is returned.

    NOTE(review): this assumes the participant conversation is the last section
    of the document - confirm against the transcripts.

    Args:
        doc_text: Full document text.

    Returns:
        str: The text after the last header, or all of ``doc_text`` when no
        header is present.
    """
    # \d+ accepts participant ids of any length (the previous \d{2} silently
    # missed one- and three-digit ids). The group is non-capturing so the split
    # result holds only section texts.
    sections = re.split(r'\n\d+ and AI \((?:Support|Undermine)\)\n', doc_text)
    # re.split() always returns at least one element, so [-1] is safe.
    return sections[-1]

# Prepare data rows for CSV (one list per document, in the column order of the
# header written at the bottom of this cell)
data_rows = []

# A speaker label, then the utterance, which runs until the next label at the
# start of a line or the end of the text. Matching is case-sensitive.
# Compiled once here instead of once per file.
turn_pattern = re.compile(
    r"(Employee|HR representative):\s+(.*?)(?=\n(?:Employee|HR representative):|\Z)",
    re.DOTALL,
)

# Process each .docx file
for filename in sorted(os.listdir(folder_path)):
    # Skip anything that is not a Word document, and skip the "~$..." owner
    # files Word leaves next to an open document (they are not real documents).
    if not filename.endswith(".docx") or filename.startswith("~$"):
        continue

    # Strip only the extension; str.replace() would also remove ".docx" from
    # the middle of a name.
    pair_id = os.path.splitext(filename)[0]
    doc_path = os.path.join(folder_path, filename)
    doc = Document(doc_path)

    # Combine all paragraphs into one string
    full_text = "\n".join(p.text for p in doc.paragraphs)

    # Extract only the Participant conversation
    participant_text = extract_participant_conversation(full_text)

    # Rewrite the capitalised label "HR Representative:" to the spelling the
    # pattern above expects; other capitalisations are not handled.
    participant_text = participant_text.replace("HR Representative:", "HR representative:")

    # Extract (speaker, utterance) pairs
    turns = turn_pattern.findall(participant_text)

    # Per-file stats
    stats = {
        "Employee": {"turns": 0, "tokens": 0},
        "HR representative": {"turns": 0, "tokens": 0}
    }

    for speaker, utterance in turns:
        utterance = utterance.strip()
        # A label with nothing after it is not counted as a turn.
        if utterance:
            stats[speaker]["turns"] += 1
            stats[speaker]["tokens"] += count_tokens(utterance)

    total_turns = stats["Employee"]["turns"] + stats["HR representative"]["turns"]
    total_tokens = stats["Employee"]["tokens"] + stats["HR representative"]["tokens"]
    # Guard against division by zero when no turn was recognised.
    avg_tokens_per_turn = total_tokens / total_turns if total_turns else 0

    # Append row for this pair
    data_rows.append([
        pair_id,
        stats["Employee"]["turns"], stats["Employee"]["tokens"],
        stats["HR representative"]["turns"], stats["HR representative"]["tokens"],
        total_turns, total_tokens, round(avg_tokens_per_turn, 2)
    ])

# Write to CSV
with open(output_csv, mode="w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        "Pair",
        "Employee Turns", "Employee Tokens",
        "HR Turns", "HR Tokens",
        "Total Turns", "Total Tokens", "Avg Tokens per Turn"
    ])
    writer.writerows(data_rows)

print(f"Exported results to {output_csv}")


Exported results to participant_token_stats.csv


In [41]:
# Overall averages across every processed document.
# Relies on data_rows built by the per-file loop in the previous cell, so run
# that cell first. In each row, index 5 is Total Turns and index 6 is Total Tokens.
turn_totals = [row[5] for row in data_rows]
token_totals = [row[6] for row in data_rows]

num_files = len(data_rows)
sum_total_turns = sum(turn_totals)
sum_total_tokens = sum(token_totals)

# With no rows, report 0 rather than dividing by zero.
if num_files:
    avg_total_turns = sum_total_turns / num_files
    avg_total_tokens = sum_total_tokens / num_files
else:
    avg_total_turns = 0
    avg_total_tokens = 0

print("\nAverages across all participant conversations:")
print(f"Average Total Turns: {avg_total_turns:.2f}")
print(f"Average Total Tokens: {avg_total_tokens:.2f}")



Averages across all participant conversations:
Average Total Turns: 14.90
Average Total Tokens: 739.20
