# Randomness Analysis

## Setup and Imports

In [None]:
%reload_ext dotenv
%dotenv

import openai
import json
import csv
import time
import ollama
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from textwrap import dedent
from tqdm import tqdm
from sklearn.metrics import cohen_kappa_score, accuracy_score

# Seed both the stdlib and NumPy generators with the same fixed value so that
# anything drawn from them in this notebook is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Shared API clients; the Ollama client is given a 120 timeout.
openai_client = openai.OpenAI()
ollama_client = ollama.Client(timeout=120)

## Load Dataset and Sample Selection

In [None]:
# Load the gold dataset: keep every non-blank 'Text' cell, in file order.
dataset = []
with open("data/github_gold.csv", newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    for row in reader:
        # DictReader fills cells that are missing from a short row with None,
        # so check for a real string before calling .strip() on it.
        text = row.get('Text')
        if text and text.strip():
            dataset.append(text)

print(f"Total dataset size: {len(dataset)}")

# Sample 50 prompts for testing
SAMPLE_SIZE = 50
if len(dataset) < SAMPLE_SIZE:
    # random.sample would raise ValueError here anyway; say why explicitly.
    raise ValueError(
        f"Need at least {SAMPLE_SIZE} non-empty messages to sample from, "
        f"but only {len(dataset)} were loaded"
    )
sample_indices = random.sample(range(len(dataset)), SAMPLE_SIZE)
sample_prompts = [dataset[i] for i in sample_indices]

print(f"Selected {SAMPLE_SIZE} random samples at indices: {sorted(sample_indices)[:10]}...")

## Helper Functions for Prompting

In [None]:
def get_system_prompt():
    """Build the system-role chat message holding the classification instructions.

    Returns:
        dict: ``{"role": "system", "content": <instructions>}``. The
        instructions ask for one of three sentiment labels, returned as a
        JSON object with a ``sentiment`` key.
    """
    # NOTE: the indentation inside this literal is part of the prompt. The
    # backslash-continued lines are joined into one line *before* dedent runs,
    # so their leading spaces stay in the text; re-indenting the literal would
    # change what is sent to the model.
    instructions = dedent(f"""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {{"sentiment": "positive"}}, {{"sentiment": "negative"}} OR {{"sentiment": "neutral"}}.
            """)
    system_message = {"role": "system"}
    system_message["content"] = instructions
    return system_message

def get_system_prompt_with_message(message):
    """Build a single user-role chat message with the instructions and the text to classify.

    Despite the name, the returned message uses the ``"user"`` role: the
    classification instructions and the message are sent together in one turn.

    The template is dedented *before* the message is substituted. Dedenting
    after substitution (as an f-string would) lets the message's own
    indentation decide the common margin, so a multi-line message with an
    unindented line would leave the whole prompt indented.

    Args:
        message: Text to classify. It is inserted verbatim, including any
            newlines, indentation or braces.

    Returns:
        dict: ``{"role": "user", "content": <instructions + message>}``.
    """
    # NOTE: the indentation inside this literal is part of the prompt. The
    # backslash-continued lines are joined before dedent runs, so their
    # leading spaces remain in the text. The wording is deliberately left
    # untouched so the text sent to the models does not change.
    template = dedent("""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {{"sentiment": "positive"}}, {{"sentiment": "negative"}} OR {{"sentiment": "neutral"}}.

                Classify the following message:
                {message}
            """)
    # str.format only parses the template, so braces inside `message` are safe.
    return {"role": "user", "content": template.format(message=message)}

def get_final_prompt(message):
    """Wrap *message* in the user-role chat turn that asks for its classification.

    Args:
        message: Text to classify; it is formatted into the content as-is.

    Returns:
        dict: ``{"role": "user", "content": "Classify the following message:\\n<message>"}``.
    """
    request_text = "\n".join(("Classify the following message:", f"{message}"))
    return dict(role="user", content=request_text)

## API Call Functions with Additional Stats (throughput)

We opted not to use this additional data because reasoning models (deepseek) report token counts inaccurately (they do not report reasoning tokens). We also could not measure GPT execution time consistently because of network overhead.

In [None]:
def ask_chatgpt_with_stats(prompts, model, temperature=0.0):
    """Send a chat completion request through ``openai_client`` and time it.

    The request asks for a JSON-object response. Throughput is derived from
    the elapsed time around the whole call, so it includes everything that
    happens between issuing the request and receiving the response.

    Args:
        prompts: List of chat messages passed as ``messages``.
        model: Model identifier passed as ``model``.
        temperature: Sampling temperature passed through to the API.

    Returns:
        dict: ``content`` (text of the first choice), ``tokens_per_second``
        (completion tokens divided by elapsed seconds, 0 if no time elapsed),
        ``completion_tokens``, ``prompt_tokens``, ``total_tokens`` and
        ``elapsed_time`` in seconds.
    """
    # perf_counter is monotonic, so the interval cannot be skewed (or go
    # negative) if the system clock is adjusted while the request is running.
    start_time = time.perf_counter()
    response = openai_client.chat.completions.create(
        model=model,
        messages=prompts,
        response_format={"type": "json_object"},
        temperature=temperature
    )
    elapsed_time = time.perf_counter() - start_time

    usage = response.usage
    completion_tokens = usage.completion_tokens
    tokens_per_second = completion_tokens / elapsed_time if elapsed_time > 0 else 0

    return {
        'content': response.choices[0].message.content,
        'tokens_per_second': tokens_per_second,
        'completion_tokens': completion_tokens,
        'prompt_tokens': usage.prompt_tokens,
        'total_tokens': usage.total_tokens,
        'elapsed_time': elapsed_time
    }

def ask_ollama_with_stats(prompts, model, temperature=0.0):
    """Send a chat request through ``ollama_client`` and derive throughput figures.

    The call is attempted up to six times (one initial attempt plus five
    retries) with a five-second pause between attempts. Any exception counts
    as a failed attempt; this is a deliberate best-effort policy so a single
    failing request does not abort a long run.

    Args:
        prompts: List of chat messages passed as ``messages``.
        model: Model identifier passed as ``model``.
        temperature: Sampling temperature placed in the request options.

    Returns:
        dict | None: ``None`` when every attempt failed. Otherwise a dict with
        ``content``, ``tokens_per_second`` and ``generation_tokens_per_second``
        (both generated tokens per second of generation time),
        ``total_tokens_per_second`` (prompt plus generated tokens over prompt
        plus generation time), and the raw counters ``eval_count``,
        ``eval_duration``, ``prompt_eval_count``, ``prompt_eval_duration`` and
        ``total_duration``. Durations are treated as nanoseconds.
    """
    max_attempts = 6
    retry_delay_seconds = 5

    def per_second(token_count, duration_ns):
        # Durations are divided by 1e9 to get seconds; 0 when nothing was timed.
        return token_count / (duration_ns / 1e9) if duration_ns > 0 else 0

    for attempt in range(1, max_attempts + 1):
        try:
            response = ollama_client.chat(
                model=model,
                messages=prompts,
                format="json",
                stream=False,
                options={
                    "temperature": temperature,
                    "num_ctx": 8192,
                    "num_predict": -1
                }
            )

            content = response['message']['content']
            # `or 0` also covers a counter that is present but None; with
            # .get(key, 0) alone that would make the arithmetic below raise
            # and throw away an otherwise valid reply.
            eval_count = response.get('eval_count') or 0
            eval_duration = response.get('eval_duration') or 0
            prompt_eval_count = response.get('prompt_eval_count') or 0
            prompt_eval_duration = response.get('prompt_eval_duration') or 0

            generation_tokens_per_second = per_second(eval_count, eval_duration)
            total_duration = eval_duration + prompt_eval_duration
            total_tokens_per_second = per_second(
                eval_count + prompt_eval_count, total_duration
            )

            return {
                'content': content,
                'tokens_per_second': generation_tokens_per_second,
                'total_tokens_per_second': total_tokens_per_second,
                'generation_tokens_per_second': generation_tokens_per_second,
                'eval_count': eval_count,
                'eval_duration': eval_duration,
                'prompt_eval_count': prompt_eval_count,
                'prompt_eval_duration': prompt_eval_duration,
                'total_duration': total_duration
            }
        except Exception as e:
            if attempt == max_attempts:
                # Out of attempts: report it and skip the pointless final sleep.
                print(f"Failed with {e}. Giving up after {max_attempts} attempts.")
                return None
            print(f"Failed with {e}. Retrying...")
            time.sleep(retry_delay_seconds)

    return None


## Part 1: Randomness Analysis - Run Models 3 Times

In [None]:
# Model identifiers evaluated in this notebook, grouped by backend.
GPT_MODELS = [
    "gpt-4o-2024-05-13",
    "gpt-4o-mini-2024-07-18",
]
OLLAMA_MODELS = [
    "mistral-nemo:12b",
    "gemma2:9b",
    "llama3.1:8b",
    "mistral-small:22b",
    "gemma2:27b",
    "deepseek-r1:8b",
    "deepseek-r1:32b",
    "llama3.1:70b",
]

# One empty mapping per backend, plus a flat list for throughput rows.
randomness_results = {backend: {} for backend in ('gpt', 'ollama')}
throughput_data = []

# How many times each model is run over the sampled prompts.
NUM_RUNS = 3

print(f"Testing {len(GPT_MODELS)} GPT models and {len(OLLAMA_MODELS)} Ollama models")
print(f"Each model will be run {NUM_RUNS} times on {SAMPLE_SIZE} prompts")

### Run GPT Models (3 times each)

In [None]:
# Run every GPT model NUM_RUNS times over the sampled prompts. For each model,
# randomness_results['gpt'][model] becomes a list of runs, each run being the
# list of predicted labels in sample_prompts order ('error' marks a failure).
for model in GPT_MODELS:
    print(f"\n{'='*60}")
    print(f"Testing model: {model}")
    print(f"{'='*60}")

    model_results = []
    model_throughput = []

    for run_num in range(NUM_RUNS):
        run_results = []
        run_throughput = []
        print(f"\nRun {run_num + 1}/{NUM_RUNS}")

        for message in tqdm(sample_prompts, desc=f"{model} - Run {run_num + 1}"):
            prompt = [get_system_prompt(), get_final_prompt(message)]

            # Start from the failure values and append exactly once per
            # message below, so the runs stay aligned with sample_prompts.
            sentiment, tokens_per_second = 'error', 0
            try:
                response = ask_chatgpt_with_stats(prompt, model, temperature=0.0)
                parsed = json.loads(response['content'])
                sentiment = parsed.get('sentiment', 'unknown')
                tokens_per_second = response['tokens_per_second']

                throughput_data.append({
                    'model': model,
                    'run': run_num + 1,
                    'tokens_per_second': response['tokens_per_second'],
                    'completion_tokens': response['completion_tokens'],
                    'prompt_tokens': response['prompt_tokens'],
                    'total_tokens': response['total_tokens'],
                    'elapsed_time': response['elapsed_time']
                })
            except Exception as e:
                # Best effort: record the failure and carry on with the run.
                print(f"Error: {e}")
                sentiment, tokens_per_second = 'error', 0

            run_results.append(sentiment)
            run_throughput.append(tokens_per_second)

            time.sleep(0.1)

        model_results.append(run_results)
        model_throughput.append(run_throughput)

        # Failed requests are stored as 0 and excluded from the average. If
        # every request failed there is nothing to average; np.mean([]) would
        # return nan and emit a RuntimeWarning.
        measured = [t for t in run_throughput if t > 0]
        if measured:
            avg_throughput = np.mean(measured)
            print(f"Average throughput for run {run_num + 1}: {avg_throughput:.2f} tokens/sec")
        else:
            print(f"Average throughput for run {run_num + 1}: n/a (no successful requests)")

    randomness_results['gpt'][model] = model_results
    print(f"Completed {model}")

### Run Ollama Models (3 times each) with Throughput Measurement

In [None]:
# Run every Ollama model NUM_RUNS times over the sampled prompts. For each
# model, randomness_results['ollama'][model] becomes a list of runs, each run
# being the list of predicted labels in sample_prompts order ('error' marks a
# failure).
for model in OLLAMA_MODELS:
    print(f"\n{'='*60}")
    print(f"Testing model: {model}")
    print(f"{'='*60}")

    model_results = []
    model_throughput = []

    for run_num in range(NUM_RUNS):
        run_results = []
        run_throughput = []
        print(f"\nRun {run_num + 1}/{NUM_RUNS}")

        for idx, message in enumerate(tqdm(sample_prompts, desc=f"{model} - Run {run_num + 1}")):
            prompt = [get_system_prompt_with_message(message)]

            # The first request of each run is flagged in the throughput rows.
            is_warmup = (idx == 0)

            # Start from the failure values and append exactly once per
            # message below, so the runs stay aligned with sample_prompts.
            sentiment, tokens_per_second = 'error', 0
            try:
                response = ask_ollama_with_stats(prompt, model, temperature=0.0)
                # A falsy response means the helper gave up; keep 'error'.
                if response:
                    parsed = json.loads(response['content'])
                    sentiment = parsed.get('sentiment', 'unknown')
                    tokens_per_second = response['total_tokens_per_second']

                    throughput_data.append({
                        'model': model,
                        'run': run_num + 1,
                        'message_idx': idx,
                        'is_warmup': is_warmup,
                        'tokens_per_second': response['tokens_per_second'],
                        'total_tokens_per_second': response['total_tokens_per_second'],
                        'generation_tokens_per_second': response['generation_tokens_per_second'],
                        'eval_count': response['eval_count'],
                        'eval_duration_ns': response['eval_duration'],
                        'prompt_eval_count': response['prompt_eval_count'],
                        'prompt_eval_duration_ns': response['prompt_eval_duration'],
                        'total_duration_ns': response['total_duration']
                    })
            except Exception as e:
                # Best effort: record the failure and carry on with the run.
                print(f"Error: {e}")
                sentiment, tokens_per_second = 'error', 0

            run_results.append(sentiment)
            run_throughput.append(tokens_per_second)

        model_results.append(run_results)
        model_throughput.append(run_throughput)

        # Failed requests are stored as 0 and excluded from the average. If
        # every request failed there is nothing to average; np.mean([]) would
        # return nan and emit a RuntimeWarning.
        measured = [t for t in run_throughput if t > 0]
        if measured:
            avg_throughput = np.mean(measured)
            print(f"Average total throughput for run {run_num + 1}: {avg_throughput:.2f} tokens/sec")
        else:
            print(f"Average total throughput for run {run_num + 1}: n/a (no successful requests)")

    randomness_results['ollama'][model] = model_results
    print(f"Completed {model}")


### Save Results

In [None]:
# Make sure the output directory exists, then write the per-run labels as
# JSON and the per-request throughput rows as CSV.
Path("output_throughput").mkdir(exist_ok=True)

results_path = Path("output_throughput/randomness_results.json")
with results_path.open("w") as results_file:
    json.dump(randomness_results, results_file, indent=2)

throughput_df = pd.DataFrame(throughput_data)
throughput_df.to_csv("output_throughput/throughput_data.csv", index=False)

print("Results saved to output_throughput/")

## Part 2: Consistency Analysis

In [None]:
# Accumulates one summary row (dict) per model.
consistency_metrics = []


def calculate_agreement(run1, run2):
    """Compare two runs of labels position by position.

    Positions where either run holds the ``'error'`` marker are dropped
    before scoring.

    Args:
        run1: Labels from the first run.
        run2: Labels from the second run, paired with ``run1`` by position.

    Returns:
        tuple: ``(accuracy, kappa)`` from ``accuracy_score`` and
        ``cohen_kappa_score`` over the remaining pairs, or ``(0, 0)`` when no
        pair remains.
    """
    kept_first = []
    kept_second = []
    for label_a, label_b in zip(run1, run2):
        if label_a == 'error' or label_b == 'error':
            continue
        kept_first.append(label_a)
        kept_second.append(label_b)

    if len(kept_first) == 0:
        return 0, 0

    agreement = accuracy_score(kept_first, kept_second)
    kappa_value = cohen_kappa_score(kept_first, kept_second)
    return agreement, kappa_value

def _summarize_consistency(model, model_type, runs):
    """Summarize how well the repeated runs of one model agree with each other.

    Every unordered pair of runs is scored with ``calculate_agreement``; the
    returned row holds the mean, min, max and standard deviation of the
    pairwise accuracies and kappas.

    Args:
        model: Model identifier stored in the row.
        model_type: Backend label stored in the row ('GPT' or 'Ollama').
        runs: List of runs, each a list of labels.

    Returns:
        dict: One row for the consistency table.
    """
    agreements = []
    kappas = []

    for i, first_run in enumerate(runs):
        for second_run in runs[i + 1:]:
            acc, kappa = calculate_agreement(first_run, second_run)
            agreements.append(acc)
            kappas.append(kappa)

    return {
        'model': model,
        'model_type': model_type,
        'avg_accuracy': np.mean(agreements),
        'avg_kappa': np.mean(kappas),
        'min_accuracy': np.min(agreements),
        'max_accuracy': np.max(agreements),
        'std_accuracy': np.std(agreements),
        'min_kappa': np.min(kappas),
        'max_kappa': np.max(kappas),
        'std_kappa': np.std(kappas)
    }


# GPT rows first, then Ollama rows, each in the order the models were stored.
for model_type, backend in (('GPT', 'gpt'), ('Ollama', 'ollama')):
    for model, runs in randomness_results[backend].items():
        consistency_metrics.append(_summarize_consistency(model, model_type, runs))

consistency_df = pd.DataFrame(consistency_metrics)
consistency_df = consistency_df.sort_values('avg_kappa', ascending=False)
print("\nConsistency Metrics (sorted by Cohen's Kappa):")
print(consistency_df.to_string(index=False))

# pandas has no module-level save_csv (the call raised AttributeError);
# DataFrame.to_csv is the writer.
consistency_df.to_csv("output_throughput/consistency_metrics.csv", index=False)

### Visualize Consistency Results

In [None]:
# Horizontal bar chart of the average pairwise Cohen's kappa per model,
# lowest at the bottom, coloured by backend (green = GPT, blue = otherwise).
fig, ax = plt.subplots(1, 1, figsize=(14, 6))

consistency_df_sorted = consistency_df.sort_values('avg_kappa')
colors = ['#2ecc71' if mt == 'GPT' else '#3498db' for mt in consistency_df_sorted['model_type']]
bar_positions = range(len(consistency_df_sorted))

ax.barh(bar_positions, consistency_df_sorted['avg_kappa'], color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(consistency_df_sorted['model'])
ax.set_xlabel("Cohen's Kappa (Average across run pairs)")
ax.set_title("Model Consistency: Cohen's Kappa Between Runs")
# \u03ba is the Greek letter kappa; the label previously held the mis-decoded
# bytes "Îº" and rendered as garbage in the legend.
ax.axvline(x=0.8, color='red', linestyle='--', alpha=0.5, label='High Agreement (\u03ba>0.8)')
ax.legend()
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('output_throughput/consistency_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Consistency visualization saved to output_throughput/consistency_comparison.png")