Loads the libraries necessary for the script and also loads the environment variables in the .env file.

In [1]:
%reload_ext dotenv
%dotenv

import openai
import json
import csv
import time
import ollama
import sqlite3
import hashlib

from pathlib import Path
from textwrap import dedent

from tqdm import tqdm

# Default location of the SQLite file that stores already-parsed model answers.
CACHE_DB = Path("temp/gold_cache.db")

def init_cache(db_path=CACHE_DB):
    """Open the SQLite result cache, creating the file's folder and table if needed.

    Args:
        db_path: Location of the database file (defaults to ``CACHE_DB``).

    Returns:
        An open ``sqlite3.Connection``. It is created with
        ``isolation_level=None`` and a 30 second busy timeout, and the
        ``journal_mode=WAL`` / ``synchronous=FULL`` pragmas are applied to it.
    """
    database_file = Path(db_path)
    database_file.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_path, timeout=30, isolation_level=None)
    for pragma in ("PRAGMA journal_mode=WAL;", "PRAGMA synchronous=FULL;"):
        connection.execute(pragma)

    # One row per (message, model, prompting technique) key.
    connection.execute("""
        CREATE TABLE IF NOT EXISTS cache (
            key TEXT PRIMARY KEY,
            model TEXT,
            prompt_hash TEXT,
            result TEXT,
            ts INTEGER
        )
    """)
    return connection

def hash_prompt(prompt):
    """Return the SHA-256 hex digest of a prompt's canonical JSON form.

    The prompt is serialised with sorted keys and without ASCII escaping, then
    encoded as UTF-8, so dictionaries that differ only in key order hash the same.
    """
    canonical = json.dumps(prompt, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(canonical.encode('utf-8'))
    return digest.hexdigest()

def make_key(message_id, model, shot_label):
    """Build the cache key ``<message_id>::<model>::<shot_label>``."""
    return "{}::{}::{}".format(message_id, model, shot_label)

def get_cached(conn, key):
    """Look up ``key`` in the cache table.

    Returns:
        The JSON-decoded ``result`` column of the matching row, or ``None``
        when no row has that key.
    """
    row = conn.execute("SELECT result FROM cache WHERE key = ?", (key,)).fetchone()
    if row is None:
        return None
    return json.loads(row[0])

def set_cache(conn, key, model, prompt, result):
    """Insert or overwrite the cache row stored under ``key``.

    The row holds the model name, the hash of ``prompt`` (via ``hash_prompt``),
    ``result`` serialised as JSON, and the current Unix time in whole seconds.
    """
    prompt_digest = hash_prompt(prompt)
    stored_at = int(time.time())
    serialised = json.dumps(result, ensure_ascii=False)
    row = (key, model, prompt_digest, serialised, stored_at)
    conn.execute(
        "INSERT OR REPLACE INTO cache (key, model, prompt_hash, result, ts) VALUES (?, ?, ?, ?, ?)",
        row,
    )

Setup for utilizing the OpenAI and Ollama APIs. Also loads the dataset.

In [2]:
# Connection to the on-disk result cache shared by all collection cells.
cache_conn = init_cache()

# API clients: OpenAI with its default configuration, Ollama with a 120 timeout.
openai_client = openai.OpenAI()
ollama_client = ollama.Client(timeout=120)

# Load the non-empty "Text" column of the semicolon-separated gold dataset.
# The position of a message in this list is later used as its cache key id.
dataset = []
with open("data/github_gold.csv", newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    for row in reader:
        # DictReader fills the missing fields of a short row with None, so
        # use .get() and a truthiness check instead of calling .strip() on
        # a value that may not be a string.
        text = row.get('Text')
        if text and text.strip():
            dataset.append(text)

Defines helper methods for asking questions to GPT and Ollama. The Ollama version contains a retry loop because, especially on Windows, Ollama sometimes fails to respond.

In [None]:
def ask_chatgpt(prompts, model):
    """Send ``prompts`` to an OpenAI chat model and return the reply text.

    The request asks for a JSON object response and uses temperature 0.

    Args:
        prompts: List of chat messages (dicts with ``role`` and ``content``).
        model: Name of the OpenAI model to query.

    Returns:
        The content of the first choice in the completion.
    """
    request = {
        "model": model,
        "messages": prompts,
        "response_format": {"type": "json_object"},
        "temperature": 0,
    }
    completion = openai_client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content
        
def ask_ollama(prompts, model, max_retries=5, retry_delay=5):
    """Send ``prompts`` to an Ollama chat model, retrying when the call fails.

    The request asks for JSON output, disables streaming and uses
    temperature 0, ``num_ctx`` 8192 and ``num_predict`` -1.

    Args:
        prompts: List of chat messages (dicts with ``role`` and ``content``).
        model: Name of the Ollama model to query.
        max_retries: Number of retries after the first failed attempt
            (the default of 5 gives six attempts in total).
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The content of the model's reply, or ``None`` if every attempt failed.
    """
    options = {
        "temperature": 0,
        "num_ctx": 8192,
        "num_predict": -1,
    }
    total_attempts = max_retries + 1
    for attempt in range(1, total_attempts + 1):
        try:
            response = ollama_client.chat(
                model=model,
                messages=prompts,
                format="json",
                stream=False,
                options=options,
            )
            return response['message']['content']
        except Exception as e:
            # Best effort by design: any failure of the call is retried.
            if attempt == total_attempts:
                # Give up immediately; waiting again would only waste time.
                print(f"Failed with {e}. Giving up after {attempt} attempts.")
                return None
            print(f"Failed with {e}. Retrying...")
            time.sleep(retry_delay)
    # Only reached when max_retries is negative and no attempt was made.
    return None

This cell defines helper methods that generate the prompt depending on the structure utilized by the model (OpenAI and Ollama can differ) and the prompt engineering technique utilized.

In [None]:
def get_system_prompt():
    """Return the system-role chat message holding the classification instructions.

    The instruction text is kept exactly as written (including its wording),
    because it is part of what is sent to the model.
    """
    instructions = dedent(f"""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {{"sentiment": "positive"}}, {{"sentiment": "negative"}} OR {{"sentiment": "neutral"}}.
            """)
    return {"role": "system", "content": instructions}

def get_system_prompt_with_message(message):
    """Return a single user-role chat message with instructions and the message.

    Args:
        message: The pull-request message to classify.

    Returns:
        A dict with ``role`` set to ``"user"`` and ``content`` holding the
        classification instructions followed by ``Message: "<message>"``.
    """
    # The template is dedented BEFORE the message is inserted. Dedenting the
    # already-filled text let the message influence the common margin: a
    # message containing a line break followed by unindented text turned
    # dedent() into a no-op and left the whole prompt indented, and
    # whitespace-only lines inside the message were rewritten.
    # For single-line messages the produced text is unchanged.
    template = dedent("""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {{"sentiment": "positive"}}, {{"sentiment": "negative"}} OR {{"sentiment": "neutral"}}.

                Message: "{message}"
            """)
    return {"role": "user", "content": template.format(message=message)}

def get_system_prompt_with_message_and_examples(message, num_examples, cot=False):
    """Return a single user-role chat message with instructions, examples and the message.

    Args:
        message: The pull-request message to classify.
        num_examples: Number of examples per sentiment class, passed to
            ``get_example_strings``.
        cot: When true, each example is followed by its reasoning line.

    Returns:
        A dict with ``role`` set to ``"user"`` and ``content`` holding the
        instructions, the example block and ``Message: "<message>"``.
    """
    # The template is dedented BEFORE the examples and the message are
    # inserted. The example block contains unindented lines, so dedenting
    # the already-filled text found no common margin and left every
    # instruction line indented.
    # NOTE(review): this changes the text sent to the model; answers already
    # stored in the cache were produced with the previous, indented prompt.
    template = dedent("""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {{"sentiment": "positive"}}, {{"sentiment": "negative"}} OR {{"sentiment": "neutral"}}.

                Here are some examples to help you get started:
                {examples}

                Message: "{message}"
            """)
    examples = get_example_strings(num_examples, cot)
    return {"role": "user", "content": template.format(examples=examples, message=message)}

def get_examples(num_positive, num_negative, num_neutral):
    """Return the first examples of each sentiment class.

    Every example is a dict with a ``message`` and the ``reasoning`` behind
    its label; three examples are available per class.

    Args:
        num_positive: How many positive examples to return.
        num_negative: How many negative examples to return.
        num_neutral: How many neutral examples to return.

    Returns:
        A tuple ``(positive, neutral, negative)`` of lists. Note that the
        return order differs from the parameter order.
    """
    def _pool(*pairs):
        # Turn (message, reasoning) pairs into the dict shape used by callers.
        return [{"message": text, "reasoning": why} for text, why in pairs]

    positive_pool = _pool(
        (
            "This is just to good to be true. I'm almost crying 'caus of joy. Thanks a lot, this makes life a lot easier!",
            "Expresses joy and gratitude, with phrases like 'almost crying' indicating positive emotion.",
        ),
        (
            "Finally! Thanks for the fix!",
            "Shows satisfaction and appreciation (finally, thanks), indicating a positive reaction to the change.",
        ),
        (
            "hehe yeah I first asked to myself, why we have to ask for defined? and not just initialize the value in the initializer. I started to delete that stuff and when I saw the initialize method I realize about the typo :).",
            "Lighthearted tone (hehe, smiley) and amusement about the mistake; no anger, overall friendly/positive.",
        ),
    )

    neutral_pool = _pool(
        (
            "This is incorrectly placed. It should be under the method signature.",
            "A factual correction about code structure with no praise, anger, or other emotional language.",
        ),
        (
            "I'm not sure we need to do this based on checksums.  If somebody touches a file, we should probably send a notification of a change.  I think we should avoid the checksums unless there's a need.",
            "Reasoned technical discussion and recommendation; uncertainty is about design choice, not an emotional reaction.",
        ),
        (
            "`$return` will only have the last `$child`",
            "A neutral observation about program behavior/bug (what the variable contains), stated without sentiment.",
        ),
    )

    negative_pool = _pool(
        (
            "This code again! Will this hack never tire of creating new issues.",
            "Shows exasperation and blame ('hack', 'again'), expressing frustration/anger toward the code.",
        ),
        (
            "Ahhh yes. Brain fail, sorry!",
            "Expresses self-frustration and apology (brain fail), which reflects negative affect rather than neutrality.",
        ),
        (
            "Hate the double colons.  Bring back the old style!",
            "Explicitly conveys dislike ('hate') and a complaint about style, indicating negative sentiment.",
        ),
    )

    return (
        positive_pool[:num_positive],
        neutral_pool[:num_neutral],
        negative_pool[:num_negative],
    )

def get_example_shots(num):
    """Return the examples as alternating user/assistant chat messages.

    For every example (positive first, then neutral, then negative) a user
    message with the example text is followed by an assistant message with
    the expected answer ``{"sentiment": "<label>"}``.

    Args:
        num: Number of examples to take from each sentiment class.

    Returns:
        A list of chat message dicts with ``role`` and ``content``.
    """
    positive, neutral, negative = get_examples(num, num, num)
    labelled = (
        ("positive", positive),
        ("neutral", neutral),
        ("negative", negative),
    )

    example_shots = []
    for label, examples in labelled:
        # The previous f-string used single braces, which Python reads as a
        # replacement field with a format spec and rejects at runtime with
        # ValueError. json.dumps produces {"sentiment": "<label>"} directly.
        answer = json.dumps({"sentiment": label})
        for example in examples:
            example_shots.append({
                "role": "user",
                "content": example["message"],
            })
            example_shots.append({
                "role": "assistant",
                "content": answer,
            })

    return example_shots
        
def get_example_strings(num, cot=False):
    """Render the examples as one numbered text block for inclusion in a prompt.

    Examples are numbered from 1 across all classes (positive, neutral,
    negative). Each contributes an ``Example`` line and a ``Response`` line;
    with ``cot`` enabled a ``Reasoning`` line follows as well.

    Args:
        num: Number of examples to take from each sentiment class.
        cot: Whether to append the reasoning of every example.

    Returns:
        The concatenated text, every line terminated by a newline.
    """
    positive, neutral, negative = get_examples(num, num, num)
    by_label = {"positive": positive, "neutral": neutral, "negative": negative}

    pieces = []
    counter = 0
    for label, examples in by_label.items():
        for example in examples:
            counter += 1
            pieces.append(
                f'Example {counter}: {example["message"]}\nResponse for Example {counter}: {{"sentiment": "{label}"}}\n'
            )
            if cot:
                pieces.append(f'Reasoning for Example {counter}: {example["reasoning"]}\n')

    return "".join(pieces)

def get_final_prompt(message):
    """Wrap ``message`` in a user-role chat message."""
    return dict(role="user", content=message)

# Data collection for the first research question (RQ1)

This first cell collects data from the GPT models (zero-shot).

In [None]:
# Keys already stored in the cache, loaded once so the per-message check is a
# set lookup.
cached_keys = {row[0] for row in cache_conn.execute("SELECT key FROM cache")}

results = [None] * len(dataset)

GPT_MODELS = ["gpt-4o-2024-05-13", "gpt-4o-mini-2024-07-18"]

# Create the output folder up front so the final write cannot fail on a
# missing directory after all requests were made.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

for model in GPT_MODELS:
    run_name = f"{model}_0shot.json"
    output_path = output_dir / run_name
    if output_path.exists():
        # This run was already written to disk.
        continue

    # Start every run from an empty list: a message whose answer could not be
    # parsed must stay None instead of keeping the previous model's answer.
    results = [None] * len(dataset)

    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        key = make_key(index, model, "0shot")
        if key in cached_keys:
            results[index] = get_cached(cache_conn, key)
            continue
        prompt = [get_system_prompt(), get_final_prompt(message)]
        model_response = ask_chatgpt(prompt, model)
        try:
            parsed = json.loads(model_response)
        except (TypeError, ValueError):
            # No answer (None) or invalid JSON: leave this entry as None.
            continue
        if parsed:
            results[index] = parsed
            set_cache(cache_conn, key, model, prompt, parsed)
            cached_keys.add(key)

    with open(output_path, "w", encoding="utf-8") as jsonfile:
        json.dump(results, jsonfile)

This second cell collects the data from the local models, without providing any examples (zero-shot).

In [None]:
# Keys already stored in the cache, loaded once so the per-message check is a
# set lookup.
cached_keys = {row[0] for row in cache_conn.execute("SELECT key FROM cache")}

results = [None] * len(dataset)

OLLAMA_MODELS = ["mistral-nemo:12b", "gemma2:9b", "llama3.1:8b", "mistral-small:22b", "gemma2:27b", "deepseek-r1:8b", "deepseek-r1:32b", "llama3.1:70b"]

# NOTE(review): this cell writes to output_gold/ while the other collection
# cells of this notebook use output/ - confirm which folder is intended.
# The folder is created up front so the final write cannot fail on a missing
# directory after all requests were made.
output_dir = Path("output_gold")
output_dir.mkdir(parents=True, exist_ok=True)

for model in OLLAMA_MODELS:
    # ":" is replaced in the file name only; the cache key keeps the model name.
    run_name = model.replace(":", "-") + "_0shot.json"
    output_path = output_dir / run_name
    if output_path.exists():
        # This run was already written to disk.
        continue

    # Start every run from an empty list: a message whose answer could not be
    # parsed must stay None instead of keeping the previous model's answer.
    results = [None] * len(dataset)

    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        key = make_key(index, model, "0shot")
        if key in cached_keys:
            results[index] = get_cached(cache_conn, key)
            continue
        prompt = [get_system_prompt_with_message(message)]
        model_response = ask_ollama(prompt, model)
        try:
            parsed = json.loads(model_response)
        except (TypeError, ValueError):
            # No answer (None after all retries) or invalid JSON: leave as None.
            continue
        if parsed:
            results[index] = parsed
            set_cache(cache_conn, key, model, prompt, parsed)
            cached_keys.add(key)

    with open(output_path, "w", encoding="utf-8") as jsonfile:
        json.dump(results, jsonfile)

# Data collection for the second research question (RQ2)

This first cell collects the data for the OpenAI models, for the one-shot and few-shot cases.

In [None]:
# Keys already stored in the cache, loaded once so the per-message check is a
# set lookup.
cached_keys = {row[0] for row in cache_conn.execute("SELECT key FROM cache")}

results = [None] * len(dataset)

GPT_MODELS = ["gpt-4o-mini-2024-07-18", "gpt-4o-2024-05-13"]

# Create the output folder up front so the final write cannot fail on a
# missing directory after all requests were made.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

for model in GPT_MODELS:
    # 1 = one-shot, 3 = few-shot (examples per sentiment class).
    for number in [1, 3]:
        shot_label = f"{number}shot"
        run_name = f"{model}_{shot_label}.json"
        output_path = output_dir / run_name
        if output_path.exists():
            # This run was already written to disk.
            continue

        # Start every run from an empty list: a message whose answer could
        # not be parsed must stay None instead of keeping the answer of the
        # previous run.
        results = [None] * len(dataset)

        # The example messages do not depend on the message being classified.
        example_shots = get_example_shots(number)

        for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
            key = make_key(index, model, shot_label)
            if key in cached_keys:
                results[index] = get_cached(cache_conn, key)
                continue
            prompt = [get_system_prompt()] + example_shots + [get_final_prompt(message)]
            model_response = ask_chatgpt(prompt, model)
            try:
                parsed = json.loads(model_response)
            except (TypeError, ValueError):
                # No answer (None) or invalid JSON: leave this entry as None.
                continue
            if parsed:
                results[index] = parsed
                set_cache(cache_conn, key, model, prompt, parsed)
                cached_keys.add(key)

        with open(output_path, "w", encoding="utf-8") as jsonfile:
            json.dump(results, jsonfile)

This second cell collects the data for the ollama models, for the one-shot and few-shot cases.

In [None]:
# Keys already stored in the cache, loaded once so the per-message check is a
# set lookup.
cached_keys = {row[0] for row in cache_conn.execute("SELECT key FROM cache")}

results = [None] * len(dataset)

OLLAMA_MODELS = ["mistral-nemo:12b", "gemma2:9b", "llama3.1:8b", "mistral-small:22b", "gemma2:27b", "deepseek-r1:8b", "deepseek-r1:32b", "llama3.1:70b"]

# Create the output folder up front so the final write cannot fail on a
# missing directory after all requests were made.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

for model in OLLAMA_MODELS:
    # 1 = one-shot, 3 = few-shot (examples per sentiment class).
    for number in [1, 3]:
        shot_label = f"{number}shot"
        # ":" is replaced in the file name only; the cache key keeps the model name.
        run_name = model.replace(":", "-") + f"_{shot_label}.json"
        output_path = output_dir / run_name
        if output_path.exists():
            # This run was already written to disk.
            continue

        # Start every run from an empty list: a message whose answer could
        # not be parsed must stay None instead of keeping the answer of the
        # previous run.
        results = [None] * len(dataset)

        for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
            key = make_key(index, model, shot_label)
            if key in cached_keys:
                results[index] = get_cached(cache_conn, key)
                continue
            prompt = [get_system_prompt_with_message_and_examples(message, number)]
            model_response = ask_ollama(prompt, model)
            try:
                parsed = json.loads(model_response)
            except (TypeError, ValueError):
                # No answer (None after all retries) or invalid JSON: leave as None.
                continue
            if parsed:
                results[index] = parsed
                set_cache(cache_conn, key, model, prompt, parsed)
                cached_keys.add(key)

        with open(output_path, "w", encoding="utf-8") as jsonfile:
            json.dump(results, jsonfile)

This third cell collects the data for the OpenAI models, for the chain of thought cases.

In [None]:
# Keys already stored in the cache, loaded once so the per-message check is a
# set lookup.
cached_keys = {row[0] for row in cache_conn.execute("SELECT key FROM cache")}

results = [None] * len(dataset)

GPT_MODELS = ["gpt-4o-mini-2024-07-18", "gpt-4o-2024-05-13"]

# Examples per sentiment class in the chain-of-thought prompt. This cell used
# to read `number`, a loop variable left over from the few-shot cells (whose
# last value is 3); it raised NameError when those cells had not been run.
COT_NUM_EXAMPLES = 3

# Create the output folder up front so the final write cannot fail on a
# missing directory after all requests were made.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

for model in GPT_MODELS:
    run_name = f"{model}_cotshot.json"
    output_path = output_dir / run_name
    if output_path.exists():
        # This run was already written to disk.
        continue

    # Start every run from an empty list: a message whose answer could not be
    # parsed must stay None instead of keeping the previous model's answer.
    results = [None] * len(dataset)

    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        key = make_key(index, model, "cotshot")
        if key in cached_keys:
            results[index] = get_cached(cache_conn, key)
            continue
        prompt = [get_system_prompt_with_message_and_examples(message, COT_NUM_EXAMPLES, cot=True)]
        model_response = ask_chatgpt(prompt, model)
        try:
            parsed = json.loads(model_response)
        except (TypeError, ValueError):
            # No answer (None) or invalid JSON: leave this entry as None.
            continue
        if parsed:
            results[index] = parsed
            set_cache(cache_conn, key, model, prompt, parsed)
            cached_keys.add(key)

    with open(output_path, "w", encoding="utf-8") as jsonfile:
        json.dump(results, jsonfile)

This fourth cell collects the data for the ollama models, for the chain of thought cases.

In [None]:
# Keys already stored in the cache, loaded once so the per-message check is a
# set lookup.
cached_keys = {row[0] for row in cache_conn.execute("SELECT key FROM cache")}

results = [None] * len(dataset)

OLLAMA_MODELS = ["mistral-nemo:12b", "gemma2:9b", "llama3.1:8b", "mistral-small:22b", "gemma2:27b", "deepseek-r1:8b", "deepseek-r1:32b", "llama3.1:70b"]

# Examples per sentiment class in the chain-of-thought prompt. This cell used
# to read `number`, a loop variable left over from the few-shot cells (whose
# last value is 3); it raised NameError when those cells had not been run.
COT_NUM_EXAMPLES = 3

# Create the output folder up front so the final write cannot fail on a
# missing directory after all requests were made.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

for model in OLLAMA_MODELS:
    # ":" is replaced in the file name only; the cache key keeps the model name.
    run_name = model.replace(":", "-") + "_cotshot.json"
    output_path = output_dir / run_name
    if output_path.exists():
        # This run was already written to disk.
        continue

    # Start every run from an empty list: a message whose answer could not be
    # parsed must stay None instead of keeping the previous model's answer.
    results = [None] * len(dataset)

    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        key = make_key(index, model, "cotshot")
        if key in cached_keys:
            results[index] = get_cached(cache_conn, key)
            continue
        prompt = [get_system_prompt_with_message_and_examples(message, COT_NUM_EXAMPLES, cot=True)]
        model_response = ask_ollama(prompt, model)
        try:
            parsed = json.loads(model_response)
        except (TypeError, ValueError):
            # No answer (None after all retries) or invalid JSON: leave as None.
            continue
        if parsed:
            results[index] = parsed
            set_cache(cache_conn, key, model, prompt, parsed)
            cached_keys.add(key)

    with open(output_path, "w", encoding="utf-8") as jsonfile:
        json.dump(results, jsonfile)