In [1]:
import getpass
import os
import random
import time

import numpy as np
import pandas as pd
from datasets import Dataset
from openai import OpenAI
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
def generate_prompt(test_sample, num_few_shot_samples=3, few_shot_data=None):
    """Build a few-shot sentiment-classification prompt for one test text.

    The prompt is a fixed task description, followed by the first
    ``num_few_shot_samples`` entries of ``few_shot_data`` rendered as
    ``<text>: ... / <sentiment>: ...`` pairs, followed by the test text with an
    open ``<sentiment>:`` slot.

    Args:
        test_sample: Text to classify; interpolated into the final slot.
        num_few_shot_samples: Number of demonstrations to include. ``0``
            produces a zero-shot prompt.
        few_shot_data: Indexable, sized collection of mappings with ``'text'``
            and ``'label'`` keys. May be ``None`` only when no demonstrations
            are requested.

    Returns:
        The complete prompt string.

    Raises:
        ValueError: If ``num_few_shot_samples`` is negative or larger than the
            number of demonstrations available in ``few_shot_data``.
    """
    available = 0 if few_shot_data is None else len(few_shot_data)
    if not 0 <= num_few_shot_samples <= available:
        # Fail early with a readable message instead of an opaque
        # TypeError/IndexError half-way through building the prompt.
        raise ValueError(
            f"Requested {num_few_shot_samples} few-shot samples, "
            f"but few_shot_data provides {available}."
        )

    parts = [
        "Perform Sentiment classification task.\n"
        "Given the text assign a sentiment label from ['positive', 'negative', 'neutral'].\n"
        "Return label only without any other text.\n"
    ]

    # Index access (not slicing) so any container supporting few_shot_data[i]
    # keeps working exactly as before.
    for i in range(num_few_shot_samples):
        sample = few_shot_data[i]
        parts.append(f"\n<text>: {sample['text']}\n<sentiment>: {sample['label']}\n")

    # The test text always comes last, with the label left for the model to fill in.
    parts.append(f"\n<text>: {test_sample}\n<sentiment>:")

    return "".join(parts)

In [3]:
def sample_prompt_instances(train_dataset, num_samples, num_of_runs, seed):
    """Sample few-shot instances for each sentiment label.

    Args:
        train_dataset: Iterable of mappings with a ``'label'`` key. Samples
            labelled ``"positive"`` or ``"neutral"`` go to their own pools;
            every other sample is treated as negative.
        num_samples: ``1`` for one-shot experiments or ``3`` for three-shot
            experiments (one demonstration per label).
        num_of_runs: Number of experiment repetitions to sample for.
        seed: Seed applied to the global ``random`` module before drawing.

    Returns:
        For ``num_samples == 1``: a tuple ``(positive, neutral, negative)`` of
        lists, each holding ``num_of_runs`` samples.
        For ``num_samples == 3``: a tuple of ``num_of_runs`` lists, each of the
        form ``[positive_sample, neutral_sample, negative_sample]``.

    Raises:
        ValueError: If ``num_samples`` is neither 1 nor 3.
    """
    if num_samples not in (1, 3):
        # Previously this fell through and returned None, which only failed
        # later inside the caller.
        raise ValueError(f"num_samples must be 1 or 3, got {num_samples!r}")

    positive_samples = []
    neutral_samples = []
    negative_samples = []

    # Collect samples for each label
    for sample in train_dataset:
        if sample["label"] == "positive":
            positive_samples.append(sample)
        elif sample["label"] == "neutral":
            neutral_samples.append(sample)
        else:
            negative_samples.append(sample)

    random.seed(seed)
    # random.choices draws with replacement, so the same instance may be
    # selected for more than one run.
    pos_train_samples = random.choices(positive_samples, k=num_of_runs)
    neut_train_samples = random.choices(neutral_samples, k=num_of_runs)
    neg_train_samples = random.choices(negative_samples, k=num_of_runs)

    if num_samples == 1:
        # One list per label; entry i of a list is the demonstration for run i
        return pos_train_samples, neut_train_samples, neg_train_samples

    # One [positive, neutral, negative] triple per run, for any num_of_runs
    return tuple(
        [pos, neut, neg]
        for pos, neut, neg in zip(pos_train_samples, neut_train_samples, neg_train_samples)
    )

In [4]:
def _resolve_label(raw_label, str2int):
    """Map a raw model reply to its integer label, tolerating case and stray punctuation."""
    candidate = (raw_label or "").strip()
    if candidate in str2int:
        # Exact match: identical to the original lookup.
        return str2int[candidate]

    # Replies such as "Positive" or "'negative'." should not abort a whole run.
    cleaned = candidate.strip(" \t\r\n\"'`.,:;!").casefold()
    for name, value in str2int.items():
        if str(name).casefold() == cleaned:
            return value

    raise KeyError(
        f"Unexpected model output {raw_label!r}; expected one of {list(str2int)}"
    )


def run_test(test_data, chosen_samples, num_samples, client, str2int,
             model='gpt-3.5-turbo-1106', temperature=0):
    """Classify every text in ``test_data`` with a chat-completion model.

    For each text a prompt is built with ``generate_prompt`` and sent as a
    single user message through ``client.chat.completions.create``.

    Args:
        test_data: Iterable of texts to classify.
        chosen_samples: Few-shot demonstrations passed to ``generate_prompt``.
        num_samples: Number of demonstrations to put in each prompt.
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client in this notebook).
        str2int: Mapping from label name to integer label.
        model: Model identifier sent with each request.
        temperature: Sampling temperature; the default 0 asks for the most
            deterministic outputs.

    Returns:
        A tuple ``(predicted_labels, error_analysis_list)``: the integer label
        per text, and one ``{"sample", "pred_label"}`` dict per text holding
        the raw model reply for later error analysis.

    Raises:
        KeyError: If a reply cannot be matched to any key of ``str2int``.
    """
    predicted_labels = []
    error_analysis_list = []  # for further error analysis

    for text in tqdm(test_data):
        prompt = generate_prompt(text, num_samples, chosen_samples)

        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=temperature
        )
        # content may be None (e.g. an empty reply); _resolve_label handles that
        # instead of failing on None.strip().
        predicted_label = completion.choices[0].message.content
        predicted_labels.append(_resolve_label(predicted_label, str2int))
        error_analysis_list.append({"sample": text, "pred_label": predicted_label})

    return predicted_labels, error_analysis_list

In [6]:
def get_avg_score(reports):
    """Average per-class and overall scores over several classification reports.

    Args:
        reports: Iterable of report dicts shaped like the output of
            ``classification_report(..., output_dict=True)``, with the class
            sections ``'-1'``, ``'0'``, ``'1'`` and the summary sections
            ``'macro avg'`` and ``'weighted avg'``.

    Returns:
        Dict mapping a display name to the mean (``np.mean``) of that score
        across all reports: precision/recall/f1 for each class, followed by
        the macro and weighted f1-scores.
    """
    # (output key, report section, metric). Output keys are kept exactly as
    # they were, spelling included, because they are printed downstream.
    layout = (
        ('Negative precison', '-1', 'precision'),
        ('Negative recall', '-1', 'recall'),
        ('Negative f1-score', '-1', 'f1-score'),
        ('Netural precison', '0', 'precision'),
        ('Netural recall', '0', 'recall'),
        ('Netural f1-score', '0', 'f1-score'),
        ('Positive precison', '1', 'precision'),
        ('Positive recall', '1', 'recall'),
        ('Positive f1-score', '1', 'f1-score'),
        ('Macro avg', 'macro avg', 'f1-score'),
        ('Weighted avg', 'weighted avg', 'f1-score'),
    )

    # Gather every score in a single pass over the reports.
    collected = {display_name: [] for display_name, _, _ in layout}
    for single_report in reports:
        for display_name, section, metric in layout:
            collected[display_name].append(single_report[section][metric])

    return {display_name: np.mean(scores) for display_name, scores in collected.items()}

In [7]:
def test(samples, num_samples, test_name, test_data, golden_labels, client, str2int):
    """Run one experiment per entry of ``samples`` and report averaged scores.

    Each entry is evaluated with ``run_test`` and scored with
    ``classification_report``. The reports are averaged with
    ``get_avg_score`` and printed. The raw predictions of the LAST run only
    are written to ``"<test_name>.txt"`` for error analysis.

    Args:
        samples: Iterable with one entry per run. For ``num_samples == 1``
            each entry is a single demonstration; otherwise each entry is a
            list of demonstrations.
        num_samples: Number of demonstrations per prompt.
        test_name: Experiment name used in the printed header and as the
            output file stem.
        test_data: Texts to classify.
        golden_labels: Reference labels aligned with ``test_data``.
        client: API client forwarded to ``run_test``.
        str2int: Mapping from label name to integer label.

    Raises:
        ValueError: If ``samples`` is empty.
    """
    samples = list(samples)
    if not samples:
        # Without any run there is nothing to average or to save.
        raise ValueError("samples must contain at least one few-shot configuration")

    reports = []
    err_list = []

    for sample in samples:
        # One-shot runs receive a bare demonstration; wrap it so that
        # run_test always gets a list of demonstrations.
        few_shot_data = [sample] if num_samples == 1 else sample
        predicted_labels, err_list = run_test(test_data, few_shot_data, num_samples=num_samples, client=client, str2int=str2int)
        reports.append(classification_report(golden_labels, predicted_labels, output_dict=True))

    avg_results = get_avg_score(reports)  # average the results

    print(f"Results for {test_name}, avg. for {len(reports)} runs:")
    for metric, result in avg_results.items():
        print(f'{metric:25}{result:.4f}')

    # Saving results for error-analysis (predictions of the last run)
    with open(f"{test_name}.txt", "w", encoding="utf-8") as f:
        for entry, gold in zip(err_list, golden_labels):
            f.write(f"pred: {entry['pred_label']}, gold: {gold}\n")
            f.write(f" {entry['sample']}\n\n")

In [8]:
data_path = "./data"

train_path = os.path.join(data_path, "train_data.csv")
test_path = os.path.join(data_path, "test_data.csv")

# Integer label <-> label name used in the prompts
int2str = {-1: 'negative', 0: 'neutral', 1: 'positive'}
str2int = {v: k for k, v in int2str.items()}

train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)
train_df['label'] = train_df['label'].map(int2str)

# Series.map turns every value missing from int2str into NaN; stop here rather
# than let such rows leak into the few-shot prompts.
if train_df['label'].isna().any():
    raise ValueError(
        f"{train_path} contains labels outside of {sorted(int2str)}"
    )

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# list() keeps these plain Python lists whatever column type the installed
# `datasets` version returns.
test_texts, golden_labels = list(test_dataset['text']), list(test_dataset['label'])

# Never hard-code the key in the notebook: use the one from the environment and
# only prompt for it (without echoing) when it is missing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass("OpenAI API key: ")
client = OpenAI()

In [None]:
# Demonstrations for the one-shot experiments: one list per label
# (positive, neutral, negative), each holding one sample per run
one_shot_all_samples = sample_prompt_instances(train_dataset, num_samples=1, num_of_runs=3, seed=7)

# Run the positive-, neutral- and negative-sample experiments in that order
one_shot_test_names = (
    "1-shot exp. with positive sample",
    "1-shot exp. with neutral sample",
    "1-shot exp. with negative sample",
)
for label_position, one_shot_test_name in enumerate(one_shot_test_names):
    test(one_shot_all_samples[label_position], 1, one_shot_test_name, test_texts, golden_labels, client, str2int)

In [127]:
# Samples formatted for three-shot experiments: one
# [positive, neutral, negative] group of demonstrations per run
three_shot_samples_test = sample_prompt_instances(
    train_dataset,
    num_samples=3,
    num_of_runs=3,
    seed=7,
)

test(
    three_shot_samples_test,
    3,
    "3-shot exp.",
    test_texts,
    golden_labels,
    client,
    str2int,
)