In [None]:
import copy
import math
import os
import pickle
import re
import sys

import pandas
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

: 

In [None]:
from bokeh.plotting import figure, show

In [None]:
# Data path - defaults to the original Downloads folder. Set the TOS_DATA_PATH
# environment variable to point the notebook at a different data directory
# without editing this cell.
DATA_PATH = os.environ.get('TOS_DATA_PATH') or '/Users/michalprusek/Downloads/data'

In [None]:
# Create an OpenAI-compatible client for the DeepInfra endpoint.
# The token is taken from the DEEPINFRA_TOKEN environment variable when it is set
# (and non-empty); otherwise the shared lab key below is used as a fallback so the
# notebook keeps working as before.
# NOTE(review): a key stored in a notebook is visible to everyone who can read the
# file - prefer supplying your own token through the environment.
openai = OpenAI(
    api_key=(
        os.environ.get("DEEPINFRA_TOKEN")
        or "Sgh76eVtGLUcsBj8jOFgNEkRxKtRszzB" # please only use this for lab purposes; there is a strict usage limit on it
    ),
    base_url="https://api.deepinfra.com/v1/openai",
)

In [None]:
def query_llm(prompt, temperature = 0.0, max_tokens = None, model = "google/gemma-3-4b-it"):
    """Send a single-turn user prompt to the chat-completions endpoint and return the reply text.

    Uses the module-level `openai` client.

    Args:
        prompt: Text sent as the only (user) message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Optional generation limit. Non-positive values are reported
            on stdout and replaced by `None` (no explicit limit is sent).
        model: Model identifier forwarded to the API.

    Returns:
        The content of the first choice as a string. An empty string is returned
        when the API reports no content (`None`), so callers can safely
        concatenate the result or run regular expressions on it.
    """
    if max_tokens is not None and max_tokens <= 0:
        print(f'ERROR: invalid max_tokens number: {max_tokens}')
        max_tokens = None
    chat_completion = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    content = chat_completion.choices[0].message.content
    # `content` is optional in the response schema; normalise a missing value to ''.
    return content if content is not None else ''

In [None]:
# Unique document identifiers of the train / validation / test splits, read from
# the 'document' column of the three tab-separated split files.
_split_files = (
    f'{DATA_PATH}/claudette_train.tsv',
    f'{DATA_PATH}/claudette_val.tsv',
    f'{DATA_PATH}/claudette_test.tsv',
)
train_doc_ids, val_doc_ids, test_doc_ids = (
    pandas.read_csv(split_file, sep='\t')['document'].unique()
    for split_file in _split_files
)

In [None]:
# Full clause table; rows are assigned to a split by their 'document' value.
df = pandas.read_csv(f'{DATA_PATH}/tos_dataset.csv')
df_train, df_val, df_test = (
    df.loc[df['document'].isin(doc_ids)]
    for doc_ids in (train_doc_ids, val_doc_ids, test_doc_ids)
)
# Training rows split by the binary 'label' column (0 -> negative, 1 -> positive).
df_train_neg, df_train_pos = (
    df_train.loc[df_train['label'] == label_value]
    for label_value in (0, 1)
)
# Column names of the individual unfairness categories.
unfairness_categories = ['A', 'CH', 'CR', 'J', 'LAW', 'LTD', 'TER', 'USE']

In [None]:
# "dev" negatives / positives, selected by the 'label' column.
# NOTE(review): both frames are built from `df_train`, so they are identical to
# `df_train_neg` / `df_train_pos` - confirm whether `df_val` was intended here.
df_dev_neg, df_dev_pos = (
    df_train.loc[df_train['label'] == label_value]
    for label_value in (0, 1)
)

In [None]:
# All positively labelled training rows.
df_train_pos = df_train.loc[df_train['label'] == 1]
# Training rows flagged (== 1) in each unfairness category column, keyed by category.
df_train_pos_per_cat = {
    category: df_train.loc[df_train[category] == 1]
    for category in unfairness_categories
}

In [None]:
def make_answer_instruction(n_words = 50):
    """Build the answer-format instruction appended to a prompt.

    Args:
        n_words: Word limit mentioned in the instruction text.

    Returns:
        A sentence asking for an answer that starts with "yes" or "no",
        followed by a justification of at most `n_words` words.
    """
    instruction = f'Start your answer with "yes" or "no" and then justify your response in no more than {n_words} words.'
    return instruction

In [None]:
# Regular expressions that mark a response as an affirmative answer.
# The single pattern matches only at the very start of the response: at most one
# leading whitespace or double-quote character, then "Yes"/"yes", then a period,
# comma or whitespace character.
yes_res = [r'^[\s"]?[Yy]es[\.,\s]']

# Prompt prefix; it ends with an opening double quote that precedes the clause text.
system_prompt = 'You are a legal expert on consumer protection law. Consider the following online terms of service clause: "'

# One yes/no question ('fairness_q') per unfairness category, keyed by category code.
legal_standards = {
    'A': {
        'fairness_q': 'Does this clause describe an arbitration dispute resolution process that is not fully optional to the consumer?'
    },
    'CH': {
        'fairness_q': 'Does this clause specify conditions under which the service provider could amend and modify the terms of service and/or the service itself?'
    },
    'CR': {
        'fairness_q': "Does this clause indicate conditions for content removal in the service provider's full discretion, and/or at any time for any or no reasons and/or without notice nor possibility to retrieve the content."
    },
    'J': {
        'fairness_q': "Does this clause state that any judicial proceeding is to be conducted in a place other than the consumer's residence (i.e. in a different city, different country)?"
    },
    'LAW': {
        # Double-quoted because the text contains an apostrophe; with single quotes
        # the literal ended at "consumer" and the cell failed with a SyntaxError.
        'fairness_q': "Does the clause define the applicable law as different from the law of the consumer's country of residence?"
    },
    'LTD': {
        'fairness_q': 'Does this clause stipulate that duties to pay damages by the provider are limited or excluded?'
    },
    'TER': {
        'fairness_q': 'Does this clause stipulate that the service provider may suspend or terminate the service at any time for any or no reasons and/or without notice?'
    },
    'USE': {
        'fairness_q': 'Does this clause stipulate that the consumer is bound by the terms of use of a specific service, simply by using the service, without even being required to mark that he or she has read and accepted them?'
    },
}

In [None]:
# Number of documents, then number of clauses, per split (train / val / test).
print(*(len(doc_ids) for doc_ids in (train_doc_ids, val_doc_ids, test_doc_ids)), sep=' / ')
print(*(len(split_df) for split_df in (df_train, df_val, df_test)), sep=' / ')

In [None]:
def sample_dataset(df, category = None, df_neg = None, balance = True, random_negatives = False, max_pos_n = None, seed = None):
    """Select positive and negative example sets for one evaluation run.

    Positives are the rows of `df` whose `category` column equals 1; when
    `category` is None the generic 'label' column is used instead.

    Negatives come from one of two pools:
      * `random_negatives=True`: rows of the module-level `df_train` whose
        category (or 'label') column equals 0. This takes precedence over `df_neg`.
      * otherwise `df_neg`, used as provided.

    Args:
        df: DataFrame the positives are drawn from.
        category: Name of the category column, or None to use 'label'.
        df_neg: Optional DataFrame of negatives, used when `random_negatives` is False.
        balance: If True, sample as many negatives as there are positives
            (capped at the pool size); if False, return the whole pool.
        random_negatives: If True, draw negatives from `df_train`.
        max_pos_n: Optional cap on the number of positives (sampled with `seed`).
        seed: `random_state` passed to every `DataFrame.sample` call.

    Returns:
        Tuple `(pos_dset, neg_dset)` of DataFrames.

    Raises:
        ValueError: If neither `random_negatives` nor `df_neg` is given.
    """
    # Fall back to the generic label column so `category=None` also works for negatives.
    label_col = category if category is not None else 'label'

    pos_dset = df.loc[df[label_col] == 1]
    if max_pos_n is not None and max_pos_n < len(pos_dset):
        pos_dset = pos_dset.sample(max_pos_n, random_state=seed)

    if random_negatives:
        neg_pool = df_train.loc[df_train[label_col] == 0]
    elif df_neg is not None:
        neg_pool = df_neg
    else:
        # Raise instead of sys.exit so the failure is a normal, catchable error in a notebook.
        raise ValueError('ERROR: either allow random negatives or provide `df_neg`')

    if balance:
        # Cap at the pool size: sampling more rows than exist would raise.
        n_neg = min(len(pos_dset), len(neg_pool))
        neg_dset = neg_pool.sample(n_neg, random_state=seed)
    else:
        neg_dset = neg_pool
    return pos_dset, neg_dset

In [None]:
def evaluate_prompt(pos_dset, neg_dset, prompt, response_res = yes_res, default_label = 0, extract_label = 1, log = False):
    """Ask the LLM the same question about every clause and score its answers.

    For each row of `pos_dset` (true label 1) and `neg_dset` (true label 0) a
    request is built from the module-level `system_prompt`, the row's 'text',
    the question `prompt`, and the output of `make_answer_instruction()`. The
    reply from `query_llm` is classified with the `response_res` patterns.

    Args:
        pos_dset: DataFrame of positive examples; must have a 'text' column.
        neg_dset: DataFrame of negative examples; must have a 'text' column.
        prompt: The question appended after the clause (left unchanged between rows).
        response_res: Regular expressions; if any of them is found in the reply,
            the prediction becomes `extract_label`.
        default_label: Prediction used when no pattern matches.
        extract_label: Prediction used when a pattern matches.
        log: If True, print progress, prompts, responses and the final counts.

    Returns:
        Dict with the sample sizes ('pos_n', 'neg_n'), the confusion counts
        ('TP', 'FP', 'TN', 'FN'), the metrics ('acc', 'prec', 'rec', 'f1'; each
        0.0 when its denominator is zero) and the misclassified examples
        ('fps', 'fns') as lists of dicts.
    """
    if log: print(f'{len(pos_dset)} / {len(neg_dset)} positive / negative samples')

    # Both pieces are identical for every clause, so they are computed once.
    answer_instruction = make_answer_instruction()
    # `system_prompt` ends with an opening quote before the clause; close it again.
    closing_quote = '"' if system_prompt.endswith('"') else ''

    tp, fp, tn, fn = 0, 0, 0, 0
    fps, fns = [], []
    for dset, label in ((pos_dset, 1), (neg_dset, 0)):
        for _, ex in dset.iterrows():
            # Value of the row's first column, used as the example identifier in logs/records.
            index = ex.iloc[0]
            if log: print(f'data point @ index {index}')
            # Build the request in a separate variable: assigning back to `prompt`
            # would make every later request contain all previous ones.
            full_prompt = (system_prompt + ex['text'] + closing_quote + '\n' + prompt + ' ' + answer_instruction)
            # Guard against a missing reply so logging and regex matching cannot fail.
            gen_text = query_llm(full_prompt) or ''
            if log: print("P: "+full_prompt)
            if log: print("R: "+gen_text)
            is_unfair = default_label
            if any(re.search(rex, gen_text) is not None for rex in response_res):
                is_unfair = extract_label
            if is_unfair:
                if log: print(f'=> Unfair: {gen_text[:50]}')
                if label == 1:
                    tp += 1
                else:
                    fp += 1
                    if log: print(f'false positive! @{index}')
                    fps.append({'clause': ex['text'],
                                'index': index,
                                'prompt': full_prompt,
                                'response': gen_text,
                                'true_label': 0,
                                'pred_label': 1,
                                'gradient': 'fp',
                               })
            else:
                if log: print(f'Fair: {gen_text[:50]}')
                if label == 1:
                    fn += 1
                    if log: print(f'false negative! @{index}')
                    fns.append({'clause': ex['text'],
                                'index': index,
                                'prompt': full_prompt,
                                'response': gen_text,
                                'true_label': 1,
                                'pred_label': 0,
                                'gradient': 'fn',
                               })
                else:
                    tn += 1
            if log: print('===')
    if log: print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
    total = tp + tn + fp + fn
    # Every ratio falls back to 0.0 when there is nothing to divide by.
    acc = (tp+tn) / total if total > 0 else 0.0
    prec = tp / (tp+fp) if tp+fp > 0 else 0.0
    rec = tp / (tp+fn) if tp+fn > 0 else 0.0
    f1 = 2* (prec * rec)/(prec + rec) if prec+rec > 0.0 else 0.0
    return {'pos_n': len(pos_dset),
            'neg_n': len(neg_dset),
            'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn,
            'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1,
            'fps': fps, 'fns': fns,
           }

In [None]:
def category_experiment(df, legal_standards, df_neg = None, categories = None, max_gen_len = 50, random_negatives = True, max_pos_n = None, log=True, seed = 42):
    """Evaluate the fairness question of each requested category.

    For every category a positive/negative sample is drawn with
    `sample_dataset` and the category's 'fairness_q' is scored with
    `evaluate_prompt`.

    Args:
        df: DataFrame passed to `sample_dataset` as the source of positives.
        legal_standards: Mapping category -> dict containing a 'fairness_q' string.
        df_neg: Optional negatives DataFrame forwarded to `sample_dataset`.
        categories: Categories to run; defaults to all keys of `legal_standards`.
        max_gen_len: Currently unused; kept so existing calls keep working.
        random_negatives: Forwarded to `sample_dataset`.
        max_pos_n: Forwarded to `sample_dataset`.
        log: If True, print a header per category and enable logging in `evaluate_prompt`.
        seed: Sampling seed forwarded to `sample_dataset` (default 42).

    Returns:
        Dict mapping each category to the result dict of `evaluate_prompt`.
    """
    if categories is None:
        # Snapshot the keys so the iteration does not depend on a live dict view.
        categories = list(legal_standards)
    results = {}
    for category in categories:
        if log: print(f'=== Testing for category `{category}` ===')
        fairness_q = legal_standards[category]['fairness_q']
        pos_sample, neg_sample = sample_dataset(df,
                                                category,
                                                df_neg = df_neg,
                                                random_negatives=random_negatives,
                                                max_pos_n=max_pos_n,
                                                seed=seed)
        results[category] = evaluate_prompt(pos_sample,
                                            neg_sample,
                                            fairness_q,
                                            log=log)
    return results

In [None]:
def results_report(r):
    """Print the sample sizes, precision/recall/F1 and confusion counts of each entry.

    Args:
        r: Mapping label -> result dict with the keys 'pos_n', 'neg_n', 'prec',
            'rec', 'f1', 'TP', 'TN', 'FP' and 'FN'.
    """
    for label, metrics in r.items():
        print(label)
        # One "- <name> <value>" line per scalar metric, in a fixed order.
        for key in ('pos_n', 'neg_n', 'prec', 'rec', 'f1'):
            print(f"- {key} {metrics[key]}")
        print(f"- TP {metrics['TP']} TN {metrics['TN']} FP {metrics['FP']} FN {metrics['FN']}")

In [None]:
# Run one experiment per unfairness category and collect the outcomes by category.
# WARNING: every clause triggers an API call, so this cell may take some time!
# An explicit loop is used so that results gathered so far remain in `results`
# if a later category fails.
results = {}
for uc in unfairness_categories:
    results[uc] = category_experiment(
        df_train_pos,
        legal_standards,
        categories=[uc],
        df_neg=df_train_neg,
        random_negatives=True,
    )

In [None]:
# Print a banner followed by the metric report for every category.
separator = '=' * 50
for category in unfairness_categories:
    print(f"\n{separator}")
    print(f"Category: {category}")
    print(separator)
    results_report(results[category])