In [1]:
__author__ = "Mars Gokturk Buchholz"
__version__ = "CS224u, Stanford, Winter 2023"

In [2]:
import openai
import pandas as pd
from open_ai_client import LiarDataset

[34m[1mwandb[0m: Currently logged in as: [33mmars-gokturk[0m ([33mmars-works[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
openai.api_key = "###"

In [4]:
test_ds = LiarDataset("../input_data/test.tsv", "liar_test")
test_df = test_ds.get()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1267 entries, 0 to 1266
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1267 non-null   object
 1   label              1267 non-null   object
 2   statement          1267 non-null   object
 3   subject            1267 non-null   object
 4   speaker            1267 non-null   object
 5   speaker_job_title  942 non-null    object
 6   state_info         1005 non-null   object
 7   party_affiliation  1267 non-null   object
 8   barely_true_c      1267 non-null   int64 
 9   half_true_c        1267 non-null   int64 
 10  false_c            1267 non-null   int64 
 11  mostly_true_c      1267 non-null   int64 
 12  pantsonfire_c      1267 non-null   int64 
 13  context            1250 non-null   object
dtypes: int64(5), object(9)
memory usage: 138.7+ KB


For zero-shot prompt, we don't need to process the dataset. We will only use the statement column.

In [5]:
def get_completion(statement:str):    
    p =f"""
        Act as a research journalist doing fact-checking on the statements given by politicians. You will label each 
        statement as one of "true, mostly-true, half-true, barely-true, false or pants-fire". 
        Use below metric to label each statement:

        If the statement is accurate and there’s nothing significant missing, then label as "true"
        If the statement is accurate but needs clarification or additional information, then label as "mostly-true"
        If the statement is partially accurate but leaves out important details or takes things out of context, then label as "half-true"
        If the statement contains an element of truth but ignores critical facts that would give a different impression, then label as "barely-true"
        If the statement is not accurate, then label as "false"
        If the statement is not accurate and makes a ridiculous claim, then label as "pants-fire"

        Use only factual data and reliable sources such as major news outlets and fact-checking organizations in the USA.
        Provide evidence for the labels and the name, date and author of the evidence for your answer.  If you don't have any evidence, say "I don't have any evidence". 
        You answer should be in this format: "Label: <statement label>, Evidence: <evidence>"
        ---
        {statement}
        """    
    
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      temperature=0.0,    
      messages=[{"role": "user", "content":p }])  
 
    
    response_state = response["choices"][0]["finish_reason"]
    label = ""
    evidence = ""
    if response_state != "stop":
        return "invalid_response"
    else:
        response_text = response["choices"][0]["message"]["content"]
        
        if "label" in response_text.lower():
            label = response_text.lower().split("label:")[1].split("evidence")[0].lower().strip()
        if "evidence" in response_text.lower():
            evidence = response_text.lower().split("evidence:")
            if len(evidence) > 1:
                evidence = evidence[1].strip()

        if "pants-fire" in label:
            label = "pants-fire"
        elif "false" in label:
            label= "false"
        elif "mostly-true" in label:
            label= "mostly-true"
        elif "barely-true" in label:
            label= "barely-true"
        elif "half-true" in label:
            label= "half-true"
        elif "true" in label:
            label= "true" 
        else:
            label= "undefined"
            
        return label, evidence
    

In [6]:
# test
l, e = get_completion("Medicaid spending declined by 1.9 percent in 2012, the second such decline in 47 years.")
l, e

Label = mostly-true, evidence = according to a report by the kaiser family foundation, medicaid spending did decline by 1.9 percent in 2012, which was the second decline in 47 years. however, the report also notes that the decline was largely due to a decrease in enrollment and utilization, rather than a decrease in spending per enrollee. (source: kaiser family foundation, "medicaid spending and enrollment growth: fy 2012 & 2013," september 2014)


('mostly-true',
 'according to a report by the kaiser family foundation, medicaid spending did decline by 1.9 percent in 2012, which was the second decline in 47 years. however, the report also notes that the decline was largely due to a decrease in enrollment and utilization, rather than a decrease in spending per enrollee. (source: kaiser family foundation, "medicaid spending and enrollment growth: fy 2012 & 2013," september 2014)')

In [8]:
test_df = test_df.reset_index()
test_df.index

RangeIndex(start=0, stop=1267, step=1)

## Zero-shot with prompt
If an exception arises from the API, the data collected up to that point will be written to a file. You can then resume from where it left off by adjusting the index.

In [10]:
import time
from tqdm import tqdm

idx = []
labels = []
evidences = []
for i in tqdm(range(77, test_df.shape[0])):    
    st = test_df.iloc[i]["statement"]
    try:
        label, evidence = get_completion(st)
        idx.append(i)
        labels.append(label)
        evidences.append(evidence)
        time.sleep(5)
    except Exception as e:
        print(e)
        a = pd.DataFrame({"idx": idx, "label": labels, "evidence": evidences})
        a.to_csv(f"result_between_77_until_{i}.csv")
        break

100%|█████████████████████████████████████| 1190/1190 [3:07:37<00:00,  9.46s/it]


In [11]:
b = pd.DataFrame({"idx": idx, "label": labels, "evidence": evidences})
b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idx       1190 non-null   int64 
 1   label     1190 non-null   object
 2   evidence  1190 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.0+ KB


In [12]:
a = pd.read_csv("../output_data/result_until_77.csv", index_col=0)
a.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 0 to 76
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idx       77 non-null     int64 
 1   label     77 non-null     object
 2   evidence  77 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.4+ KB


In [13]:
all_tr = pd.concat([a,b])
all_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1267 entries, 0 to 1189
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idx       1267 non-null   int64 
 1   label     1267 non-null   object
 2   evidence  1267 non-null   object
dtypes: int64(1), object(2)
memory usage: 39.6+ KB


In [14]:
all_tr.head(1)

Unnamed: 0,idx,label,evidence
0,0,True,"""according to a report by the government accou..."


In [16]:
all_tr = all_tr.rename(columns={"label": "predicted_label"})

In [17]:
all_tr.to_csv("../output_data/all_test_results.csv")