In [1]:
import os
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import time


In [2]:
# Identifiers of the fine-tuned ada / babbage / curie models.
# generate_probability_csv interpolates the chosen identifier into the log-prob JSON
# filenames it reads, and treats any name containing 'ft-user' as a fine-tuned model
# when building the output CSV name.
ADA_FINETUNED = 'ada:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-03-10-25'
BABBAGE_FINETUNED = 'babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-06-02'
CURIE_FINETUNED = 'curie:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-35-14'

In [16]:
def avg(l):
    """Return the arithmetic mean of the values in ``l``.

    ``l`` may be any sized collection of numbers (list, tuple, ...).
    An empty collection raises ``ZeroDivisionError``.
    """
    total = sum(l)
    count = len(l)
    return total / count

def sentence_prob(xiyi, normalize=False, until_first_period=False):
    """Turn the token log-probabilities of one scored text into a probability.

    Args:
        xiyi: mapping with a ``'token_logprobs'`` list and (when
            ``until_first_period`` is set) a parallel ``'tokens'`` list.
        normalize: if True, exponentiate the *mean* token log-prob
            (a per-token, length-normalized score) instead of the sum.
        until_first_period: if True, only score tokens up to and including
            the first token that is exactly ``"."``.

    Returns:
        ``math.exp`` of the summed (or averaged) log-probs of the selected tokens.

    Raises:
        ValueError: if ``until_first_period`` is set and no ``"."`` token exists.
    """
    if until_first_period:
        # +1 so the period token itself is part of the scored span.
        stop = xiyi['tokens'].index(".") + 1
    else:
        stop = len(xiyi['token_logprobs'])

    # The first entry is always skipped (presumably because the first token has
    # no conditional log-prob — confirm against the data files).
    selected = xiyi['token_logprobs'][1:stop]

    if normalize:
        return math.exp(avg(selected))
    return math.exp(sum(selected))

def generate_probability_csv(model_name, split='test'):
    """Build the per-question probability table for one model and write it to CSV.

    Reads two JSON files from the working directory:

    * ``{split}_logprobs_{model_name}.json`` — keys of the form ``"{qid}_0"`` /
      ``"{qid}_1"``, each mapping to a pair of scored texts
      (``(x1y1, x1y2)`` and ``(x2y1, x2y2)`` respectively).
    * ``{split}_logprobs_endings_{model_name}.json`` — ``qid`` mapping to the
      pair of scored endings ``(y1, y2)``.

    Every probability column is the length-normalized score returned by
    ``sentence_prob(..., normalize=True)``. The four conditional columns are
    obtained by normalizing each joint score against its alternative.

    Args:
        model_name: model identifier used in the input filenames. If it contains
            ``'ft-user'`` the output filename uses ``'finedtuned_<base model>'``.
        split: dataset split prefix of the input files; also embedded in the
            output filename.

    Returns:
        The ``pandas.DataFrame`` that was written to
        ``gpt3_probabilities_{model_name}_{split}_morecolumns.csv``.

    Raises:
        KeyError: if a question has no entry in the endings file.
        ValueError: if a joined token string contains no ``'. '`` separator, or a
            context has no ``"."`` token.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    fname = f"{split}_logprobs_{model_name}.json"
    with open(fname, encoding="utf-8") as f:
        logprobs = json.load(f)
    fname_ending = f"{split}_logprobs_endings_{model_name}.json"
    with open(fname_ending, encoding="utf-8") as f:
        logprobs_ending = json.load(f)

    # De-duplicate question ids while keeping file order. A set of strings would
    # make the CSV row order vary between kernel runs (hash randomization).
    qids = list(dict.fromkeys(k.split('_')[0] for k in logprobs))
    columns = ["x_1", "x_2", "y_1", "y_2",
               "P(x_1, y_1)", "P(x_1, y_2)", "P(x_2, y_1)", "P(x_2, y_2)",
               "P(x_1)", "P(x_2)", "P(y_1)", "P(y_2)"]
    csv_lines = []

    for qid in qids:
        try:
            x1y1, x1y2 = logprobs[f"{qid}_0"]
            x2y1, x2y2 = logprobs[f"{qid}_1"]
        except KeyError:
            # Best effort: questions missing either context variant are skipped.
            continue
        y1, y2 = logprobs_ending[qid]

        # Recover the context / ending text by splitting at the first '. '.
        x1_text, y1_text = ''.join(x1y1['tokens']).split('. ', maxsplit=1)
        x2_text, y2_text = ''.join(x2y2['tokens']).split('. ', maxsplit=1)

        x1y1_prob = sentence_prob(x1y1, normalize=True)
        x1y2_prob = sentence_prob(x1y2, normalize=True)
        x2y1_prob = sentence_prob(x2y1, normalize=True)
        x2y2_prob = sentence_prob(x2y2, normalize=True)
        # Context-only scores: stop at the first period of the matching pair.
        x1_prob = sentence_prob(x1y1, normalize=True, until_first_period=True)
        x2_prob = sentence_prob(x2y2, normalize=True, until_first_period=True)
        y1_prob = sentence_prob(y1, normalize=True)
        y2_prob = sentence_prob(y2, normalize=True)
        csv_lines.append([x1_text, x2_text, y1_text, y2_text,
                          x1y1_prob, x1y2_prob, x2y1_prob, x2y2_prob,
                          x1_prob, x2_prob, y1_prob, y2_prob])

    df = pd.DataFrame(csv_lines, columns=columns)
    # Conditionals: each joint score normalized over the two competing candidates.
    df["P(y_1|x_1)"] = df["P(x_1, y_1)"] / (df["P(x_1, y_1)"] + df["P(x_1, y_2)"])
    df["P(y_2|x_2)"] = df["P(x_2, y_2)"] / (df["P(x_2, y_2)"] + df["P(x_2, y_1)"])
    df["P(x_1|y_1)"] = df["P(x_1, y_1)"] / (df["P(x_1, y_1)"] + df["P(x_2, y_1)"])
    df["P(x_2|y_2)"] = df["P(x_2, y_2)"] / (df["P(x_2, y_2)"] + df["P(x_1, y_2)"])
    if 'ft-user' in model_name:
        # Spelling 'finedtuned_' is kept as-is so existing output filenames stay stable.
        model_name = 'finedtuned_' + model_name.split(':')[0]
    # Use the requested split in the filename; it was previously hard-coded to 'test',
    # so any other split would have overwritten the test CSV.
    df.to_csv(f"gpt3_probabilities_{model_name}_{split}_morecolumns.csv", index=False)
    return df

In [17]:
# Build (and write to CSV) the probability table for the fine-tuned curie model, test split.
df = generate_probability_csv(model_name=CURIE_FINETUNED, split='test')

In [6]:
# Forward accuracy: fraction of rows whose conditional for the matching ending
# exceeds 0.5, averaged over both directions of the pair.
print('forward accuracy')
forward_rates = []
for column in ("P(y_1|x_1)", "P(y_2|x_2)"):
    forward_rates.append(len(df[df[column] > 0.5]) / len(df))
print(avg(forward_rates))

forward accuracy
0.6448516579406631


In [7]:
# Backward accuracy: fraction of rows whose conditional for the matching context
# exceeds 0.5, averaged over both directions of the pair.
print('backward accuracy')
backward_rates = []
for column in ("P(x_1|y_1)", "P(x_2|y_2)"):
    backward_rates.append(len(df[df[column] > 0.5]) / len(df))
print(avg(backward_rates))

backward accuracy
0.5846422338568935


In [33]:
# Number of question rows in the table.
# NOTE(review): execution counts in this notebook are out of order (this cell is In [33],
# the accuracy cells are In [6] / In [7]); re-run top to bottom so every displayed
# output is known to come from the same `df`.
df.shape[0]

729

In [15]:
# NOTE(review): `x1y1` is never assigned at notebook scope in the visible cells (it is only
# a local inside generate_probability_csv), so this cell presumably relies on leftover
# kernel state and would raise NameError after Restart & Run All — confirm, then either
# define `x1y1` explicitly or delete this scratch cell.
# Un-normalized probability: exp of the summed token log-probs, skipping the first entry.
math.exp(sum(x1y1['token_logprobs'][1:]))

1.2811109223052686e-11