In [1]:
%load_ext autoreload
%autoreload 2

import openai

import re
import glob
import os
import dotenv
dotenv.load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

from textflint import Engine
from textflint.adapter import Config, auto_config
from textflint.input.dataset import Dataset

from models.custom_models import OpenAIFlint, OpenAI_NLI_Flint

In [2]:
def find_all_data(dir='./transformed_data', skip_trans=None):
    dir_list = ' '.join(glob.glob(f'{dir}/*.json'))
    data_paths = {}
    
    for match in re.findall(r'ori_(\w+-?\w*)_(\d+).json', dir_list):
        name = match[0]

        if skip_trans and name in skip_trans:
            continue

        num  = match[1]
        ori_path = re.search(f'ori_{name}_{num}.json', dir_list).group(0)
        trans_path = re.search(f'trans_{name}_{num}.json', dir_list).group(0)
        data_paths[name] = f'{dir}/{ori_path}', f'{dir}/{trans_path}'
    
    return data_paths

def compare_evaluate(model, ori_data_path, trans_data_path, task, trans_name):
    ori_data = Dataset(task=task)
    trans_data = Dataset(task=task)

    ori_data.load_json(ori_data_path)
    trans_data.load_json(trans_data_path)

    print("EVAL ORI")
    ori_eval = model.evaluate(ori_data.dump(), prefix=f'{trans_name}_ori_')
    print("EVAL TRANS")
    trans_eval = model.evaluate(trans_data.dump(), prefix=f'{trans_name}_trans_')

    return ori_eval, trans_eval

In [10]:
SA_model_results = []

In [11]:
model = OpenAIFlint('gpt-3.5-turbo')

data_paths = find_all_data('./transformed_data/SA', skip_trans=None)
for trans_name in data_paths.keys():
    print(trans_name)
    results = compare_evaluate(model, *data_paths[trans_name], task='SA', trans_name=trans_name)
    SA_model_results.append(results)

[34;1mTextFlint[0m: ******Start load!******


AddSum-person


100%|██████████| 253/253 [00:00<00:00, 299.46it/s]
[34;1mTextFlint[0m: 253 in total, 253 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 253/253 [00:00<00:00, 299.23it/s]
[34;1mTextFlint[0m: 253 in total, 253 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


Invalid model response: "mixed". The prompt was: {'x': 'This DVD set is the complete widescreen 15-episode run of "Surface", a television show made by Universal in 2006. The full running time is 10 hours and 34 minutes plus a few bonus features (deleted scenes, cast interviews, special effects featurette). This was a relatively high budget show and much of the budget makes it to the screen in the form of quality production design and special effects. <br /><br />Unfortunately 10+ hours is a lot of time and as typically happens with this type of stuff, the overall quality begins to fall off in the later episodes. I found the first 7 episodes (Discs 1 and 2) extremely engaging and the remainder a disappointment. "Surface" was produced, written and directed by Josh and Jonas Pate; and it appears that they were surprised by the success of the series and unable to cobble together enough good subsequent material as they rushed to fill the order for additional episodes. It even looks like add

[34;1mTextFlint[0m: ******Start load!******


DoubleDenial


100%|██████████| 459/459 [00:00<00:00, 559.14it/s]
[34;1mTextFlint[0m: 459 in total, 459 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 459/459 [00:00<00:00, 758.79it/s]
[34;1mTextFlint[0m: 459 in total, 459 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


Invalid model response: "mixed". The prompt was: {'x': 'My first \'Columbo\'. Rather enjoyed it. Great format, and Peter Falk\'s character extremely good...wonderfully quirky, he can take his place next to Poirot, Miss Marple, and also the likes of Marlowe and Rick Diamond. I can see why this series has such a following. <br /><br />As a professional musician, I HAVE to say a few things. First of all, a conductor who merely produces these pedestrian performances of the most basic examples of the repertoire (Eine Kleine Nachtmusik, Strauss Waltzes, Beethoven...) is never going to have a house like that or fame like that or cars like that, much less be called a genius. And the conducting that the actor does is so bad as to be laughable. No orchestra would take him seriously. <br /><br />There are several little things too, such as his rehearsal of Eine Kleine Nachtmusik (why rehearse it when they\'ve just performed it for TV? Any orchestral musician would be able to play it in his or her

[34;1mTextFlint[0m: ******Start load!******


Ocr


100%|██████████| 100/100 [00:00<00:00, 822.06it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 100/100 [00:00<00:00, 825.68it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


Invalid model response: "mixed". The prompt was: {'x': 'This DVD set is the complete widescreen 15-episode run of "Surface", a television show made by Universal in 2006. The full running time is 10 hours and 34 minutes plus a few bonus features (deleted scenes, cast interviews, special effects featurette). This was a relatively high budget show and much of the budget makes it to the screen in the form of quality production design and special effects. <br /><br />Unfortunately 10+ hours is a lot of time and as typically happens with this type of stuff, the overall quality begins to fall off in the later episodes. I found the first 7 episodes (Discs 1 and 2) extremely engaging and the remainder a disappointment. "Surface" was produced, written and directed by Josh and Jonas Pate; and it appears that they were surprised by the success of the series and unable to cobble together enough good subsequent material as they rushed to fill the order for additional episodes. It even looks like add

[34;1mTextFlint[0m: ******Start load!******


AppendIrr


100%|██████████| 100/100 [00:00<00:00, 1001.45it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 100/100 [00:00<00:00, 981.67it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


Invalid model response: "neutral". The prompt was: {'x': "Director: Tay Garnett, Ford Beebe, Cast: Mike Mazurki, Vic Christy, Fritz Ford, Tay Garnett.<br /><br />Based on the number of comments I see on IMDb, this seems to be a forgotten movie. This seems rather ironic to me because it is actually one of the first movies that I remember. My mom took me and my little brother to see this film at The Garland theater in Spokane when it first came out in the mid 1970's and I still remember it.<br /><br />I am going by memory here but I believe this move is about a trapper who was accused of a crime which he did not commit and the law goes after him. I believe it to be set in 1800's Alaska. A narrator tells the story of the trapper played by Mike Mazurki. Really, this is a very good film with a great setting. It could be compared to the 1981 film Death Hunt with Charles Bronson. The two films have a very similar story line. The main difference between the two is Death Hunt is an adult orient

[34;1mTextFlint[0m: ******Start load!******


AddSum-movie


100%|██████████| 237/237 [00:00<00:00, 736.96it/s]
[34;1mTextFlint[0m: 237 in total, 237 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 237/237 [00:00<00:00, 632.66it/s]
[34;1mTextFlint[0m: 237 in total, 237 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


Invalid model response: "mixed". The prompt was: {'x': 'This DVD set is the complete widescreen 15-episode run of "Surface", a television show made by Universal in 2006. The full running time is 10 hours and 34 minutes plus a few bonus features (deleted scenes, cast interviews, special effects featurette). This was a relatively high budget show and much of the budget makes it to the screen in the form of quality production design and special effects. <br /><br />Unfortunately 10+ hours is a lot of time and as typically happens with this type of stuff, the overall quality begins to fall off in the later episodes. I found the first 7 episodes (Discs 1 and 2) extremely engaging and the remainder a disappointment. "Surface" was produced, written and directed by Josh and Jonas Pate; and it appears that they were surprised by the success of the series and unable to cobble together enough good subsequent material as they rushed to fill the order for additional episodes. It even looks like add

[34;1mTextFlint[0m: ******Start load!******


WordCase_upper


100%|██████████| 100/100 [00:00<00:00, 920.17it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 100/100 [00:00<00:00, 1228.32it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


Invalid model response: "mixed". The prompt was: {'x': 'This DVD set is the complete widescreen 15-episode run of "Surface", a television show made by Universal in 2006. The full running time is 10 hours and 34 minutes plus a few bonus features (deleted scenes, cast interviews, special effects featurette). This was a relatively high budget show and much of the budget makes it to the screen in the form of quality production design and special effects. <br /><br />Unfortunately 10+ hours is a lot of time and as typically happens with this type of stuff, the overall quality begins to fall off in the later episodes. I found the first 7 episodes (Discs 1 and 2) extremely engaging and the remainder a disappointment. "Surface" was produced, written and directed by Josh and Jonas Pate; and it appears that they were surprised by the success of the series and unable to cobble together enough good subsequent material as they rushed to fill the order for additional episodes. It even looks like add

[34;1mTextFlint[0m: ******Start load!******


TwitterType_random


100%|██████████| 100/100 [00:00<00:00, 1021.09it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 100/100 [00:00<00:00, 952.98it/s]
[34;1mTextFlint[0m: 100 in total, 100 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


Invalid model response: "neutral". The prompt was: {'x': 'My first \'Columbo\'. Rather enjoyed it. Great format, and Peter Falk\'s character extremely good...wonderfully quirky, he can take his place next to Poirot, Miss Marple, and also the likes of Marlowe and Rick Diamond. I can see why this series has such a following. <br /><br />As a professional musician, I HAVE to say a few things. First of all, a conductor who merely produces these pedestrian performances of the most basic examples of the repertoire (Eine Kleine Nachtmusik, Strauss Waltzes, Beethoven...) is never going to have a house like that or fame like that or cars like that, much less be called a genius. And the conducting that the actor does is so bad as to be laughable. No orchestra would take him seriously. <br /><br />There are several little things too, such as his rehearsal of Eine Kleine Nachtmusik (why rehearse it when they\'ve just performed it for TV? Any orchestral musician would be able to play it in his or h

In [13]:
SA_model_results

[({'AddSum-person_ori__Accuracy': 0.9011857707509882},
  {'AddSum-person_trans__Accuracy': 0.8656126482213439}),
 ({'DoubleDenial_ori__Accuracy': 0.9106753812636166},
  {'DoubleDenial_trans__Accuracy': 0.8867102396514162}),
 ({'Ocr_ori__Accuracy': 0.93}, {'Ocr_trans__Accuracy': 0.92}),
 ({'AppendIrr_ori__Accuracy': 0.9}, {'AppendIrr_trans__Accuracy': 0.86}),
 ({'AddSum-movie_ori__Accuracy': 0.9156118143459916},
  {'AddSum-movie_trans__Accuracy': 0.8945147679324894}),
 ({'WordCase_upper_ori__Accuracy': 0.9},
  {'WordCase_upper_trans__Accuracy': 0.87}),
 ({'TwitterType_random_ori__Accuracy': 0.89},
  {'TwitterType_random_trans__Accuracy': 0.93})]

In [7]:
NLI_model_results = []

In [8]:
model = OpenAI_NLI_Flint('gpt-3.5-turbo', batch_size=100)

data_paths = find_all_data('./transformed_data/NLI', skip_trans=None)
for trans_name in data_paths.keys():
    results = compare_evaluate(model, *data_paths[trans_name], task='NLI', trans_name=trans_name)
    NLI_model_results.append(results)

[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 99/99 [00:00<00:00, 79562.39it/s]
[34;1mTextFlint[0m: 99 in total, 99 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 99/99 [00:00<00:00, 111055.39it/s]
[34;1mTextFlint[0m: 99 in total, 99 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 99/99 [00:00<00:00, 108785.98it/s]
[34;1mTextFlint[0m: 99 in total, 99 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 99/99 [00:00<00:00, 114013.21it/s]
[34;1mTextFlint[0m: 99 in total, 99 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 99/99 [00:00<00:00, 49035.91it/s]
[34;1mTextFlint[0m: 99 in total, 99 were loaded successful.
[34;1mTextFli

Invalid model response: "neutral the hypothesis is unrelated to the premise". The prompt was: {'premise': 'How are you doing? A blond-haired doctor and her African american assistant looking threw new medical manuals. A man becomes learned by asking questions.', 'hypothesis': 'Well, let ’s get started. A doctor is looking at a book'}
Invalid model response: "neutral the hypothesis is unrelated to the premise". The prompt was: {'premise': 'How are things? A child wearing a red top is standing behind a blond headed child sitting in a wheelbarrow. Many heads are better than one.', 'hypothesis': 'Before I forget, I wanted to talk to you something. A child wearing a red top is standing behind a blond headed child Doing is better than saying.'}


[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 99/99 [00:00<00:00, 102149.10it/s]
[34;1mTextFlint[0m: 99 in total, 99 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 99/99 [00:00<00:00, 110552.74it/s]
[34;1mTextFlint[0m: 99 in total, 99 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


In [9]:
NLI_model_results

[({'TwitterType_random_ori__Accuracy': 0.7171717171717171},
  {'TwitterType_random_trans__Accuracy': 0.6868686868686869}),
 ({'WordCase_upper_ori__Accuracy': 0.7272727272727273},
  {'WordCase_upper_trans__Accuracy': 0.696969696969697}),
 ({'AppendIrr_ori__Accuracy': 0.7272727272727273},
  {'AppendIrr_trans__Accuracy': 0.5656565656565656}),
 ({'Ocr_ori__Accuracy': 0.7171717171717171},
  {'Ocr_trans__Accuracy': 0.6565656565656566})]

In [55]:
from textflint.input.model.metrics.metrics import accuracy_score as Accuracy

def get_output(content):
    content = content.lower()[:-1] # remove last character (punctutation)
    if content in model.label2id.keys(): return model.label2id[content]
    else:
        print('invalid response')
        return -1

results = {}
for k in model.responses.keys():
    dset_type = 'ori' if 'ori' in k else 'trans'
    trans_type = re.search('[A-Za-z]+', k).group(0)
    contents = [response.choices[0].message.content for response in model.responses[k]]
    outputs = [get_output(c) for c in contents]
    
    filename = re.search(f'{dset_type}_{trans_type}_\d+.json', ' '.join(glob.glob('./transformed_data/NLI/*'))).group(0)
    d = Dataset('NLI')
    d.load_json('./transformed_data/NLI/' + filename)
    labels = model.unzip_samples(d.dump())[1]
    results[k] = Accuracy(outputs, labels)


[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid respon

100%|██████████| 988/988 [00:00<00:00, 114259.74it/s]
[34;1mTextFlint[0m: 988 in total, 988 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid respon

100%|██████████| 988/988 [00:00<00:00, 146218.28it/s]
[34;1mTextFlint[0m: 988 in total, 988 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response


100%|██████████| 90/90 [00:00<00:00, 151358.20it/s]
[34;1mTextFlint[0m: 90 in total, 90 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response


100%|██████████| 90/90 [00:00<00:00, 116221.48it/s]
[34;1mTextFlint[0m: 90 in total, 90 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response


100%|██████████| 465/465 [00:00<00:00, 142872.42it/s]
[34;1mTextFlint[0m: 465 in total, 465 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response


100%|██████████| 465/465 [00:00<00:00, 126097.59it/s]
[34;1mTextFlint[0m: 465 in total, 465 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid respon

100%|██████████| 988/988 [00:00<00:00, 143947.91it/s]
[34;1mTextFlint[0m: 988 in total, 988 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******
[34;1mTextFlint[0m: ******Start load!******


invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response
invalid response


100%|██████████| 988/988 [00:00<00:00, 130498.26it/s]
[34;1mTextFlint[0m: 988 in total, 988 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


In [54]:
results

{'BackTrans_ori_': 0.6133603238866396,
 'BackTrans_trans_': 0.5536437246963563,
 'NumWord_ori_': 0.6,
 'NumWord_trans_': 0.25555555555555554,
 'SwapAnt_ori_': 0.9118279569892473,
 'SwapAnt_trans_': 0.6645161290322581,
 'AddSent_ori_': 0.6153846153846154,
 'AddSent_trans_': 0.3390688259109312}

In [13]:
content[0][:-1]

'Contradiction'

In [19]:
ori_data = Dataset(task='NLI')
ori_data.load_json('./output/ori_NumWord_90.json')

[34;1mTextFlint[0m: ******Start load!******
100%|██████████| 4470/4470 [00:05<00:00, 818.44it/s]
[34;1mTextFlint[0m: 4470 in total, 4470 were loaded successful.
[34;1mTextFlint[0m: ******Finish load!******


In [20]:
unzipped = model.unzip_samples(ori_data.dump()[:100])
batch_inputs, batch_labels = unzipped

len(ori_data.dump())

4470