In [14]:
import pandas as pd
import string
import os 
import config
import prompts
from dotenv import load_dotenv
load_dotenv()
import  nest_asyncio
nest_asyncio.apply()


# Load the pickled source text and the evaluation examples from ../data/eval,
# resolved relative to the current working directory.
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles that come from a trusted source.
source_pdf = pd.read_pickle(os.path.join(os.getcwd(), "..", "data", "eval", "src_example.pkl"))
data = pd.read_pickle(os.path.join(os.getcwd(), "..", "data", "eval", "example.pkl"))
# Select the entries of `source_pdf` addressed by data['indices'] and store them
# as the 'sentences' column. The selection's index is reset before assignment,
# so this presumably relies on `data` having a default 0..n-1 index - TODO confirm.
data['sentences'] = source_pdf[data['indices']].reset_index(drop = True)

def join_sentences(row):
    """Build the context string from the entries preceding ``row['indices']``.

    Up to ``config.CONTEXT_SENTENCES`` entries of ``source_pdf`` that come
    directly before the row's index are concatenated. A non-empty entry whose
    last character is not in ``string.punctuation`` is followed by a newline;
    every other entry (including an empty one) is followed by a single space.
    """
    position = row['indices']
    window = config.CONTEXT_SENTENCES
    # Clamp the window start at 0 when fewer than `window` entries precede it.
    first = position - window if position >= window else 0
    pieces = []
    for sentence in source_pdf[first:position]:
        lacks_final_punctuation = len(sentence) != 0 and sentence[-1] not in string.punctuation
        pieces.append(sentence + ('\n' if lacks_final_punctuation else ' '))
    return ''.join(pieces)

def create_prompt(row):
    """Fill in the prompt template that belongs to the row's error type.

    The template is looked up in ``prompts.ERRORS`` under ``row['errors']`` and
    formatted with ``context=row['contexts']`` and
    ``completion=row['sentences']``.
    """
    error_type = row['errors']
    fill_values = dict(context=row['contexts'], completion=row['sentences'])
    return prompts.ERRORS[error_type].format(**fill_values)

# Build the context string for every row first, then the prompt: create_prompt
# reads the 'contexts' column, so the order of these two lines matters.
data['contexts'] = data.apply(join_sentences, axis=1)
data['prompts'] = data.apply(create_prompt, axis = 1)

# Convert the frame into a {column name: list of values} dict. From here on
# `data` is a plain dict, no longer a DataFrame.
data = data.to_dict('list')

In [15]:
# from langchain.llms import HuggingFaceHub

# hf_model_name = 'HuggingFaceH4/zephyr-7b-alpha'
# # openai = OpenAI(model_name='gpt-3.5-turbo-instruct', max_tokens=-1)
# hf = HuggingFaceHub(repo_id=hf_model_name, model_kwargs= {'max_new_tokens': 250,
#                                                           })
# outputs = []
# for i, prompt in enumerate(data['prompts']):
#     output = hf.invoke(data['prompts'][i])
#     outputs.append(output)

In [16]:
# Parse the outputs into multiple completion choices
def parse_text(text):
    """Extract the contradiction sentences from a model output.

    A line qualifies as a header when, after stripping, it starts with a digit
    and contains "Change:". If the line right after a header contains
    "Contradiction:", the stripped text following the first occurrence of that
    marker is collected, and that line is consumed (it is not itself checked
    as a header).

    Returns:
        The collected strings in order of appearance.
    """
    marker = "Contradiction:"
    rows = text.split('\n')
    total = len(rows)
    extracted = []
    pos = 0

    while pos < total:
        current = rows[pos].strip()
        pos += 1  # `pos` now points at the line after `current`

        # Slicing keeps the digit check safe on an empty line.
        if not (current[:1].isdigit() and "Change:" in current):
            continue

        if pos < total and marker in rows[pos]:
            extracted.append(rows[pos].split(marker, 1)[1].strip())
            pos += 1  # the contradiction line has been consumed

    return extracted


In [17]:
# data['choices'] = [parse_text(text) for text in outputs]
import json
# Read the completion choices from data.json (used here in place of parsing
# fresh model outputs). The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding, while JSON text is
# UTF-8.
with open('data.json', 'r', encoding='utf-8') as json_file:
    data['choices'] = json.load(json_file)


In [None]:
from huggingface_hub import AsyncInferenceClient
from transformers import AutoTokenizer
import asyncio
import numpy as np

# Hugging Face API token read from the environment (None when the variable is unset).
API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# Default model for get_nml_score and for the tokenizer loaded in process_data.
MODEL = 'HuggingFaceH4/zephyr-7b-alpha'

# Keep only the choices whose character length is at least MIN_PROP times the
# length of the original sentence, and strip surrounding whitespace from the
# ones kept. The length check runs on the unstripped choice.
MIN_PROP = 0.7
data['choices'] = [[choice.strip() for choice in choices if len(choice) >= MIN_PROP * len(s)] for s, \
                   choices in zip(data['sentences'], data['choices'])]

# Check if two sentences contradict
async def contradictory_score(i, sentence, choices, model='microsoft/deberta-v2-xlarge-mnli'):
    """Score how strongly each choice contradicts ``sentence``.

    Every (sentence, choice) pair is sent to the text-classification endpoint
    of ``model`` and the score of the 'CONTRADICTION' label is collected.

    Args:
        i: Identifier of the example; returned unchanged so that results
            gathered concurrently can be matched back to their inputs.
        sentence: The original sentence.
        choices: Candidate sentences to compare against ``sentence``.
        model: Hub id of the classification model to query.

    Returns:
        ``(i, scores)`` where ``scores`` holds exactly one float per entry of
        ``choices``, in the same order. A choice whose response carries no
        'CONTRADICTION' label gets 0.0, so the list always stays aligned with
        ``choices``.
    """
    if len(choices) == 0:
        return i, []
    # Named `pair_texts` so the imported `prompts` module is not shadowed.
    pair_texts = ['[CLS]' + sentence + '[SEP]' + choice + '[SEP]' for choice in choices]
    client = AsyncInferenceClient(model=model, token= API_TOKEN)
    # Requests are awaited one after another.
    results = [await client.text_classification(text) for text in pair_texts]

    contradiction_scores = []
    for result in results:
        # Always emit one score per choice; a missing label must not shift
        # the scores of the following choices.
        score = next(
            (item['score'] for item in result if item['label'] == 'CONTRADICTION'),
            0.0,
        )
        contradiction_scores.append(score)

    return i, contradiction_scores

async def get_nml_score(template, context, context_len, choice, model=MODEL):
    """Return the mean prompt log-probability of ``choice`` given ``context``.

    The prompt ``template.format(c=context, s=choice)`` is sent to the
    text-generation endpoint with ``decoder_input_details=True`` so that the
    response carries a log-probability for the prompt tokens. The first
    ``context_len`` prompt tokens are skipped and the remaining
    log-probabilities are averaged.

    Args:
        template: Format string with ``{c}`` and ``{s}`` placeholders.
        context: Text that precedes the completion.
        context_len: Number of leading prompt tokens to leave out of the mean.
        choice: Completion text to score.
        model: Hub id of the generation model to query.

    Returns:
        The mean log-probability as a float.

    Raises:
        ValueError: If no token with a log-probability remains after skipping
            ``context_len`` tokens. Raising here gives a clear error at the
            source instead of handing a non-numeric value to the caller's
            arithmetic.
    """
    prompt = template.format(c=context, s=choice)
    client = AsyncInferenceClient(model=model, token=API_TOKEN)
    # Only the prompt's own log-probabilities are used, so one new token suffices.
    response = await client.text_generation(prompt, max_new_tokens=1,
                                            details=True, decoder_input_details=True)
    # Tokens reported without a log-probability would make sum() fail; skip them.
    logprobs = [token.logprob for token in response.details.prefill[context_len:]
                if token.logprob is not None]
    if not logprobs:
        raise ValueError(
            'No prompt tokens left to score after skipping '
            f'{context_len} context tokens; context={context!r}, choice={choice!r}'
        )
    return sum(logprobs) / len(logprobs)

async def fluency_score_difference(i, sentence, context, context_len, choices):
    """Compare the fluency score of every choice with that of ``sentence``.

    Both the original sentence and each choice are scored with
    ``get_nml_score`` using the template ``'{c} {s}'``.

    Returns:
        ``(i, differences)`` where ``differences[k]`` is the score of
        ``choices[k]`` minus the score of ``sentence``; ``(i, [])`` when there
        are no choices.
    """
    if len(choices) == 0:
        return i, []
    print(i)
    template = '{c} {s}'
    # Score of the original completion, used as the baseline.
    baseline = await get_nml_score(template, context, context_len, sentence)
    # Scores of the alternatives, requested one at a time.
    candidate_scores = []
    for candidate in choices:
        candidate_scores.append(await get_nml_score(template, context, context_len, candidate))
    deltas = np.array(candidate_scores) - baseline
    return i, deltas.tolist()

# Window of examples handed to process_data per call: positions start (inclusive)
# to end (exclusive). main() advances both values.
start = 0
limit = 10
end = start + limit

# Accumulators that process_data extends with (index, scores) tuples.
contradictory_scores = []
fluency_score_differences = []

async def process_data(start, end):
    """Score the examples of ``data`` at positions ``start`` (inclusive) to ``end`` (exclusive).

    Contradiction scores for the window are gathered first, then the fluency
    differences. The results, tuples of ``(index, scores)``, are appended to
    the module-level lists ``contradictory_scores`` and
    ``fluency_score_differences``.
    """
    sentences = data['sentences'][start:end]
    choice_lists = data['choices'][start:end]

    # Contradiction score of each choice against its original sentence.
    contradiction_jobs = [
        contradictory_score(start + offset, sentence, choices)
        for offset, (sentence, choices) in enumerate(zip(sentences, choice_lists))
    ]
    contradiction_results = await asyncio.gather(*contradiction_jobs)

    # Token count of every context, used to exclude context tokens from the
    # fluency score.
    contexts = data['contexts'][start:end]
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    context_lens = [len(token_ids) for token_ids in tokenizer(contexts)['input_ids']]

    # Fluency difference between each choice and the original completion.
    fluency_jobs = [
        fluency_score_difference(start + offset, sentence, context, context_len, choices)
        for offset, (sentence, context, context_len, choices)
        in enumerate(zip(sentences, contexts, context_lens, choice_lists))
    ]
    fluency_results = await asyncio.gather(*fluency_jobs)

    contradictory_scores.extend(contradiction_results)
    fluency_score_differences.extend(fluency_results)

async def main():
    """Score all examples window by window and store the results in ``data``.

    Starting from the module-level ``start``/``end`` window, ``process_data``
    is awaited for each window until every example (counted via
    ``data['errors']``) has been handled. The accumulated results are then
    sorted by example index and written to ``data['contradiction_scores']``
    and ``data['fluency_score_difference']``.
    """
    global start
    global end
    total = len(data['errors'])
    while start < total:
        await process_data(start, end)
        # Advance by `limit` so consecutive windows neither overlap nor leave
        # gaps when the batch size is changed.
        start += limit
        end = min(start + limit, total)

    data['contradiction_scores'] = [
        scores for _, scores in sorted(contradictory_scores, key=lambda pair: pair[0])
    ]
    data['fluency_score_difference'] = [
        scores for _, scores in sorted(fluency_score_differences, key=lambda pair: pair[0])
    ]

asyncio.run(main())



In [None]:
# Filter the usable completions.
#
# Contradiction score: a choice must reach PHI_A_C when the error type is
# 'COREFERENCE' and PHI_A_N for every other error type.
# Fluency difference: a choice must not fall below PHI_B (a negative number).
PHI_A_N = 0.6
PHI_A_C = 0.3
PHI_B = -0.25

def selected_choices(error, c_scores, f_score_diffs, choices):
    """Return the choices that pass both the fluency and the contradiction filter.

    Args:
        error: Error type of the example; 'COREFERENCE' selects the PHI_A_C
            contradiction threshold, anything else PHI_A_N.
        c_scores: Contradiction score per choice, same order as ``choices``.
        f_score_diffs: Fluency difference per choice, same order as ``choices``.
        choices: Candidate completions.

    Returns:
        The accepted choices in their original order.
    """
    required_contradiction = PHI_A_C if error == 'COREFERENCE' else PHI_A_N
    # The fluency test comes first: a choice rejected there never has its
    # contradiction score looked up.
    return [
        candidate
        for position, candidate in enumerate(choices)
        if not f_score_diffs[position] < PHI_B
        and c_scores[position] >= required_contradiction
    ]

# Apply the filters to the choices of every example.
filter_choices = [
    selected_choices(error, c_scores, f_diffs, choices)
    for error, c_scores, f_diffs, choices in zip(
        data['errors'],
        data['contradiction_scores'],
        data['fluency_score_difference'],
        data['choices'],
    )
]
# Positions of the examples that still have at least one choice left.
chosen_data_indices = [idx for idx, kept in enumerate(filter_choices) if len(kept) != 0]

# Restrict every column to the surviving examples; 'choices' holds the
# filtered lists rather than the original ones.
chosen_data = {
    column: [data[column][idx] for idx in chosen_data_indices]
    for column in ('errors', 'contexts', 'sentences')
}
chosen_data['choices'] = [filter_choices[idx] for idx in chosen_data_indices]

In [3]:
from huggingface_hub import InferenceClient
import os
from dotenv import load_dotenv
# Load environment variables from a local .env file, if there is one.
load_dotenv()
API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# Pass the token explicitly, as the async clients in this notebook do; without
# it the value read above is never used.
client = InferenceClient(model='mistralai/Mistral-7B-Instruct-v0.1', token=API_TOKEN)
# Quick check: request a short generation together with token-level details.
client.text_generation('Hello ', details=True)

TextGenerationResponse(generated_text="👋\n\nI'm a newbie to the world of programming and I'", details=Details(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=20, seed=None, prefill=[], tokens=[Token(id=243, text='', logprob=-1.1025391, special=False), Token(id=162, text='', logprob=-0.0001193285, special=False), Token(id=148, text='', logprob=-0.83496094, special=False), Token(id=142, text='', logprob=-0.0074272156, special=False), Token(id=13, text='', logprob=-0.9902344, special=False), Token(id=13, text='\n\n', logprob=-0.21704102, special=False), Token(id=28737, text='I', logprob=-1.1689453, special=False), Token(id=28742, text="'", logprob=-1.3085938, special=False), Token(id=28719, text='m', logprob=-0.21057129, special=False), Token(id=264, text=' a', logprob=-2.0605469, special=False), Token(id=633, text=' new', logprob=-2.5039062, special=False), Token(id=12868, text='bie', logprob=-1.3720703, special=False), Token(id=298, text=' to', logprob=-1.0986328, specia