In [62]:
import openai
import time
import json
import transformers
from transformers import GPT2Tokenizer
import backoff
from openai.error import RateLimitError
import datetime
import pickle
import numpy as np
from numpy.linalg import norm
import os

QA generation using semantic search of the full textbook using GPT-3 embeddings and completions

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [209]:
def open_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as infile:
        return infile.read()

In [210]:
openai.api_key=openai.api_key=os.getenv("OPENAI_API_KEY")

In [6]:
def gpt3_embedding(content, model='text-similarity-ada-001'):
    try:
        response = openai.Embedding.create(input=content, engine=model)
    except openai.error.APIConnectionError:
        print("Failed") 
    return response['data'][0]['embedding'] 

In [7]:
#compute cosine similarity
def get_similarity(v1, v2):
    cosine = np.dot(v1, v2)/(norm(v1)*norm(v2))
    return cosine

In [121]:
#searching through textbook 
def search_index(query, data, count=1):
    question_vector = gpt3_embedding(query)
    scores = []
    for i in data:
        score = get_similarity(question_vector, i['vector'])
        scores.append({'content' : i['content'], 'score' : score})
    most_relevant= sorted(scores, key=lambda d: d['score'], reverse=True)
    return most_relevant[0]

In [63]:
@backoff.on_exception(backoff.expo, RateLimitError)
def response_API(prompt, myKwargs = {}):

  #default arguments to send the API, unless changed in function
  kwargs = {"model" :"text-davinci-002",
            "temperature" :0.46,
            "max_tokens": 300,
            "best_of" :5,
            "n" :3,
            "frequency_penalty":0,
            "presence_penalty":0}


  for kwarg in myKwargs:
    kwargs[kwarg] = myKwargs[kwarg]

  r = openai.Completion.create(prompt=prompt, **kwargs)
  return r['choices'][0]['text']

In [34]:
def question_completions_with_backoff(passages): 
       
    question_prompts = ['''Generate 5 interactive and coherent questions about this context. The questions should not be repeated from the previous step. The questions should consist of reasoning and procedural steps. \n
                        The questions should be precise and factual. Start the question with a '[Q]' ''',
                        
                        '''Generate 5 objective, concise and firm questions about this context. The questions should not be repeated from the previous step. \n
                        The questions should begin with any of Why/How/Where/Who/When. Start the question with a '[Q]' ''' ,
                        
                        '''Generate 5 thoughtful and compelling, steps-based procedural questions about this context that start with Why or How. The questions should not be repeated from the previous step. \n
                        The questions should be unique and creative with an abstract and subjective aspect. Start the question with a '[Q]' ''' ]
    
    n=len(question_prompts)
    questions = []
    for p in passages:
        for j in question_prompts:
                #prompt_tokens = calculate_tokens(j)
                #context_tokens = calculate_tokens(p)
                #max_tokens = 300
                
                #while(max_tokens+prompt_tokens+context_tokens < 4096):
                prompt= "%s \n %s" % (j, p)
    
                response = response_API(prompt)
                
                questions.append(response)
                print(response)
                      
    question_list = [questions[i:i + n] for i in range(0, len(questions), n)]
    
    return question_list 

In [42]:
def get_answer(question, data):
    #most relevant passages
    result = search_index(question, data) #get most relevant passages where answer could be
    prompt = "PASSAGE - %s \n QUESTION - %s \nAnswer this question in 2-3 concise sentences based on the passage. Be objective in the answer given and explain in a few lines only.\n" % (result['content'], question)
    answer = response_API(prompt)
    print(answer)
    
    return answer, result['content']

start here

In [127]:
#sections data
s = open("GPT-3_section_level.json")
sections_data = json.load(s)

#full textbook embeddings - vectors
with open("index.json") as input_file:
    data = json.load(input_file)

In [36]:
#make a list of only texts from the json file
sections_list = []
for p, item in enumerate(sections_data):
    subtext = item['positive_ctxs']['text']
    sections_list.append(subtext)

In [None]:
#Generating questions 
question_list = question_completions_with_backoff(sections_list) 

In [38]:
#formatting questions
subq = []
for i in question_list:
    q = str(i).split('[Q]')
    subq.append(q)
    
questions_1d = []

for row in range(len(subq)):
    for col in range(len(subq[row])):
        questions_1d.append(subq[row][col])


In [171]:
#questions cleaning - 

def clean_questions(questions_1d):

    questions_cleaned = []
    questions_cleaned = [item for item in questions_1d if item!= """['\\n""" and item!= """['""" and item!= """['\\n\\n"""
                        and item!="""["\\n""" and item!="""['\\n\\nQ: """ and item!= """["\\n\\n"""
                        and item!= """['Q: """]

    ques = []
    for i in questions_cleaned:
        q = i.split('Q:')
        ques.append(q)

    questions_fin = []

    for row in range(len(ques)):
        for col in range(len(ques[row])):
            questions_fin.append(ques[row][col])

    questions = [item for item in questions_fin if item!= """['\\n""" and item!= """['""" and item!= """['\\n\\n"""
                        and item!="""["\\n""" and item!="""['\\n\\nQ: """ and item!= """["\\n\\n"""
                        and item!= """['Q: """ and item!= """['\\n"""]
    
    return questions



In [172]:
questions = clean_questions(questions_1d)

In [None]:
#Get answers using semantic search from textbook vector 
relevant_context = []
answers = []
for q in questions[0:650]:
    external_answer, context = get_answer(q, data)
    answers.append(external_answer)
    relevant_context.append(context)

In [206]:
#formatting the data
qa_data = []
for i, answer in enumerate(answers):
    data = {}
    data['textbook-paragraph'] = relevant_context[i]
    data['GPT-3-Semantic-Search-Generations'] = {}
    data['GPT-3-Semantic-Search-Generations']['question'] = questions[i]
    data['GPT-3-Semantic-Search-Generations']['answer'] = answer    
    qa_data.append(data)

In [208]:
with open('GPT-3_semantic_search.json', 'w', encoding='utf-8') as f:
    json.dump(qa_data, f, ensure_ascii=False, indent=4) 

In [4]:
len(qa)

454