In [2]:
# Install the notebook's pinned dependencies, then clear the cell output so
# pip's log does not clutter the notebook.
# `-q` keeps pip quiet and `-U` upgrades packages that are already present.
# NOTE(review): `tqdm` is the only package here without a version pin, and
# pinecone-client / datasets are not used by any cell visible in this
# notebook -- confirm they are still needed.
# `google.colab.output` is Colab-specific, so this cell only runs on Colab.
from google.colab import output
%pip install -qU \
    openai==0.27.7 \
    "pinecone-client[grpc]"==2.2.1 \
    datasets==2.12.0 \
    tqdm
output.clear()

In [2]:
!pip install -U sentence-transformers
!pip install --upgrade transformers
!pip install --upgrade protobuf


from google.colab import output
output.clear()
from sentence_transformers import SentenceTransformer
encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2') # out of the box sentence embedder from hugging face

In [3]:
import getpass
import os

import openai

# Never paste the secret into the notebook: it would be saved with the file
# and leak when the notebook is shared. Prefer the OPENAI_API_KEY environment
# variable and fall back to an interactive, non-echoing prompt.
key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')
openai.api_key = key

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# A later cell reads the TriviaQA archive from /content/drive/MyDrive.
# Colab-specific: this import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gzip
import os
import shutil
import tarfile

# Path to the .tar.gz archive on the mounted Drive and the local directory
# the archive is unpacked into.
tar_gz_file_path = '/content/drive/MyDrive/triviaqa-rc.tar.gz'
extraction_directory = '/content/trivia-qc'  # You can change this path

os.makedirs(extraction_directory, exist_ok=True)

# Open and extract the .tar.gz file.
with tarfile.open(tar_gz_file_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are available: refuse members that would escape
        # the destination directory (absolute paths, "..", unsafe links).
        tar.extractall(path=extraction_directory, filter='data')
    else:
        # Older Python without extraction filters: plain extraction.
        # Only use this with an archive you trust.
        tar.extractall(path=extraction_directory)

# Verify the extraction by listing what actually landed in the target directory.
print(sorted(os.listdir(extraction_directory)))


In [6]:

# Baseline request: send the bare question to the chat model, with no extra
# instructions, and dump the raw API response for inspection.
question = 'What is the current state-of-the art in question answering?'
prompt_start = '''
{question}.'''

# Single-turn conversation containing only the formatted question.
chat_messages = [
    {"role": "user", "content": prompt_start.format(question=question)},
]

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=chat_messages,
    stop=None,
)

print(question)
print(response)

What is the current state-of-the art in question answering?
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "The current state-of-the-art in question answering involves the use of advanced natural language processing and machine learning techniques. This includes methods such as deep learning, transformers, and language models. One of the most notable advancements in recent years has been the development of pre-trained language models, such as BERT (Bidirectional Encoder Representations from Transformers) and GPT (Generative Pre-trained Transformer), which have achieved impressive results in question answering tasks.\n\nThese models are trained on large amounts of text data and can understand the context and meaning of words and sentences. They can be fine-tuned for specific question answering tasks and are capable of answering a wide range of questions across different domains.\n\nAnother notable advancement is the use of n

In [5]:
## Ask ChatGPT (via the OpenAI API) to write a short background document from
## which the question could be answered.

# Open design question: which instruction should be prepended to the query?

question = 'What is the current state-of-the art in question answering?'
prompt_start = ''' Generate a background document to answer the given question in less than 200 words:
{question}.'''

# Single-turn conversation: the instruction followed by the question.
chat_messages = [
    {"role": "user", "content": prompt_start.format(question=question)},
]

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=chat_messages,
    stop=None,
)

# Only the generated text of the first choice is of interest here.
print(response.choices[0].message.content)

Question answering (QA) has been a focus of research in the field of natural language processing (NLP) for several years. The current state-of-the-art in QA has seen significant advancements, mainly due to the development of deep learning models and the availability of large-scale datasets.

One of the most notable advancements in QA is the introduction of transformer-based models, particularly the BERT (Bidirectional Encoder Representations from Transformers) model. BERT has revolutionized QA by pretraining a language model on massive amounts of text data, enabling it to understand the context and relationships between words. This has led to significant improvements in accuracy and performance.

Another significant development is the use of transfer learning techniques. Instead of training models from scratch, researchers have leveraged pretrained models on large-scale language tasks like language modeling, masked language modeling, and next sentence prediction. These models are then 

In [4]:
import os

# Define a function for recursive directory traversal
def collect_file_names(directory):
    """Recursively collect the paths of all regular files below ``directory``.

    Directory entries are visited in sorted order so the returned list is
    deterministic across runs and file systems. ``os.listdir`` alone returns
    entries in arbitrary order, which would make every index derived from
    this list non-reproducible.

    Args:
        directory: Path of the directory to traverse.

    Returns:
        A list of file paths, each built by joining ``directory`` with the
        entry names. Files and sub-directories are interleaved in sorted name
        order, and a sub-directory contributes its files at its own position.
        Entries that are neither a file nor a directory are skipped.

    Raises:
        OSError: If ``directory`` (or a sub-directory) cannot be listed.
    """
    file_names = []

    for item in sorted(os.listdir(directory)):
        item_path = os.path.join(directory, item)

        if os.path.isfile(item_path):
            file_names.append(item_path)
        elif os.path.isdir(item_path):
            # Descend and splice the sub-directory's files in place.
            file_names.extend(collect_file_names(item_path))

    return file_names

In [5]:
import json
import random
import numpy as np
import openai
import nltk
import os
from tqdm import tqdm
nltk.download('punkt')

# --- configuration -----------------------------------------------------------
corpus_path = '/content/trivia-qc/evidence/wikipedia'
batch_size = 500            # number of files embedded per encoder call
max_words_per_chunk = 400   # maximum number of tokens per text chunk
# NOTE(review): sentence encoders typically truncate inputs longer than their
# maximum sequence length -- confirm that 400-token chunks are not silently
# cut short by `encoder`.


def split_into_chunks(words, max_words):
    """Join ``words`` into consecutive space-separated chunks.

    Args:
        words: List of tokens.
        max_words: Maximum number of tokens per chunk.

    Returns:
        A list of strings. Every chunk holds ``max_words`` tokens except
        possibly the last one, and an empty ``words`` list yields ``[]``.
    """
    return [' '.join(words[start:start + max_words])
            for start in range(0, len(words), max_words)]


# Every evidence file under `corpus_path` is embedded; the QA json
# (/content/trivia-qc/qa/wikipedia-train.json) is not needed for this step.
vector_db = []
files = collect_file_names(corpus_path)
file_batches = [files[i:i + batch_size] for i in range(0, len(files), batch_size)]
print('start embedding')

# The loop variable is deliberately not called `files`, so the full file list
# stays available after the loop.
for file_batch in tqdm(file_batches):
    batchchunks = []    # chunk texts of this batch
    batch_sources = []  # batch_sources[k] is the file batchchunks[k] came from

    for filename in file_batch:
        try:
            with open(filename, encoding='utf-8') as text_file:
                words = nltk.word_tokenize(text_file.read())
        except Exception as e:
            # Skip only the unreadable file; the rest of the batch is still
            # embedded.
            print(f'skipping {filename}: {e}')
            continue

        chunked_texts = split_into_chunks(words, max_words_per_chunk)
        batchchunks += chunked_texts
        batch_sources += [filename] * len(chunked_texts)

    if not batchchunks:
        # Nothing readable in this batch, so there is nothing to embed.
        continue

    # Embed the whole batch at once, then store one record per chunk with the
    # file that chunk was actually read from.
    embeddings = encoder.encode(batchchunks)
    for embedding, source, text in zip(embeddings, batch_sources, batchchunks):
        vector_db.append({
            'embedding': embedding.tolist(),
            'meta': {
                'sources': source,
                'text': text,
            },
        })

print('done embedding')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


start embedding


100%|██████████| 149/149 [49:00<00:00, 19.74s/it]

done embedding





In [None]:
# Persist the in-memory vector store as JSON so it can be reloaded later
# without re-embedding the corpus.
print(type(vector_db))
import pickle

# Destination of the serialised vector store.
file_path = '/content/vector_db.json'  # You can change the file path as needed

with open(file_path, 'w') as out_handle:
    json.dump(vector_db, out_handle)

# Report where the data was written.
print(f"Dictionary saved to {file_path}")


<class 'list'>


In [None]:
import numpy as np

# `vector_db` is a list of dicts. Pull out each record's embedding and stack
# them row-wise, so that row i of the matrix corresponds to vector_db[i].
embedding_rows = [entry['embedding'] for entry in vector_db]
embedding_matrix = np.vstack(embedding_rows)

# Keep a copy on disk so the matrix can be reloaded without rebuilding it.
np.save('/content/embedding_matrix.npy', embedding_matrix)

In [None]:
import json
import os
import time

import numpy as np
from tqdm import tqdm

# --- configuration -----------------------------------------------------------
qa_path = '/content/trivia-qc/qa/wikipedia-train.json'
max_questions = None  # set to an int to evaluate only the first N questions

# Load the questions here: no active cell defines `trivia_data`, so relying
# on it would only work with leftover kernel state.
with open(qa_path) as json_file:
    trivia_data = json.load(json_file)['Data']
if max_questions is not None:
    trivia_data = trivia_data[:max_questions]

accuracyq = 0    # hits when retrieving with the raw question
accuracyfc = 0   # hits when retrieving with the generated background document
evaluated = 0    # questions that made it through generation and embedding
total = len(trivia_data)


def process_question(question):
    """Ask the chat model for a short background document for ``question``.

    Args:
        question: The question text.

    Returns:
        The generated document (content of the first choice), or ``None``
        when the API call fails. The failure is printed rather than silently
        swallowed.
    """
    prompt_start = ''' Generate a background document to answer the given question in less than 200 words:
    {question}.'''
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt_start.format(question=question)}],
            stop=None
        )
    except Exception as e:
        print(f'chat completion failed: {e}')
        return None
    return response.choices[0].message.content


# Rank by cosine similarity: a raw dot product on unnormalised vectors favours
# rows with a large norm. Row norms are computed once, without materialising
# a second full-size matrix; all-zero rows get norm 1 to avoid dividing by 0.
row_norms = np.sqrt(np.einsum('ij,ij->i', embedding_matrix, embedding_matrix))
row_norms[row_norms == 0] = 1.0

print('starting query boosting')
for json_obj in tqdm(trivia_data):
    question = json_obj["Question"]
    # Compare by base name: the index stores full paths under the evidence
    # directory, while the QA json lists file names.
    question_sources = {os.path.basename(page['Filename'])
                        for page in json_obj["EntityPages"]}

    fake_context = process_question(question)
    if fake_context is None:
        continue

    # Embed both queries with the same `encoder` that built the index.
    # Vectors from a different embedding model are not comparable with the
    # rows of `embedding_matrix`.
    try:
        fake_context_embedding, question_embedding = encoder.encode([fake_context, question])
    except Exception as e:
        print(f'embedding failed: {e}')
        continue

    # Nearest chunk for each query.
    resfc = np.dot(embedding_matrix, fake_context_embedding) / row_norms
    max_indexfc = np.argmax(resfc)

    resq = np.dot(embedding_matrix, question_embedding) / row_norms
    max_indexq = np.argmax(resq)

    bestq = vector_db[max_indexq]
    bestfc = vector_db[max_indexfc]

    # A hit means the retrieved chunk comes from one of the question's
    # evidence pages.
    if os.path.basename(bestq['meta']['sources']) in question_sources:
        accuracyq += 1
    if os.path.basename(bestfc['meta']['sources']) in question_sources:
        accuracyfc += 1
    evaluated += 1

# Divide by the number of questions actually evaluated, so that failed API
# calls do not silently count as retrieval misses.
print(f'evaluated {evaluated} of {total} questions')
if evaluated:
    print('ACCURACY BASE: ', accuracyq / evaluated)
    print('ACCURACY BOOSTED: ', accuracyfc / evaluated)
else:
    print('no questions could be evaluated')

