In [1]:
import requests
from io import BytesIO
from PyPDF2 import PdfReader
import openai
import re
import json
import os
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity

In [None]:
# Address of the PDF that the cells below download and parse.
arxiv_url = "https://arxiv.org/pdf/2110.01111.pdf"

def get_pdf_object(url):
    """Download a PDF and wrap it in a PyPDF2 reader.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        A ``PdfReader`` built from the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Fetch the address that was passed in. The earlier body read the global
    # ``arxiv_url`` here, so the ``url`` argument was silently ignored.
    response = requests.get(url)
    # Stop on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()
    return PdfReader(BytesIO(response.content))

def get_pdf_text(url):
    """Download a PDF and return the text of all its pages as one string.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        The per-page ``extract_text()`` results concatenated in page order,
        with no separator added between pages.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Fetch the address that was passed in. The earlier body read the global
    # ``arxiv_url`` here, so the ``url`` argument was silently ignored.
    response = requests.get(url)
    # Stop on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()
    reader = PdfReader(BytesIO(response.content))
    # Newlines are deliberately left in place: the header and paragraph
    # regexes applied to this text later in the notebook match on "\n".
    return ''.join(page.extract_text() for page in reader.pages)

# Fetch the paper twice: once flattened to text, once as a reader object.
# Each helper issues its own HTTP request.
corpus = get_pdf_text(arxiv_url)
pdf = get_pdf_object(arxiv_url)
# NOTE(review): this bare expression renders the entire extracted text as the cell output.
corpus

In [None]:
def parse_paper(pdf):
  """Group the text of every page into runs that share one font size.

  Each page is read through ``extract_text`` with a visitor callback. A
  fragment is kept when its y position (``tm[5]``) lies strictly between 50
  and 720 and its stripped text is longer than one character. Consecutive
  kept fragments with the same font size are joined with single spaces.

  Args:
    pdf: Object exposing ``pages``; each page must provide
      ``extract_text(visitor_text=...)``.

  Returns:
    A list of dicts with keys ``'fontsize'``, ``'text'`` and ``'page'``
    (zero-based page index), in reading order.
  """
  print("Parsing paper")
  number_of_pages = len(pdf.pages)
  print(f"Total number of pages: {number_of_pages}")
  paper_text = []
  for i, page in enumerate(pdf.pages):
    page_text = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
      stripped = text.strip()
      x = tm[4]
      y = tm[5]
      # ignore header/footer
      if 50 < y < 720 and len(stripped) > 1:
        page_text.append({
          'fontsize': fontSize,
          'text': stripped.replace('\x03', ''),
          'x': x,
          'y': y
        })

    page.extract_text(visitor_text=visitor_body)

    blob_font_size = None
    blob_text = ''

    for t in page_text:
      if t['fontsize'] == blob_font_size:
        blob_text += f" {t['text']}"
        continue
      # Font size changed: emit the finished run before starting a new one.
      if blob_font_size is not None and len(blob_text) > 1:
        paper_text.append({
          'fontsize': blob_font_size,
          'text': blob_text,
          'page': i
        })
      blob_font_size = t['fontsize']
      blob_text = t['text']

    # Emit the run still being accumulated when the page ends. Without this
    # the final run of every page (often a whole page of body text set in a
    # single font size) was dropped.
    if blob_font_size is not None and len(blob_text) > 1:
      paper_text.append({
        'fontsize': blob_font_size,
        'text': blob_text,
        'page': i
      })
  return paper_text

In [None]:
# Font-size-grouped text runs extracted from the downloaded PDF.
paper = parse_paper(pdf)

In [12]:
def paper_df(pdf, min_chars=30):
    """Build a DataFrame from parsed text runs, dropping the short ones.

    Args:
        pdf: Iterable of dicts that each carry a ``'text'`` entry (the other
            entries are copied through as columns).
        min_chars: Rows whose text is shorter than this are discarded.

    Returns:
        A DataFrame with one row per surviving dict plus a ``'length'``
        column holding the character count of ``'text'``. When nothing
        survives, an empty frame with the columns ``fontsize``, ``text``,
        ``page`` and ``length`` is returned.
    """
    kept_rows = [row for row in pdf if len(row['text']) >= min_chars]
    if kept_rows:
        df = pd.DataFrame(kept_rows)
    else:
        # A frame built from an empty list has no columns at all, so the
        # 'text' lookup below used to raise KeyError. Give it the expected
        # schema instead.
        df = pd.DataFrame(columns=['fontsize', 'text', 'page'])
    df['length'] = df['text'].apply(len)
    print(df.shape)
    return df



In [None]:
# Tabulate the longer text runs and preview the first rows.
df = paper_df(paper)
df.head()

In [None]:
def calculate_embeddings(df, embedding_model="text-embedding-ada-002", embed_fn=None):
    """Add an ``embeddings`` column computed from the ``text`` column.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.
        embedding_model: Model name forwarded to the embedding function as
            ``engine``.
        embed_fn: Callable invoked as ``embed_fn(text, engine=...)`` for each
            row. Defaults to ``get_embedding`` from ``openai.embeddings_utils``.

    Returns:
        The same ``df`` object, now carrying the ``embeddings`` column.
    """
    embed = get_embedding if embed_fn is None else embed_fn
    # Pass the callable itself. Wrapping it in a list makes Series.apply
    # return a DataFrame (one column per function) rather than a Series.
    df["embeddings"] = df.text.apply(lambda x: embed(x, engine=embedding_model))
    return df

# Adds the "embeddings" column to df in place; the returned frame is also rendered as this cell's output.
calculate_embeddings(df)

In [None]:
def search_reviews(df, query, n=3, pprint=True):
    """Return the ``n`` rows of ``df`` most similar to ``query``.

    The query is embedded with ``text-embedding-ada-002`` and compared by
    cosine similarity against each entry of ``df.embeddings``.

    Args:
        df: DataFrame with an ``embeddings`` column. A ``similarity`` column
            is written onto it as a side effect.
        query: Text to search for.
        n: Number of rows to return.
        pprint: Accepted but not used by this function.

    Returns:
        The top ``n`` rows ordered by descending similarity, re-indexed
        from zero.
    """
    query_vector = get_embedding(query, engine="text-embedding-ada-002")

    def score(row_vector):
        return cosine_similarity(row_vector, query_vector)

    df["similarity"] = df.embeddings.apply(score)
    ranked = df.sort_values("similarity", ascending=False, ignore_index=True)
    return ranked.head(n)

In [26]:
# Load a frame of embeddings from disk (presumably written by an earlier run — confirm).
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns in the output below suggest the index
# was written into the CSV on save — confirm; the embeddings column presumably loads as strings.
test = pd.read_csv("embeddings.csv")
test.head(5)

Unnamed: 0.1,Unnamed: 0,fontsize,text,page,length,embeddings
0,0,1.0,Common Errors Cheat Sheet Students frequently ...,0,314,"[-0.007276198361068964, 0.03181527554988861, 0..."
1,7,1.0,Incorrect: Many red foxes live here. The noctu...,1,307,"[0.01625715009868145, -0.0023440392687916756, ..."
2,18,1.0,so that we can coexist. Incomplete and run on ...,1,375,"[0.0050130984745919704, 0.003571423003450036, ..."
3,23,1.0,Passive: The pasta was eat en by Josephine. Ac...,2,328,"[-0.004926593974232674, -0.0034326203167438507..."
4,42,1.0,it unclearly; the word could refer to article ...,2,352,"[0.0018352309707552195, 0.000923340383451432, ..."


In [24]:
# test = test.drop_duplicates(subset=['text', 'page'], keep='first')

In [25]:
# Show the loaded frame.
# NOTE(review): this cell and its neighbours carry execution counts 26, 24, 25 in that order,
# so the saved output may not match a top-to-bottom run.
test

Unnamed: 0.1,Unnamed: 0,fontsize,text,page,length,embeddings
0,0,1.0,Common Errors Cheat Sheet Students frequently ...,0,314,"[-0.007217978592962027, 0.03192491456866264, 0..."
7,7,1.0,Incorrect: Many red foxes live here. The noctu...,1,307,"[0.016169091686606407, -0.0023459333460778, -0..."
18,18,1.0,so that we can coexist. Incomplete and run on ...,1,375,"[0.005041639320552349, 0.0035730735398828983, ..."
23,23,1.0,Passive: The pasta was eat en by Josephine. Ac...,2,328,"[-0.004948729649186134, -0.0033812588080763817..."
42,42,1.0,it unclearly; the word could refer to article ...,2,352,"[0.0018352309707552195, 0.000923340383451432, ..."
49,49,1.0,He assured me that the work had been completed...,3,307,"[0.009392737410962582, -0.010591392405331135, ..."
57,57,1.0,"Impact affect, and effect Impact is a percussi...",4,317,"[-0.024574613198637962, 0.0024615300353616476,..."
77,77,1.0,affect versus effect Affect is the verb and ef...,4,323,"[-0.01929573528468609, 0.009030453860759735, 0..."
84,84,1.0,Who and that. Use who to refer to a person; us...,5,309,"[-0.00878846738487482, 0.008603863418102264, 0..."
91,91,1.0,If you join two complete sentences with a comm...,6,302,"[-0.0018168585374951363, 0.0324668250977993, 0..."


In [None]:
# Rank the paper's text runs against a sample question and show the three best matches.
results = search_reviews(df, "How many swaps can the algorithm make?", n=3)
results

In [None]:
# number_of_pages = len(pdf.pages)
# paper_text = []
# for i in range(number_of_pages):
#     page = pdf.pages[i]
#     page_text = []

#     def visitor_body(text, cm, tm, fontDict, fontSize):
#         x = tm[4]
#         y = tm[5]
#         # ignore header/footer
#         if (y > 50 and y < 720) and (len(text.strip()) > 1):
#             page_text.append({
#             'fontsize': fontSize,
#             'text': text.strip().replace('\x03', ''),
#             'x': x,
#             'y': y
#             })

#     _ = page.extract_text(visitor_text=visitor_body)

#     blob_font_size = None
#     blob_text = ''
#     processed_text = []

#     for t in page_text:
#         if t['fontsize'] == blob_font_size:
#             blob_text += f" {t['text']}"
#         else:
#             if blob_font_size is not None and len(blob_text) > 1:
#                 processed_text.append({
#                     'fontsize': blob_font_size,
#                     'text': blob_text,
#                     'page': i
#                 })
#             blob_font_size = t['fontsize']
#             blob_text = t['text']
#         paper_text += processed_text
# print("Done parsing paper")

# filtered_pdf= []
# for row in paper_text:
#     if len(row['text']) < 30:
#         continue
#     filtered_pdf.append(row)
# df = pd.DataFrame(filtered_pdf)
# # print(df.head())
# df['length'] = df['text'].apply(lambda x: len(x))
# # print(df.shape)
# print('Done creating dataframe')

# openai.api_key = os.getenv('OPENAI_API_KEY')
# embedding_model = "text-embedding-ada-002"
# print('Calculating embeddings')
# print(df.text)
# embeddings = df.text.apply([lambda x: get_embedding(x, engine=embedding_model)])
# df["embeddings"] = embeddings
# print('Done calculating embeddings')
# user_input = "What is GPT-3"
# query_embedding = get_embedding(
#     user_input,
#     engine="text-embedding-ada-002"
# )
# df["similarity"] = df.head().embeddings.apply(lambda x: cosine_similarity(x, query_embedding))

# results = (
#     df.sort_values("similarity", ascending=False)
    
# )

# prompt = """You are a large language model whose expertise is reading and summarizing scientific papers. 
#     You are given a query and a series of text embeddings from a paper in order of their 
#     cosine similarity to the query. You must take the given embeddings and return a very detailed summary of the paper 
#     that answers the query.
    
#     Given the query"""+ user_input + """and the following embeddings: 
    
#     1.""" + results.iloc[0] + """
#     2.""" + results.iloc[1] + """
#     3.""" + results.iloc[3] + """

#     Return a detailed answer based on the paper that answers the query."""
# print('Generating response from GPT-3')

# openai.api_key = os.getenv('OPENAI_API_KEY')
# r = openai.Completion.create(model="text-davinci-003", prompt=prompt, temperature=0.4, max_tokens=2000)
# response = r.choices[0]['text']



In [None]:
def section_headers(corpus):
    """Pick out lines of ``corpus`` that look like numbered section titles.

    A candidate is the text that follows a line-leading number and
    whitespace, up to the next newline. It is kept when it starts with an
    uppercase character, is not purely numeric, and has at most four words.

    Args:
        corpus: Text to scan; newlines must still be present.

    Returns:
        The accepted titles (without their numbers) in order of appearance.
    """
    candidates = re.findall(r'\n\d+\s+(.*?)\n', corpus)
    # title[:1] rather than title[0]: the pattern can capture an empty
    # string (a number followed only by blank lines), and indexing that
    # raised IndexError.
    return [
        title
        for title in candidates
        if title[:1].isupper() and not title.isnumeric() and len(title.split()) <= 4
    ]

# Candidate section titles found in the flattened paper text.
headers = section_headers(corpus)
headers

In [None]:
# section_texts = re.findall(r'\n\d+\s+(.*?)\n(.*?)\n', corpus, re.DOTALL)
# section_texts

In [None]:
# Capture (header, body, following header) for each numbered section.
# The following header is matched inside a lookahead so it is not consumed:
# with a plain match it was swallowed, the next search resumed after it, and
# every second section body was skipped.
paragraphs = re.findall(r'\n\d+\s+(.*?)\n(.*?)(?=\n\d+\s+(.*?)\n)', corpus, re.DOTALL)
paragraphs

In [None]:
# combined character count of the captured section bodies
sum(len(section[1]) for section in paragraphs)

In [None]:
# Assemble the completion prompt from a query and three retrieved passages.
# NOTE(review): query, embedding1, embedding2 and embedding3 are not assigned in any cell
# visible here — confirm where they come from; otherwise this raises NameError on a fresh kernel.
prompt = """You are a large language model whose expertise is reading and summarizing scientific papers. 
            You are given a query and a series of text embeddings from a paper in order of their 
            cosine similarity to the query. You must take the given embeddings and return a very detailed summary of the paper 
            that answers the query.
            
            Given the query"""+ query + """and the following embeddings: 
            
            1.""" + embedding1 + """
            2.""" + embedding2 + """
            3.""" + embedding3 + """

            Return a detailed answer based on the paper that answers the query."""

In [None]:
def gpt(prompt):
    """Send ``prompt`` to the OpenAI completion endpoint and return the reply text.

    The API key is read from the ``OPENAI_API_KEY`` environment variable on
    every call. The request uses model ``text-davinci-003`` with temperature
    0.4 and a 2000-token cap.

    Args:
        prompt: Prompt text to complete.

    Returns:
        The ``'text'`` field of the first returned choice.
    """
    openai.api_key = os.getenv('OPENAI_API_KEY')
    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.4,
        max_tokens=2000,
    )
    first_choice = completion.choices[0]
    return first_choice['text']

In [None]:
# Model reply for the prompt assembled above; it is later split into chunks by chop().
summary = gpt(prompt)


In [None]:
def summarize(corpus):
    """Ask the model for an enthusiastic, detailed summary of ``corpus``.

    The text is presented to the model as the abstract of a machine learning
    paper, and the reply is flattened onto a single line.

    Args:
        corpus: Text to summarize.

    Returns:
        The model's reply with every newline character removed.
    """
    request = """Here is the abstract from a recent machine learning paper:'"""+corpus+"""'
            Summarize the above content in detail in the style of an excited professor talking about this cool paper he just read.
            Summarized content:"""
    reply = gpt(request)
    return reply.replace('\n', '')

In [None]:
# summary = "I just read an amazing paper on a new machine learning algorithm called Dreamer V three. It's a general and scalable algorithm based on world models, and it outperforms previous approaches across a wide variety of domains. It can handle continuous and discrete actions, visual and low-dimensional inputs, two D and three D worlds, different data budgets, reward frequencies, and reward scales. Plus, it has favorable scaling properties, with larger models resulting in higher data-efficiency and performance. And here's the best part: it's the first algorithm to collect diamonds in Minecraft from scratch without human data or curricula - a long-standing challenge in artificial intelligence. This algorithm makes reinforcement learning broadly applicable and allows scaling to hard decision-making problems. It's really exciting and I can't wait to see what else it can do!"

In [None]:
def chop(text, chunk_size):
    """Split ``text`` at whitespace into chunks of at most ``chunk_size`` characters.

    Words are never broken. Every word is followed by one space, so each
    chunk ends with a trailing space that counts toward its length. A single
    word longer than ``chunk_size`` gets a chunk of its own, which therefore
    exceeds the limit.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length in characters.

    Returns:
        The chunks in order. No chunk is ever empty; empty or
        whitespace-only ``text`` gives ``[]``.
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        # +1 is the space appended after the word. Only close a chunk that
        # holds something: an oversized word arriving at an empty buffer
        # used to push a spurious "" chunk.
        if current_chunk and len(current_chunk) + len(word) + 1 > chunk_size:
            chunks.append(current_chunk)
            current_chunk = ""
        current_chunk += word + " "
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Split the summary into pieces of at most 2000 characters.
chunks = chop(summary, 2000)

In [None]:
# Character count of the final chunk.
len(chunks[-1])

In [None]:
# Audio files passed to cast() as its "target_file" value.
targets = [
    "https://mukuls-public-playground.s3.us-east-2.amazonaws.com/jre.mp3",
    "https://mukuls-public-playground.s3.us-east-2.amazonaws.com/jre2.mp3",
    "https://mukuls-public-playground.s3.us-east-2.amazonaws.com/jre3.mp3",
]

In [None]:
def cast(text, targets, api_key=None):
  """Send ``text`` to the text-to-speech service and return the audio bytes.

  Args:
    text: Text to be spoken.
    targets: Value sent as ``target_file`` in the request body.
    api_key: Key for the service. When omitted it is read from the
      ``TTS_API_KEY`` environment variable.

  Returns:
    The raw response body. According to the original author's note this is
    a 24kHz 32-bit PCM WAV file.

  Raises:
    ValueError: If no API key was supplied and ``TTS_API_KEY`` is unset.
    requests.HTTPError: If the service answers with an error status.
  """
  # SECURITY: the key used to be a string literal in this function. It is
  # now taken from the caller or the environment, matching how the OpenAI
  # key is handled in this notebook. The previously committed key should be
  # treated as leaked and rotated.
  if api_key is None:
    api_key = os.getenv("TTS_API_KEY")
  if not api_key:
    raise ValueError(
      "No TTS API key available: pass api_key or set the TTS_API_KEY environment variable"
    )

  headers = {
    "Content-Type": "application/json",
    "Connection": "keep-alive",
    "Keep-Alive": "timeout=1000, max=100",
  }

  data = json.dumps({
    "api_key": api_key,
    "text": text,
    "voices": "",
    "target_file": targets,
  })

  response = requests.post(
    'https://vatsalaggarwal--tts-app.modal.run',
    headers=headers,
    data=data
  )
  # Without this check an error body would be returned as if it were audio.
  response.raise_for_status()

  # Returned audio is 24kHz 32-bit PCM WAV file
  return response.content

In [None]:
# # Concatenate the audio files

# from pydub import AudioSegment

# def concatenate_audio_files(file_names, output_file_name):
#     # Initialize an empty audio segment
#     output = AudioSegment.empty()
#     # Iterate over the file names
#     for file_name in file_names:
#         # Load the audio file
#         audio = AudioSegment.from_wav(file_name)
#         # Concatenate the audio file to the output
#         output += audio
#     # Save the output to a new file in the folder 'outputs' that is located in the same directory as this notebook
#     output.export('outputs/' + output_file_name, format='wav') 




# # Get the file names of the audio files
# file_names = ['jre.wav', 'jre2.wav', 'jre3.wav']

# concatenate_audio_files(file_names, 'jre_concat.wav')