In [1]:
#@title Overview

#@markdown The main purpose of this notebook is to let users easily query a large body of text relevant to their project. In the future, a visualization tool may also be added so that users can view their knowledge space in two dimensions and reveal potential connections.

In [2]:
#@title User-Defined Variables
search_query = "How do you set the PID parameters in constant deflection OBD AFM scanning?" #@param
chapter_mapper_folder = "/content/drive/MyDrive/chapter-mapper" #@param
pdf_folder = "fangzhou-xia" #@param
chunk_size = 256 #@param
chunk_overlap = 0.1 #@param
number_results = 3 #@param
openai_api_key = "" #@param
save_name_suffix = "[with-embeddings]" #@param
COMPLETIONS_MODEL = "gpt-3.5-turbo" #@param
EMBEDDING_MODEL = "text-embedding-ada-002"#@param

#@markdown ---
#@markdown ### **Help**
#@markdown - **`search_query`** - *`str; The question or information you would like to search for that is related to the text contained in pdf_folder.`*
#@markdown - **`chapter_mapper_folder`** - *`str; The full path to the chapter-mapper root folder. The notebook changes its working directory to this folder.`*
#@markdown - **`pdf_folder`** - *`str; The name of the folder which contains all the PDFs you want to analyze.`*
#@markdown - **`chunk_size`** - *`int; How many words you want each chunk of text to contain (approximately). Default is 256.`*
#@markdown - **`chunk_overlap`** - *`float; How much overlap you want each chunk of text to have with the next chunk. Default is 0.1.`*
#@markdown - **`number_results`** - *`int; How many search results you want to see in the plot. Default is 3.`*
#@markdown - **`openai_api_key`** - *`str; Your (free trial) OpenAI API Key. Clear this field before sharing or committing the notebook.`*
#@markdown - **`save_name_suffix`** - *`str; String to add to the end of the csv file name for saving purposes. Default is '[with-embeddings]'.`*
#@markdown - **`COMPLETIONS_MODEL`** - *`str; The name of the model to use for answering prompts. Default is 'gpt-3.5-turbo' from OpenAI.`*
#@markdown - **`EMBEDDING_MODEL`** - *`str; The name of the model to use for obtaining text embeddings. Default is 'text-embedding-ada-002' from OpenAI.`*

In [3]:
#@title Mount Drive
from google.colab import drive, files
from IPython.display import HTML
import os
import sys
import shutil
import subprocess
output = subprocess.run(["pip", "list"], capture_output=True)
default_packages = output.stdout.decode().strip().split("\n")
drive.mount('/content/drive')
!mkdir /content/drive/MyDrive/python-packages
%cd '/content/drive/MyDrive/python-packages'
sys.path.append('/content/drive/MyDrive/python-packages')

Mounted at /content/drive
mkdir: cannot create directory ‘/content/drive/MyDrive/python-packages’: File exists
/content/drive/MyDrive/python-packages


In [4]:
#@title Install Required Packages
import importlib

DRIVE_PACKAGES_DIR = "/content/drive/MyDrive/python-packages"


def _pip_list_lines():
  """Return the lines of `pip list` output for the current environment."""
  listing = subprocess.run(["pip", "list"], capture_output=True)
  return listing.stdout.decode().strip().split("\n")


def _write_requirements():
  """Write `pip freeze` to requirements.txt inside `chapter_mapper_folder`.

  The file is opened in a `with` block so the handle is closed as soon as pip
  exits (the previous inline `open(...)` calls were never closed).

  Returns:
    The CompletedProcess of the `pip freeze` call.
  """
  with open(f"{chapter_mapper_folder}/requirements.txt", 'w') as requirements_file:
    return subprocess.run(['pip', 'freeze'], stdout=requirements_file)


def _ensure_package(import_name, pip_name, known_lines):
  """Install `pip_name` when `import_name` cannot be imported, then mirror it to Drive.

  Args:
    import_name: Module name used in the `import` statement (e.g. "fitz").
    pip_name: Distribution name passed to `pip install` (e.g. "pymupdf").
    known_lines: Set of `pip list` lines already accounted for.

  Returns:
    The set of `pip list` lines to use as the baseline for the next package
    (unchanged when nothing had to be installed).
  """
  try:
    importlib.import_module(import_name)
    return known_lines
  except Exception:
    # Any import failure triggers a (re)install, as before, but
    # KeyboardInterrupt/SystemExit are no longer swallowed by a bare `except:`.
    pass

  subprocess.run(["pip", "install", pip_name])
  current_lines = set(_pip_list_lines())
  _write_requirements()

  # Only lines that are new since the baseline are mirrored. Every package now
  # uses the same rule; the old tiktoken/fpdf branches also re-installed lines
  # that had disappeared from `pip list` (i.e. stale versions).
  for line in sorted(current_lines - known_lines):
    fields = line.split()
    if len(fields) < 2:
      continue  # not a "name version" row
    subprocess.run(["pip", "install", f"{fields[0]}=={fields[1]}", "--no-deps", f"--target={DRIVE_PACKAGES_DIR}"])

  # Let the import system notice the freshly installed modules.
  importlib.invalidate_caches()
  return current_lines


package_lines = set(default_packages)
for import_name, pip_name in [("openai", "openai"), ("fitz", "pymupdf"), ("tiktoken", "tiktoken"), ("fpdf", "fpdf")]:
  package_lines = _ensure_package(import_name, pip_name, package_lines)

# Bind the modules in the notebook namespace (later cells use `openai` directly).
import openai
import fitz
import tiktoken
import fpdf

_write_requirements()

CompletedProcess(args=['pip', 'freeze'], returncode=0)

In [5]:
#@title Change Working Directory to `chapter_mapper_folder`
# Switch to the project root chosen in the form. Later cells use paths relative
# to it (the raw-data CSV is reported as "fangzhou-xia/..." and the HTML export
# is written with a relative name).
# NOTE(review): presumably this is also what lets `from utils import *` resolve - confirm.
%cd {chapter_mapper_folder}

/content/drive/MyDrive/chapter-mapper


In [6]:
#@title Import Dependencies
import numpy as np
import colorsys
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import fpdf
# Wildcard import: the pipeline helpers called in later cells are not defined in
# this notebook and presumably come from here. The import order is kept as-is
# because later imports may intentionally shadow names exported by `utils`.
from utils import *
from sklearn.manifold import TSNE
# Imported explicitly so this cell no longer depends on the install cell having
# bound the name `openai` as a side effect.
import openai
# NOTE(review): `openai.embeddings_utils` is not shipped with openai>=1.0, and the
# install cell does not pin a version - confirm and consider pinning openai<1.
from openai.embeddings_utils import get_embedding, cosine_similarity
from typing import List, Dict, Tuple
from tqdm import tqdm
from IPython.display import Markdown
import pickle
openai.api_key = openai_api_key

In [7]:
#@title Collect Data from all PDF's in `pdf_folder`
# Unpacks the helper's two return values: the initial data frame and the file
# prefix that the next two cells pass on to their helpers. On the recorded run
# the helper printed "Loading raw data from ...-raw-data.csv...".
# NOTE(review): `collect_pdf_folder_data` is presumably provided by `from utils import *` - confirm.
df_init, file_prefix = collect_pdf_folder_data(pdf_folder,chunk_size,chunk_overlap)













Loading raw data from fangzhou-xia/fangzhou-xia-cs=256-co=0.10-raw-data.csv...


In [8]:
#@title Create Embeddings for the Raw Data
# Rebinds `df_init` to the helper's return value, so re-running this cell alone
# feeds the already-processed frame back in.
# NOTE(review): confirm `get_batched_embeddings` tolerates being called on its own output.
df_init = get_batched_embeddings(df_init, pdf_folder, file_prefix, save_name_suffix, openai_api_key)

In [9]:
#@title Obtain TSNE Matrix Data for 2D Map Visualization
# Unpacks three values; all of them (`df`, `dm`, `all_titles`) are handed to
# `plotmap` in the plotting cell, and `df` is also used to answer the query.
df, dm, all_titles = get_tsne_plot_params(df_init,pdf_folder,file_prefix,save_name_suffix)

In [10]:
#@title Plot Results and Answer Question based on Text Data
# Keep whatever `plotmap` returns under the name `fig`: the "Save the Plot" cell
# reads `fig`, which was never bound before (it failed with NameError).
# NOTE(review): assumes `plotmap` returns the plotly figure - confirm in utils.
fig = plotmap(df,search_query,number_results,dm,all_titles,pdf_folder,EMBEDDING_MODEL)
# A bare last expression preserves the cell's previous rich display of the return value.
fig

Output hidden; open in https://colab.research.google.com to view.

In [11]:
#@title Answer the Question Based on Relevant Text
# Build the text to render in both branches. Previously `test` was only assigned
# for a non-empty query, so an empty `search_query` raised NameError (or silently
# re-displayed the answer left over from an earlier run).
if len(search_query) > 0:
  test = answer_query_with_context(search_query, df, COMPLETIONS_MODEL=COMPLETIONS_MODEL, EMBEDDING_MODEL=EMBEDDING_MODEL)
  response_markdown = "<br>**ChatGPT Response**:<br>" + test
else:
  response_markdown = "No `search_query` was provided, so no answer was requested."

Markdown(response_markdown)

Converting string embeddings to a list of floats...
Selected 3 document sections:
p. 41, fangzhou-xia/Xia et al. - Active Probe Atomic Force Microscopy.pdf
p. 281, fangzhou-xia/Xia et al. - Active Probe Atomic Force Microscopy.pdf
p. 281, fangzhou-xia/Xia et al. - Active Probe Atomic Force Microscopy.pdf


<br>**ChatGPT Response**:<br>AFM users typically follow empirical rules to adjust the controller parameters, such as increasing the proportional gain until unstable oscillation caused by noise signal is observed and then reducing it slightly. The integral controller is then used to remove the steady state error while ensuring stability, and the derivative controller helps to improve transient tracking. During AFM imaging, each single line is typically scanned twice from two directions, which are called trace and retrace. The topography obtained in trace and retrace should ideally be identical, and the user manually adjusts the controller parameter to minimize the difference between the trace and retrace scans. [p. 41, fangzhou-xia/Xia et al. - Active Probe Atomic Force Microscopy.pdf]

In [12]:
#@title Save the Plot as an HTML File
import re

# The figure must have been bound to `fig` by the plotting cell. Fail with an
# actionable message instead of a bare NameError when it is missing.
plot_figure = globals().get("fig")
if plot_figure is None:
  raise NameError("`fig` is not defined: run the plotting cell and bind the plotly figure to `fig` before saving.")

# The query is free text; replace characters that are invalid in file names
# (a "/" would otherwise be treated as a directory separator).
safe_query = re.sub(r'[\\/:*?"<>|]+', "_", search_query).strip()
file_out = f"{pdf_folder}-{safe_query}"

plot_figure.write_html(f"{file_out}.html")  # written relative to the working directory
files.download(f"{file_out}.html")  # trigger a browser download of the saved file

NameError: ignored