Skip to content

Commit

Permalink
Merge pull request #60 from michaelthwan/semantic_openai_native
Browse files Browse the repository at this point in the history
Semantic openai native (de-faiss) / Model-based citation footnote
  • Loading branch information
michaelthwan committed Mar 9, 2023
2 parents eda372b + 34e22e1 commit afab651
Show file tree
Hide file tree
Showing 15 changed files with 449 additions and 377 deletions.
103 changes: 103 additions & 0 deletions playground/test_OpenAI_Embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import os

import openai
import pandas as pd
import yaml
from openai.embeddings_utils import get_embedding, cosine_similarity

from Util import get_project_root

BASE_MODEL = "text-embedding-ada-002" # default embedding of faiss-openai


def search_using_cosine_similarity(df, query):
    """Rank the rows of *df* by cosine similarity to *query*.

    The query is embedded with ``BASE_MODEL`` and compared against every
    vector in ``df['embeddings']``. A ``similarity`` column is written onto
    *df* in place.

    As a side effect, the module-level ``sources`` list is rebuilt with one
    ``{'Page <page>': '<first 150 chars of text>...'}`` dict per returned
    row, and that list is printed.

    Args:
        df: DataFrame providing ``embeddings``, ``page`` and ``text`` columns.
        query: Text to search for.

    Returns:
        DataFrame holding at most the 5 most similar rows, best match first,
        with a fresh 0-based index.
    """
    global sources

    k = 5
    query_embedding = get_embedding(query, engine=BASE_MODEL)
    df["similarity"] = df['embeddings'].apply(lambda x: cosine_similarity(x, query_embedding))

    results = df.sort_values("similarity", ascending=False, ignore_index=True).head(k)

    # Walk the rows that were actually returned: indexing with range(k)
    # raised IndexError whenever df held fewer than k rows.
    sources = [
        {'Page ' + str(page): text[:150] + '...'}
        for page, text in zip(results['page'], results['text'])
    ]
    print(sources)
    return results


def compute_embeddings(text, model="text-embedding-ada-002"):
    """Embed one piece of text via ``openai.Embedding.create``.

    The incoming text is echoed to stdout, its newlines are replaced by
    spaces, and it is submitted as a one-element input list.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data``.
    """
    print(f'compute_embeddings() text: {text}')
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']


def search_similar(df: pd.DataFrame, target_text, n=3, pprint=True):
    """Find the *n* rows of *df* whose embedding is closest to *target_text*.

    The target text is embedded with ``BASE_MODEL`` through
    ``compute_embeddings``; each vector in ``df['embedding']`` is then scored
    against it with ``cosine_similarity`` and the score is stored in a
    ``similarities`` column written onto *df* in place.

    Args:
        df: DataFrame with an ``embedding`` column.
        target_text: Query text to compare against.
        n: Number of top rows to return.
        pprint: Accepted but not used by this function.

    Returns:
        A ``(top_rows, df)`` tuple: the *n* highest-scoring rows in descending
        score order, and the input frame with the ``similarities`` column.
    """
    print(f'search_similar() text: {target_text}')
    target_vector = compute_embeddings(target_text, model=BASE_MODEL)

    def _score(row_vector):
        return cosine_similarity(row_vector, target_vector)

    df['similarities'] = df['embedding'].apply(_score)
    ranked = df.sort_values('similarities', ascending=False)
    return ranked.head(n), df


def compute_embeddings_2(text_df, model=BASE_MODEL, chunk_size=1000):
    """Embed every row of ``text_df['text']`` using batched API calls.

    Newlines in the ``text`` column are replaced by spaces (in place), the
    texts are sent to ``openai.Embedding.create`` in slices of *chunk_size*,
    and the resulting vectors are stored in a new ``embedding`` column.

    Args:
        text_df: DataFrame with a ``text`` column of strings; it is modified
            in place.
        model: Embedding model name, forwarded as the ``engine`` argument.
        chunk_size: Maximum number of texts submitted per API request.

    Returns:
        The same ``text_df`` object, now carrying the ``embedding`` column.
    """
    # Was len(df): an undefined name that raised NameError on every call.
    print(f'compute_embeddings_2() len(texts): {len(text_df)}')
    text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " "))

    # Embed the frame's own (cleaned) texts. The previous code read a global
    # `texts` list, which only existed when this file ran as a script and
    # ignored the newline normalisation above.
    texts = text_df['text'].tolist()

    embeddings = []
    for start in range(0, len(texts), chunk_size):
        response = openai.Embedding.create(
            input=texts[start: start + chunk_size], engine=model
        )
        embeddings.extend(r["embedding"] for r in response["data"])
    text_df['embedding'] = embeddings
    return text_df


if __name__ == '__main__':
    # Profiling playground: embeds a tiny corpus one text per request and then
    # in a single batched request, running the same similarity search after
    # each so the two pyinstrument reports can be compared.

    # Alternative corpus:
    # text_df = pd.read_csv(os.path.join(get_project_root(), 'src/text_df.csv'))
    texts = [
        # Bean-related sentences.
        "Discover the world of delicious beans with our premium selection.",
        "Try our savory bean soup recipe for a delicious and nutritious meal.",
        "Our roasted coffee beans are carefully selected for their rich and delicious flavor.",
        "Beans are not only delicious, but also a great source of protein and dietary fiber.",
        "Looking for a delicious vegan meal? Try our spicy black bean burger recipe.",

        # Unrelated sentences.
        "The sky is blue and the sun is shining today.",
        "I need to go grocery shopping after work to pick up some milk and bread.",
        "Did you hear about the new movie that just came out? It's supposed to be really good.",
        "I'm planning a trip to Europe next summer and I'm so excited.",
        "My cat keeps meowing at me for no reason and it's driving me crazy.",
    ]
    text_df = pd.DataFrame({'text': texts, 'docno': range(len(texts))})
    print(text_df.shape)

    # NOTE(review): yaml.FullLoader is acceptable for a trusted local config;
    # consider yaml.safe_load if this file could ever come from elsewhere.
    with open(os.path.join(get_project_root(), 'src/config/config.yaml')) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    openai.api_key = config.get('openai_api').get('api_key')

    # Very long query built from one repeated sentence. The original literal
    # was a single-quoted string broken across several physical lines, which
    # does not parse; it is rebuilt here as an expression.
    # NOTE(review): the repetition count was reconstructed from the wrapped
    # literal -- confirm it if the exact query length matters.
    # A short query such as 'delicious beans' can be substituted here.
    search_text = 'Discover the world of delicious beans with our premium selection ' * 92

    # Imported here so the profiler is only required when running this script.
    from pyinstrument import Profiler

    profiler = Profiler()
    profiler.start()
    print("Sequential call mode:")
    text_df['embedding'] = text_df['text'].apply(lambda x: compute_embeddings(x, model=BASE_MODEL))
    res, text_df = search_similar(text_df, search_text, n=3)
    print(res)
    profiler.stop()
    profiler.print()

    profiler = Profiler()
    profiler.start()
    print("Batch call mode:")
    text_df = compute_embeddings_2(text_df)
    res, text_df = search_similar(text_df, search_text, n=3)
    print(res)
    profiler.stop()
    profiler.print()
14 changes: 5 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ Flask==2.2.3
requests==2.28.2
gunicorn==20.1.0

# nlp
# embedding
matplotlib==3.7.1
plotly==5.13.1
scipy==1.10.1
scikit-learn==1.2.1

# doc extraction
python-docx==0.8.11
Expand All @@ -18,11 +22,3 @@ python-pptx==0.6.21
# html extraction
beautifulsoup4==4.11.2
trafilatura==1.4.1

# doc indexer1
# python-terrier==0.9.2

# doc indexer2
faiss-cpu==1.7.3
langchain==0.0.95

154 changes: 0 additions & 154 deletions src/FootnoteService.py

This file was deleted.

Loading

0 comments on commit afab651

Please sign in to comment.