-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
139 lines (94 loc) · 3.61 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from dotenv import load_dotenv
import os
import argparse
from docx import Document
import pdfplumber
from textwrap import wrap
import json
import re
import requests
import pinecone
import openai
import numpy as np
# Load API keys from a local .env file into the process environment.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = openai_api_key
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
# NOTE(review): Pinecone environment is hard-coded; assumes the index lives
# in us-east1-gcp — confirm against the Pinecone project settings.
pinecone.init(api_key=pinecone_api_key, environment="us-east1-gcp")
def create_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for *text* from the given OpenAI model.

    Newlines are flattened to spaces before embedding, per OpenAI's
    recommendation for the ada-002 model.
    """
    cleaned = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[cleaned], model=model)
    return response["data"][0]["embedding"]
def read_txt(file_path):
    """Return the full contents of a plain-text file as a string.

    The encoding is pinned to UTF-8 so the result does not depend on the
    platform's locale default (the original used the locale encoding, which
    breaks on non-ASCII documents under e.g. Windows cp1252).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()
def read_doc(file_path):
    """Return the text of a Word document, paragraphs joined by newlines."""
    document = Document(file_path)
    paragraph_texts = (para.text for para in document.paragraphs)
    return '\n'.join(paragraph_texts)
def read_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Pages with no extractable text (e.g. scanned images) are skipped, matching
    the original's ``if page_text`` guard. Uses ``str.join`` instead of
    repeated ``+=`` concatenation, which is quadratic in the worst case.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        return ''.join(text for text in page_texts if text)
def main():
    """Embed a document into Pinecone, then answer one user question about it.

    Reads a .txt/.doc/.docx/.pdf file from the ``working`` folder, splits the
    text into 2000-character chunks, stores each chunk's embedding in the
    ``document-chunks`` Pinecone index, then uses the chunk nearest to the
    user's question as context for a GPT answer.
    """
    parser = argparse.ArgumentParser(description='Read text from different file formats')
    parser.add_argument('file', help='Path to the file to read text from')
    args = parser.parse_args()

    input_folder = "working"
    file_path = os.path.join(input_folder, args.file)
    file_ext = os.path.splitext(file_path)[1].lower()

    # Dispatch on extension; unsupported formats abort early.
    readers = {'.txt': read_txt, '.doc': read_doc, '.docx': read_doc, '.pdf': read_pdf}
    reader = readers.get(file_ext)
    if reader is None:
        print(f"Error: The file format {file_ext} is not supported.")
        return
    text = reader(file_path)

    chunks = wrap(text, 2000)
    if not chunks:
        # Guard: an empty document would crash on embeddings[0] below.
        print("Error: no text could be extracted from the file.")
        return
    embeddings = [create_embedding(chunk) for chunk in chunks]

    # Pinecone was initialised at module load; the original re-called
    # pinecone.init here redundantly.
    index_name = "document-chunks"
    if index_name not in pinecone.list_indexes():
        # Bug fix: the original converted `embeddings` to an np.array only
        # inside this branch, so the upserted payload type depended on whether
        # the index already existed. Take the dimension directly instead.
        pinecone.create_index(index_name, metric="cosine", dimension=len(embeddings[0]))
    index = pinecone.Index(index_name)
    upserts = [(f"chunk-{i}", embedding) for i, embedding in enumerate(embeddings)]
    index.upsert(vectors=upserts)

    question = input("Enter your question: ")
    # The nearest ID has the form "chunk-<i>"; recover the list index from it.
    nearest_chunk_id = search(question)
    chunk_index = int(nearest_chunk_id.split("-")[-1])
    nearest_chunk_text = chunks[chunk_index]

    # Use GPT to answer the question grounded in the retrieved chunk.
    prompt = f"The following text contains the information you are looking for:\n{nearest_chunk_text}\n\nQuestion: {question}\nAnswer:"
    answer = gpt3_completion(prompt)
    print("Answer:", answer)
def search(query, index_name="document-chunks"):
    """Return the ID of the stored chunk whose embedding is nearest to *query*.

    Embeds the query text and runs a top-1 similarity search against
    *index_name*. Pinecone is already initialised at module load, so the
    original's redundant ``pinecone.init`` call here is dropped.
    """
    embedding = create_embedding(query)
    index = pinecone.Index(index_name)
    results = index.query(queries=[embedding], top_k=1)
    return results["results"][0]["matches"][0]["id"]
def gpt3_completion(prompt):
    """Return the model's answer text for *prompt* via the chat API.

    Bug fix: the original returned ``completion.choices[0].message`` — the
    whole message object — so ``main`` printed its repr instead of the answer.
    Return just the assistant's text content.
    """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # message is dict-like in the pre-1.0 openai client; extract the text.
    return completion.choices[0].message["content"]
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()