In [151]:
# Seed the process environment with the settings the rest of the notebook reads.
# Every value below is a placeholder: replace it with a real value before running,
# and never commit real secrets in this cell.
import os

os.environ.update({
    'AWS_ACCESS_KEY': 'YOUR_AWS_ACCESS_KEY',
    'AWS_SECRET_KEY': 'YOUR_AWS_SECRET_KEY',
    'S3_ENDPOINT': 'YOUR_S3_ENDPOINT',
    'VECTARA_CUSTOMER_ID': 'YOUR_VECTARA_CUSTOMER_ID',
    'VECTARA_API_KEY': 'YOUR_VECTARA_API_KEY',
    'VECTARA_CORPUS_ID': 'YOUR_VECTARA_CORPUS_ID',
    'OPENAI_API_KEY': 'YOUR_OPENAI_API_KEY',
})

In [2]:
# Install PyPDF2 if it's not already installed
!pip install PyPDF2

# Import necessary libraries
import os
import boto3
from botocore.client import Config
import urllib3
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Vectara
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import json
from io import BytesIO
from PyPDF2 import PdfReader

# Disable SSL warnings.
# Only the InsecureRequestWarning is silenced (the S3 client in this notebook is
# created with verify=False); other urllib3 warnings stay visible.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)



In [3]:
# Read the S3 connection settings from the environment.
# A variable that is not set yields None rather than raising.
access_key, secret_key, endpoint = (
    os.environ.get(name)
    for name in ('AWS_ACCESS_KEY', 'AWS_SECRET_KEY', 'S3_ENDPOINT')
)

# Read the Vectara account settings the same way.
vectara_customer_id, vectara_api_key, vectara_corpus_id = (
    os.environ.get(name)
    for name in ("VECTARA_CUSTOMER_ID", "VECTARA_API_KEY", "VECTARA_CORPUS_ID")
)


In [4]:
# Define S3ObjectLoader class
class S3ObjectLoader:
    """Load every object of an S3-compatible bucket as ``(key, text)`` pairs.

    Supported object types are chosen by file extension (case-insensitive):

    * ``.txt``  - decoded as UTF-8, undecodable bytes are dropped.
    * ``.json`` - parsed and re-serialised with a 4-space indent.
    * ``.pdf``  - only the first ``max_pdf_lines`` extracted text lines are kept.

    Any other extension yields the placeholder string
    ``"Unsupported file type for <key>"`` instead of the object's content.
    """

    # Class-level default so the PDF limit is defined even on instances that
    # were created without going through __init__.
    max_pdf_lines = 20

    def __init__(self, bucket_name, access_key, secret_key, endpoint,
                 verify=False, max_pdf_lines=20):
        """Create the S3 client used by :meth:`load`.

        Args:
            bucket_name: Name of the bucket to read.
            access_key: Access key id passed to boto3.
            secret_key: Secret access key passed to boto3.
            endpoint: Endpoint URL of the S3-compatible service.
            verify: Forwarded to boto3's ``verify`` argument. Defaults to
                ``False`` (TLS certificate verification disabled) to keep the
                previous behaviour; pass ``True`` or a CA bundle path to enable it.
            max_pdf_lines: Maximum number of text lines kept from each PDF.
        """
        self.bucket_name = bucket_name
        self.max_pdf_lines = max_pdf_lines
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            endpoint_url=endpoint,
            verify=verify,  # False disables SSL verification
            config=Config(signature_version='s3v4')
        )

    def load(self):
        """Download and convert every object in the bucket.

        Returns:
            A list of ``(file_key, file_content)`` tuples in listing order.

        Raises:
            RuntimeError: If listing, downloading or converting any object
                fails; the original exception is attached as ``__cause__``.
        """
        try:
            documents = []
            # A single list_objects_v2 call returns one page of keys only, so
            # walk all pages through the paginator to avoid truncating large buckets.
            paginator = self.s3_client.get_paginator('list_objects_v2')

            for page in paginator.paginate(Bucket=self.bucket_name):
                # An empty bucket (or empty page) has no 'Contents' entry.
                for obj in page.get('Contents', []):
                    file_key = obj['Key']
                    obj_data = self.s3_client.get_object(Bucket=self.bucket_name, Key=file_key)
                    file_content = self._read_content(file_key, obj_data['Body'].read())
                    documents.append((file_key, file_content))

            return documents
        except Exception as e:
            raise RuntimeError(f"Failed to load documents from S3: {e}") from e

    def _read_content(self, file_key, content):
        """Convert the raw bytes of one object to text based on its extension.

        Args:
            file_key: Object key; its extension selects the conversion.
            content: Raw object bytes.

        Returns:
            The converted text, or ``"Unsupported file type for <key>"`` when
            the extension is not handled.
        """
        # Compare on a lower-cased key so '.PDF' / '.Json' are recognised too.
        lowered_key = file_key.lower()

        if lowered_key.endswith('.txt'):
            return content.decode('utf-8', 'ignore')
        elif lowered_key.endswith('.json'):
            return json.dumps(json.loads(content), indent=4)
        elif lowered_key.endswith('.pdf'):
            limit = self.max_pdf_lines
            reader = PdfReader(BytesIO(content))
            text_lines = []

            for page in reader.pages:
                # Guard against a page whose text extraction yields nothing.
                text = page.extract_text() or ''
                text_lines.extend(text.split('\n'))

                # Stop reading further pages once enough lines were collected.
                if len(text_lines) >= limit:
                    break

            return '\n'.join(text_lines[:limit])
        else:
            return f"Unsupported file type for {file_key}"

In [5]:
# Example usage: load every object from the bucket and print a short preview of each.
bucket_name = 'YOUR_BUCKET_NAME'

# Number of lines shown per file, so the output matches the header printed below
# instead of dumping whole text/JSON files into the notebook.
PREVIEW_LINES = 20

loader = S3ObjectLoader(bucket_name, access_key, secret_key, endpoint)

try:
    documents = loader.load()
    for file_key, content in documents:
        preview = '\n'.join(content.split('\n')[:PREVIEW_LINES])
        print(f"File: {file_key}\nContent (first {PREVIEW_LINES} lines):")
        print(preview)  # Print the limited content
        print('\n---\n')
except Exception as e:
    # Report the failure, then re-raise so the cell is visibly marked as failed.
    print(f"Failed to load documents: {e}")
    raise

File: data-00000-of-00001.arrow
Content (first 20 lines):
Unsupported file type for data-00000-of-00001.arrow

---

File: dataset_info.json
Content (first 20 lines):
{
    "builder_name": "parquet",
    "citation": "",
    "config_name": "fr-en",
    "dataset_name": "wmt14",
    "dataset_size": 14754105540,
    "description": "",
    "download_checksums": {
        "hf://datasets/wmt14@b199e406369ec1b7634206d3ded5ba45de2fe696/fr-en/train-00000-of-00030.parquet": {
            "num_bytes": 252421180,
            "checksum": null
        },
        "hf://datasets/wmt14@b199e406369ec1b7634206d3ded5ba45de2fe696/fr-en/train-00001-of-00030.parquet": {
            "num_bytes": 240967470,
            "checksum": null
        },
        "hf://datasets/wmt14@b199e406369ec1b7634206d3ded5ba45de2fe696/fr-en/train-00002-of-00030.parquet": {
            "num_bytes": 243304904,
            "checksum": null
        },
        "hf://datasets/wmt14@b199e406369ec1b7634206d3ded5ba45de2fe696/fr-en/train-000

In [6]:
# Build the chat model used to answer questions.
# Temperature 0.0 is passed so the model is asked for its least random output.
llm_settings = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.0,
    "openai_api_key": os.getenv("OPENAI_API_KEY"),
}
llm = ChatOpenAI(**llm_settings)

In [7]:
# 'documents' is a list of (file_key, content) tuples produced by the loader.
# The loader returns this placeholder text for files it cannot read; such entries
# (and blank content) are skipped so they are not indexed as real documents.
UNSUPPORTED_PREFIX = "Unsupported file type for "

# Convert the remaining entries to Document objects, keeping the S3 key as
# 'source' metadata so each document can be traced back to its file.
document_objects = [
    Document(page_content=content, metadata={"source": file_key})
    for file_key, content in documents
    if content.strip() and not content.startswith(UNSUPPORTED_PREFIX)
]


# Create Vectara vector store from documents
try:
    # embedding=None: no embedding object is supplied to the Vectara store.
    vectara = Vectara.from_documents(document_objects, embedding=None, vectara_customer_id=vectara_customer_id, 
                                     vectara_api_key=vectara_api_key, vectara_corpus_id=vectara_corpus_id)
    retriever = vectara.as_retriever()
    print("Retriever created successfully.")
except Exception as e:
    # Report the failure, then re-raise so the cell is visibly marked as failed.
    print(f"Failed to create Vectara vector store or retriever: {e}")
    raise

Retriever created successfully.


In [8]:
# Conversation memory: stored under the "chat_history" key as message objects.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Combine the chat model, the Vectara retriever and the memory into one chain.
bot = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    verbose=False,
)

In [None]:
# Send one question to the bot and print the answer it returns.
# Replace the placeholder with a real question before running.
question = "YOUR_QUESTION"
result = bot.invoke({"question": question})
print(result["answer"])

In [11]:
# Ask the bot a series of English and French questions, printing each answer in order.
questions = [
    "What is the name of the game played by wizards on flying broomsticks in the Harry Potter series?",
    "Qui est l'auteur de Harry Potter",
    "Iq level of monkey",
    "Niveau de QI du singe in english",
    "What is the annual revenue of Paris's fashion industry",
    "What is the annual revenue of Paris's fashion industry in english",
    "Quelle est la hauteur de la Tour Eiffel ?",
    "what is the height of Eiffel Tower?",
]

for question in questions:
    result = bot.invoke({"question": question})
    print(result["answer"])

The game played by wizards on flying broomsticks in the Harry Potter series is called Quidditch.
L'auteur de la série Harry Potter est J.K. Rowling.
The IQ level of monkeys is 174.
The IQ level of monkeys is 174.
The annual revenue of Paris's fashion industry exceeds 150 billion euros.
The annual revenue of Paris's fashion industry is over 150 billion euros.
La Tour Eiffel mesure 324 mètres de hauteur.
The Eiffel Tower is 324 meters tall.
