# Cell 1: Environment Setup & Dependency Version Checks
# Standard library for interacting with the operating system, like getting environment variables.
import os
# Used to load environment variables from a .env file into os.environ.
from dotenv import load_dotenv
# Provides access to system-specific parameters and functions, like the Python interpreter path.
import sys
# For generating universally unique identifiers, used for Qdrant point IDs.
import uuid

# Environment diagnostics: print which Python interpreter is running this notebook,
# followed by the versions of the TensorFlow/Keras packages installed in it.
print(sys.executable)

# Import TensorFlow only to report its version.
# NOTE(review): nothing in the visible code calls `tensorflow` directly; presumably it is
# imported here to surface a broken or mismatched install early — confirm it is still needed.
import tensorflow
print(tensorflow.__version__)

# Import Keras only to report its version.
import keras
print(keras.__version__)

# Import tf_keras only to report its version.
# NOTE(review): presumably installed for libraries that still expect the Keras 2 API
# while Keras 3 is present — confirm against the project's requirements.
import tf_keras
print(tf_keras.__version__)

# Merge any KEY=value pairs from a local .env file into the process environment,
# then read the OpenAI key from the environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Report the outcome. A missing or empty key is only warned about here;
# the CrewAI cells check OPENAI_API_KEY again before building the LLM.
if OPENAI_API_KEY:
    print("OPENAI_API_KEY loaded successfully (from environment or .env file).")
else:
    for warning_line in (
        "WARNING: OPENAI_API_KEY not found in environment variables or .env file.",
        "CrewAI agents (Member B's part) will not function correctly without it.",
        "Please create a .env file in the root directory with: OPENAI_API_KEY='sk-your_key_here'",
    ):
        print(warning_line)


# Cell 2: Data Ingestion & NLTK Setup Imports

# Pandas for data manipulation and analysis, especially for working with CSV files and DataFrames.
import pandas as pd
# Natural Language Toolkit for text processing tasks like tokenization.
import nltk
# Regular expression operations for text pattern matching and manipulation.
import re
# BeautifulSoup for parsing HTML and XML documents, used here to clean text.
from bs4 import BeautifulSoup
# Datetime module for working with dates and times, used for timestamping.
from datetime import datetime
# JSON library for working with JSON data.
import json
# os was already imported but often re-listed if a cell is standalone.

# Function to download NLTK resources if they are not already present.
def download_nltk_resource(resource_name, resource_path):
    """Ensure an NLTK resource is available locally, downloading it if missing.

    Args:
        resource_name: Identifier passed to ``nltk.download`` (e.g. ``'punkt'``).
        resource_path: Lookup path passed to ``nltk.data.find``
            (e.g. ``'tokenizers/punkt'``).

    Returns:
        None. Progress and failures are reported with ``print``; no exception
        is propagated to the caller.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource is missing; fall through to the download step below.
        print(f"NLTK '{resource_name}' resource not found. Downloading...")
    except Exception as e:
        print(f"Error checking/downloading NLTK '{resource_name}': {e}")
        return
    else:
        print(f"NLTK '{resource_name}' resource found.")
        return

    # The download gets its own try block: an exception raised inside an
    # `except` clause is not caught by sibling clauses of the same `try`.
    try:
        download_ok = nltk.download(resource_name, quiet=True)  # quiet=True suppresses verbose output.
    except Exception as e:
        print(f"Error checking/downloading NLTK '{resource_name}': {e}")
        return

    # nltk.download reports failure by returning False instead of raising,
    # so only claim success when it did not signal a failure.
    if download_ok is False:
        print(f"Error checking/downloading NLTK '{resource_name}': download did not complete.")
    else:
        print(f"NLTK '{resource_name}' downloaded.")

# Make sure the 'punkt' sentence-tokenizer models are present.
download_nltk_resource(resource_name='punkt', resource_path='tokenizers/punkt')
# Also fetch 'punkt_tab'.
# NOTE(review): newer NLTK releases appear to load sentence-tokenizer data from
# 'punkt_tab' rather than 'punkt' — confirm against the installed NLTK version.
download_nltk_resource(resource_name='punkt_tab', resource_path='tokenizers/punkt_tab')


# Cell 3: Load Dataset

# Location of the reviews CSV, relative to the working directory.
dataset_path = "tripadvisor_hotel_reviews.csv"
# Start from an empty frame so later cells can test `df.empty` even if loading fails.
df = pd.DataFrame()

if os.path.exists(dataset_path):
    # File is present: read it and, when it has rows, print a short overview.
    try:
        df = pd.read_csv(dataset_path)
        print(f"Dataset '{dataset_path}' loaded successfully. Shape: {df.shape}")
        if not df.empty:
            print("Dataset Info:")
            df.info()  # Column dtypes and non-null counts.
            print("\nFirst 5 rows of the dataset:")
            print(df.head())
    except Exception as e:
        # Any read/parse failure is reported; `df` keeps whatever was assigned so far.
        print(f"Error loading dataset '{dataset_path}': {e}")
else:
    # File is missing: tell the user where to obtain it.
    print(f"ERROR: Dataset file not found at {dataset_path}")
    print("Please download it from Kaggle (e.g., https://www.kaggle.com/datasets/andrewmvd/trip-advisor-hotel-reviews) and place it in the same directory as this notebook.")


# Cell 4: Preprocessing - Clean and Chunk Text

# Function to clean text data.
def clean_text(text):
    """Strip HTML and unwanted characters from a review and normalise whitespace.

    Args:
        text: Raw review text. Any non-string value is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    # Drop HTML markup, keeping only the text content.
    text = BeautifulSoup(text, "html.parser").get_text()
    # Keep letters, digits, whitespace and basic punctuation (, . ! ? ' ").
    # In a raw string a regex escape needs a single backslash: the previous
    # r'\\s' allowed a literal backslash and 's' instead of whitespace, so
    # every space was deleted and words were fused together.
    text = re.sub(r"[^A-Za-z0-9\s,.!?'\"]", "", text)
    # Collapse runs of whitespace to one space and trim both ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Add a 'cleaned_review' column, but only when there is data and a 'Review' column to read.
if df.empty:
    print("DataFrame is empty, skipping review cleaning.")
elif 'Review' not in df.columns:
    print("Column 'Review' not found in DataFrame, skipping review cleaning.")
else:
    print("\nCleaning reviews...")
    # Run clean_text over every review.
    df['cleaned_review'] = df['Review'].apply(clean_text)
    print("Cleaned reviews (sample of original vs cleaned):")
    # Side-by-side sample of raw and cleaned text.
    print(df[['Review', 'cleaned_review']].head())

# Function to chunk text into smaller pieces, respecting a maximum token limit.
def chunk_text(text, max_tokens=450):
    """Split text into chunks of at most ``max_tokens`` NLTK word tokens.

    Sentences (from ``nltk.sent_tokenize``) are packed greedily, in order,
    into chunks whose combined ``nltk.word_tokenize`` count does not exceed
    ``max_tokens``. A single sentence longer than ``max_tokens`` is cut into
    consecutive runs of ``max_tokens`` tokens, each re-joined with single
    spaces.

    Args:
        text: Text to split.
        max_tokens: Upper bound on word tokens per chunk; must be >= 1.

    Returns:
        List of non-blank chunk strings in document order. Empty when
        ``text`` is not a string or contains only whitespace.

    Raises:
        ValueError: If ``text`` is non-blank and ``max_tokens`` is less than 1.
            Such values previously made the oversized-sentence splitter loop
            forever.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    chunks = []
    current_chunk_sentences = []  # Sentences collected for the chunk being built.
    current_token_count = 0       # Word tokens in current_chunk_sentences.

    for sentence in nltk.sent_tokenize(text):
        sentence_tokens = nltk.word_tokenize(sentence)
        token_count_for_sentence = len(sentence_tokens)

        if token_count_for_sentence > max_tokens:
            # Oversized sentence: flush what we have, then emit the sentence
            # as fixed-size token runs of its own.
            if current_chunk_sentences:
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = []
                current_token_count = 0
            for start in range(0, token_count_for_sentence, max_tokens):
                chunks.append(" ".join(sentence_tokens[start:start + max_tokens]))
            continue

        if current_token_count + token_count_for_sentence > max_tokens:
            # The sentence does not fit: close the current chunk and start a new one.
            if current_chunk_sentences:
                chunks.append(" ".join(current_chunk_sentences))
            current_chunk_sentences = []
            current_token_count = 0

        current_chunk_sentences.append(sentence)
        current_token_count += token_count_for_sentence

    # Flush the final, partially filled chunk.
    if current_chunk_sentences:
        chunks.append(" ".join(current_chunk_sentences))

    return [chunk for chunk in chunks if chunk.strip()]


# Cell 5: Create Documents with Metadata & Save

# Chunk records built from the cleaned reviews; the indexing cell embeds this list.
documents_for_indexing = []

if df.empty or 'cleaned_review' not in df.columns or 'Rating' not in df.columns:
    # No usable input: report it and leave the list empty.
    print("\nSkipping document creation: DataFrame is empty or required columns ('cleaned_review', 'Rating') are missing.")
    documents_for_indexing = []
else:
    print("\nCreating document chunks with metadata...")
    for idx, row in df.iterrows():
        # A rating that cannot be read as an integer is stored as 0.
        try:
            rating_val = int(row["Rating"])
        except (ValueError, TypeError):
            rating_val = 0

        # One record per non-blank chunk. `i` is the chunk's position within its review
        # and `idx` (the DataFrame index) identifies the review it came from.
        for i, chunk_content in enumerate(chunk_text(row['cleaned_review'])):
            if not chunk_content.strip():
                continue

            chunk_metadata = {
                "original_review_id": idx,
                "source": f"tripadvisor_review_{idx}",
                "chunk_sequential_id_in_review": i,
                "rating": rating_val,
                "timestamp_processed": datetime.now().isoformat(),
                "category": "hotel_review",
            }
            documents_for_indexing.append({
                "id": str(uuid.uuid4()),  # Random UUID, later used as the Qdrant point id.
                "content": chunk_content,
                "metadata": chunk_metadata,
            })

    print(f"Created {len(documents_for_indexing)} document chunks for indexing.")

    if documents_for_indexing:
        # Show one record, then persist everything as JSON (nested) and CSV (flattened).
        print("Example document structure:")
        print(json.dumps(documents_for_indexing[0], indent=2))

        processed_json_path = "processed_docs.json"
        processed_csv_path = "processed_docs.csv"

        print(f"\nSaving processed documents to {processed_json_path} and {processed_csv_path}...")
        try:
            with open(processed_json_path, "w", encoding="utf-8") as f:
                json.dump(documents_for_indexing, f, indent=2)
            print(f"Successfully saved to {processed_json_path}")

            # Flatten each record: id and content first, then the metadata fields as columns.
            df_to_save_data = [
                {"id": doc["id"], "content": doc["content"], **doc["metadata"]}
                for doc in documents_for_indexing
            ]
            pd.DataFrame(df_to_save_data).to_csv(processed_csv_path, index=False, encoding="utf-8")
            print(f"Successfully saved to {processed_csv_path}")
        except Exception as e:
            print(f"Error saving processed documents: {e}")


# Cell 6: Vector Indexing & Retrieval Setup

# For creating text embeddings.
from sentence_transformers import SentenceTransformer
# Qdrant client for vector database operations.
from qdrant_client import QdrantClient, models as qdrant_models 

# Name of the Qdrant collection that stores the review-chunk vectors.
qdrant_collection_name = "travel_guide_rag_collection_v2"
# Both handles start as None and are reset to None if indexing fails, so that
# `search_vector_db` can detect that the index is unavailable.
qdrant_client_instance = None 
embedding_model_instance = None 

# Index only when the previous cell produced at least one document chunk.
if documents_for_indexing:
    print("\nSetting up vector indexing...")
    # Sentence-transformers model used here for the chunks and, in `search_vector_db`, for queries.
    model_name_for_embedding = "all-MiniLM-L6-v2"
    try:
        # Load the embedding model.
        embedding_model_instance = SentenceTransformer(model_name_for_embedding)
        print(f"Embedding model '{model_name_for_embedding}' loaded.")

        # Create a Qdrant client with the ":memory:" location, i.e. a new store on every run.
        qdrant_client_instance = QdrantClient(":memory:") 
        print("Qdrant client initialized (in-memory).")

        # Ask the model for its embedding width; the collection must be created with the same size.
        vector_size = embedding_model_instance.get_sentence_embedding_dimension()
        print(f"Detected vector size for embeddings: {vector_size}")

        # Create the collection with cosine distance.
        # NOTE(review): `recreate_collection` is reported as deprecated in newer
        # qdrant-client releases — confirm against the installed version before upgrading.
        qdrant_client_instance.recreate_collection(
            collection_name=qdrant_collection_name,
            vectors_config=qdrant_models.VectorParams(size=vector_size, distance=qdrant_models.Distance.COSINE) # Using Cosine similarity.
        )
        print(f"Qdrant collection '{qdrant_collection_name}' created/recreated.")
    
        # Embed every chunk's text in one call; embeddings[i] corresponds to documents_for_indexing[i].
        content_list = [doc["content"] for doc in documents_for_indexing]
        print(f"Generating embeddings for {len(content_list)} content chunks... (This may take a while)")
        embeddings = embedding_model_instance.encode(content_list, show_progress_bar=True)
        print("Embeddings generated.")
        
        # Build one Qdrant point per chunk: the chunk's UUID string as id, its embedding as
        # a plain list, and a payload holding the text ("text_content") plus the metadata fields.
        points_to_upsert = [
            qdrant_models.PointStruct(
                id=doc["id"], # UUID string generated when the chunk record was created.
                vector=embeddings[i].tolist(), # Embedding for this chunk.
                payload={"text_content": doc["content"], **doc["metadata"]} # Text and flattened metadata.
            )
            for i, doc in enumerate(documents_for_indexing)
        ]
        
        # Upload all points in a single upsert, then read back the collection's point count.
        if points_to_upsert:
            print(f"Upserting {len(points_to_upsert)} points to Qdrant collection '{qdrant_collection_name}'...")
            qdrant_client_instance.upsert(collection_name=qdrant_collection_name, points=points_to_upsert, wait=True)
            print("Upsert complete.")
            collection_info = qdrant_client_instance.get_collection(collection_name=qdrant_collection_name)
            print(f"Collection '{qdrant_collection_name}' now contains {collection_info.points_count} points.")
        else:
            print("No points generated to upsert into Qdrant.")
            
    except Exception as e:
        # Any failure above disables retrieval: report it and clear both handles.
        print(f"Error during vector indexing setup or embedding process: {e}")
        qdrant_client_instance = None 
        embedding_model_instance = None
else:
    print("\nSkipping vector indexing as no documents were processed or loaded for indexing.")


# Cell 7: Retrieval Function

# Function to search the vector database.
def search_vector_db(query_text: str, top_k: int = 5) -> list[dict]:
    """Return up to ``top_k`` indexed chunks most similar to ``query_text``.

    Reads the module-level ``qdrant_client_instance``,
    ``embedding_model_instance`` and ``qdrant_collection_name``.

    Args:
        query_text: Non-empty query string.
        top_k: Maximum number of hits to request from Qdrant.

    Returns:
        A list of dicts with keys ``id``, ``score``, ``content`` and
        ``metadata``. An empty list is returned (after printing an error)
        when the components are not initialised, the query is invalid, or
        encoding/search raises.
    """
    def _as_result(hit) -> dict:
        # Split the stored payload into the chunk text and the remaining metadata fields.
        stored = hit.payload or {}
        return {
            "id": str(hit.id),
            "score": float(hit.score),
            "content": stored.get("text_content", ""),
            "metadata": {key: value for key, value in stored.items() if key != "text_content"},
        }

    # Guard: both the client and the model must have been set up by the indexing cell.
    if not (qdrant_client_instance and embedding_model_instance):
        print("ERROR in search_vector_db: Qdrant client or embedding model is not initialized. Cannot perform search.")
        return []
    # Guard: the query must be a non-empty string.
    if not (query_text and isinstance(query_text, str)):
        print("ERROR in search_vector_db: Query text is invalid.")
        return []

    # Embed the query with the same model used for the indexed chunks.
    try:
        query_embedding = embedding_model_instance.encode([query_text])[0]
    except Exception as e:
        print(f"Error encoding query text in search_vector_db: {e}")
        return []

    # Similarity search; payloads are requested so the text and metadata come back with each hit.
    try:
        search_hits = qdrant_client_instance.search(
            collection_name=qdrant_collection_name,
            query_vector=query_embedding.tolist(),
            limit=top_k,
            with_payload=True,
        )
    except Exception as e:
        print(f"Error during Qdrant search operation: {e}")
        return []

    return [_as_result(hit) for hit in search_hits]

print("`search_vector_db` function defined for retrieving documents.")

# Smoke test: run one sample query, but only when the client, the model and the
# indexed documents all exist and the collection actually holds points.
if not (qdrant_client_instance and embedding_model_instance and documents_for_indexing):
    print("\nSkipping retrieval function test: Qdrant client, embedding model, or indexed documents are not ready.")
else:
    try:
        collection_info_for_test = qdrant_client_instance.get_collection(collection_name=qdrant_collection_name)
        if collection_info_for_test.points_count > 0:
            test_search_query = "hotel with excellent spa facilities and city view"
            print(f"\n--- Testing `search_vector_db` function ---")
            print(f"Test Query: \"{test_search_query}\"")
            retrieved_search_results = search_vector_db(test_search_query, top_k=2)
            if not retrieved_search_results:
                print("No results found for the test search query or the search operation failed.")
            else:
                print(f"Found {len(retrieved_search_results)} results for test query:")
                # Results are numbered from 1 for display.
                for rank, res_item in enumerate(retrieved_search_results, start=1):
                    print(f"  Result {rank}: ID: {res_item['id']}, Score: {res_item['score']:.4f}")
                    print(f"    Content (snippet): {res_item['content'][:120]}...")
                    print(f"    Metadata (source): {res_item['metadata'].get('source', 'N/A')}")
        else:
            print("Skipping retrieval function test as Qdrant collection is empty.")
    except Exception as e_test_search:
        print(f"Could not perform retrieval function test due to an error: {e_test_search}")


# Cell 8: CrewAI & LLM Setup Imports

# Core CrewAI classes for defining agents, tasks, and crews.
from crewai import Agent, Task, Crew, Process
# Base class for creating custom tools for CrewAI agents.
from crewai.tools import BaseTool # Corrected import path.
# LangChain's OpenAI chat model wrapper.
from langchain_openai import ChatOpenAI
# os and OPENAI_API_KEY are assumed to be loaded from Cell 1.

# Shared chat model for all CrewAI agents; stays None when it cannot be created.
llm_for_crewai = None

if OPENAI_API_KEY:
    # Key is present: build the ChatOpenAI wrapper, reporting (not raising) any failure.
    try:
        llm_for_crewai = ChatOpenAI(
            model_name="gpt-3.5-turbo-0125",
            temperature=0.2,  # Low temperature for more deterministic answers.
            openai_api_key=OPENAI_API_KEY,
        )
        print("OpenAI LLM for CrewAI initialized (gpt-3.5-turbo-0125).")
    except Exception as e_llm_init:
        print(f"Error initializing OpenAI LLM for CrewAI: {e_llm_init}. Check your API key and relevant package versions.")
else:
    print("ERROR: OPENAI_API_KEY environment variable not set. CrewAI agents cannot be initialized.")


# Cell 9: Retrieval Agent Tool Setup

# CrewAI tool wrapping `search_vector_db`; stays None if the tool cannot be built.
crewai_vector_db_tool = None
try:
    # Bare name lookup: raises NameError (handled below) when the retrieval cell was not run.
    search_vector_db
    print("\n`search_vector_db` function from Member A's work is available.")

    # Tool that forwards a query string to `search_vector_db` and renders the hits as text.
    class VectorDBQueryToolForCrew(BaseTool):
        # Tool name shown to the agent.
        name: str = "Travel Information Vector Database Query Tool"
        # Usage instructions shown to the agent.
        description: str = (
            "Use this specialized tool to query the travel vector database. "
            "Input MUST be the user's specific query string. "
            "The tool will find relevant hotel reviews or travel information snippets."
        )

        # Entry point invoked when the tool is run.
        def _run(self, user_query: str) -> str:
            # Reject anything that is not a non-blank string.
            query_is_usable = isinstance(user_query, str) and bool(user_query.strip())
            if not query_is_usable:
                return "Error: Invalid input. The user query must be a non-empty string."

            print(f"[VectorDBQueryToolForCrew._run] Received query for DB search: '{user_query}'")
            hits = search_vector_db(query_text=user_query, top_k=3)
            if not hits:
                return "No relevant information snippets were found in the database for this specific query."

            # Header line followed by one entry per hit; content is cut to 300 characters.
            pieces = ["Retrieved Information Snippets (Format: [Source, Rating, Relevance Score] Content):\n"]
            for position, hit in enumerate(hits, start=1):
                meta = hit.get('metadata', {})
                pieces.append(
                    f"Snippet {position}: [{meta.get('source', 'Unknown')}, "
                    f"Rating: {meta.get('rating', 'N/A')}, "
                    f"Score: {hit.get('score', 0.0):.3f}] "
                    f"{hit.get('content', 'N/A')[:300]}...\n---\n"
                )
            return "".join(pieces)

    crewai_vector_db_tool = VectorDBQueryToolForCrew()
    print("VectorDBQueryToolForCrew (CrewAI tool) created successfully.")

    # Exercise the tool once with a sample query and show the start of its output.
    print("\n--- Testing the CrewAI Tool directly ---")
    test_query_for_crewai_tool_instance = "any good hotels in downtown with a gym and free breakfast?"
    print(f"Tool Test Query: '{test_query_for_crewai_tool_instance}'")
    retrieved_info_from_tool_test = crewai_vector_db_tool.run(test_query_for_crewai_tool_instance)
    print(f"Tool Retrieved Info (first 600 chars):\n{retrieved_info_from_tool_test[:600]}...")

except NameError:
    # `search_vector_db` (or another name used above) is not defined.
    print("CRITICAL ERROR: `search_vector_db` function from Member A is not defined. Cannot create CrewAI tool. Please ensure Member A's cells are run successfully first.")
except Exception as e_tool_setup:
    print(f"An unexpected error occurred during CrewAI tool setup: {e_tool_setup}")


# Cell 10: Define CrewAI Agents

# The three agents of the pipeline (retrieve -> summarize -> compose). Each stays None
# unless both the LLM and the vector-DB tool were created by the earlier cells;
# `get_travel_recommendation_crewai` checks for that before running.
crewai_retriever_agent = None
crewai_summarizer_agent = None
crewai_composer_agent = None

if llm_for_crewai and crewai_vector_db_tool:
    print("\nDefining CrewAI agents...")
    # Retriever: the only agent given the vector-DB tool; instructed to pass the query
    # to the tool and return the tool's output.
    crewai_retriever_agent = Agent(
        role='Travel Information Retrieval Specialist',
        goal=("Efficiently search the travel vector database using the 'Travel Information Vector Database Query Tool'. "
              "Your sole objective is to pass the user's query to this tool and return its exact output."),
        backstory=(
            # The trailing space on the first literal matters: adjacent literals are
            # concatenated as-is, and without it the prompt read "...Query Tool'.When given...".
            "You are an AI assistant specialized in data retrieval. You have one tool: 'Travel Information Vector Database Query Tool'. "
            "When given a user query, you must use this tool by providing the query as input. Do not attempt to answer the query yourself; only use the tool."
        ),
        tools=[crewai_vector_db_tool],
        llm=llm_for_crewai,
        verbose=True,
        allow_delegation=False,
        memory=False
    )
    print("CrewAI Retriever Agent defined.")

    # Summarizer: no tools; condenses the retrieved snippets with respect to the user's query.
    crewai_summarizer_agent = Agent(
        role='Information Synthesis Expert',
        goal=("Take the collection of retrieved text snippets (output from the Retriever Agent) and the original user query. "
              "Identify the most critical information relevant to the query, eliminate redundancy, and produce a concise, factual summary."),
        backstory=(
            "You are an AI assistant skilled in processing and synthesizing textual information from multiple sources. "
            "You receive raw text snippets. Your task is to distill these into a coherent summary that directly addresses the user's needs as stated in their original query. Focus on facts and key opinions."
        ),
        llm=llm_for_crewai,
        verbose=True,
        allow_delegation=False,
        memory=False
    )
    print("CrewAI Summarizer Agent defined.")

    # Composer: no tools; turns the summary into the final user-facing recommendation.
    crewai_composer_agent = Agent(
        role='Travel Recommendation Composer and Advisor',
        goal=("Generate a helpful, well-formatted, and user-friendly travel recommendation or response. "
              "This response should be based *only* on the synthesized summary provided by the Summarizer Agent and should directly address the user's original query. "
              "Adopt a friendly, knowledgeable travel assistant persona."),
        backstory=(
            "You are a creative and articulate AI travel assistant. You receive a concise summary of relevant information. "
            "Your responsibility is to craft this summary into an engaging and practical piece of travel advice, an itinerary suggestion, or a direct recommendation. "
            "Pay close attention to tone (friendly, helpful), clarity, and formatting (e.g., use bullet points, bolding for emphasis if it enhances readability)."
        ),
        llm=llm_for_crewai,
        verbose=True,
        allow_delegation=False,
        memory=False
    )
    print("CrewAI Composer Agent defined.")
else:
    print("\nSkipping CrewAI agent definitions: LLM for CrewAI or the CrewAI Vector DB Tool is not initialized.")


# Cell 11: Define Main Function for Crew Execution & Demo

# Main function to process a user query using the CrewAI setup.
def get_travel_recommendation_crewai(user_query: str) -> str:
    """Answer one travel question with the retrieve -> summarize -> compose crew.

    One task per agent is built around ``user_query``. The summarization task
    lists the retrieval task as its context and the composition task lists the
    summarization task, and all three are run in order by a freshly created
    sequential ``Crew``.

    Args:
        user_query: The traveller's question; must be a non-blank string.

    Returns:
        ``str()`` of the crew's kickoff result on success. Problems are
        reported through the return value rather than raised: an
        ``"Error: ..."`` message when an agent is missing or the query is
        invalid, or an apology containing the exception text when the crew
        run fails.
    """
    pipeline_agents = (crewai_retriever_agent, crewai_summarizer_agent, crewai_composer_agent)

    # Guard clauses: a missing agent is reported before the query is inspected.
    if not all(pipeline_agents):
        return "Error: One or more CrewAI agents are not initialized. Please check the setup and logs."
    query_is_usable = isinstance(user_query, str) and bool(user_query.strip())
    if not query_is_usable:
        return "Error: User query is invalid (empty or not a string)."

    print(f"\n--- Initiating CrewAI Process for Query: '{user_query}' ---")

    # Every prompt embeds the query wrapped in single quotes; build that once.
    quoted_query = f"'{user_query}'"

    # Step 1: the retriever is told to feed the exact query to the vector DB tool.
    fetch_snippets_task = Task(
        description=(
            f"A user is asking the following travel-related question: {quoted_query}. "
            "Your primary objective is to use the 'Travel Information Vector Database Query Tool' "
            f"by providing it with this exact query: {quoted_query}. "
            "Ensure you extract all relevant text snippets from the database that could help answer this query."
        ),
        expected_output=(
            "A string containing several text snippets retrieved from the vector database via the tool. "
            "Each snippet should be clearly demarcated. This output will be passed to the Summarizer Agent for further processing."
        ),
        agent=crewai_retriever_agent,
    )

    # Step 2: the summarizer works from the retrieval task's output (via context).
    condense_snippets_task = Task(
        description=(
            f"You have received a collection of retrieved text snippets related to the user's original query: {quoted_query}. "
            "Your task is to carefully analyze these snippets, identify the most critical and relevant pieces of information, "
            "eliminate any redundancy or irrelevant details, and then synthesize a concise, factual summary. "
            "The summary must directly address the key aspects of the user's query."
        ),
        expected_output=(
            "A short, coherent, and neutral summary of the most important facts and opinions extracted from the retrieved texts. "
            "This summary will be used by the Composer Agent to formulate the final user response."
        ),
        agent=crewai_summarizer_agent,
        context=[fetch_snippets_task],
    )

    # Step 3: the composer turns the summary into the user-facing reply.
    write_reply_task = Task(
        description=(
            f"You have received a concise summary of information pertinent to the user's query: {quoted_query}. "
            "Your task is to craft a helpful, friendly, and well-formatted travel recommendation or direct answer for the user. "
            "Use only the information from the provided summary. Adopt a knowledgeable travel assistant persona. "
            "Format the response clearly, using bullet points or paragraphs as appropriate for readability."
        ),
        expected_output=(
            "The final, polished, user-facing response. It should be well-written, directly answer the user's query, "
            "provide practical recommendations if applicable, and be formatted for easy understanding. This is the ultimate output the user will see."
        ),
        agent=crewai_composer_agent,
        context=[condense_snippets_task],
    )

    # A new crew per call, with crew-level memory switched off.
    recommendation_crew = Crew(
        agents=list(pipeline_agents),
        tasks=[fetch_snippets_task, condense_snippets_task, write_reply_task],
        process=Process.sequential,  # Run the tasks in the order listed above.
        verbose=True,
        memory=False,
    )

    print("\nKicking off the CrewAI travel recommendation process...")
    try:
        kickoff_outcome = recommendation_crew.kickoff()
        print("CrewAI process execution finished successfully.")
        return str(kickoff_outcome)
    except Exception as kickoff_error:
        # Top-level boundary for the demo: log the failure and hand back a readable message.
        print(f"CRITICAL ERROR during CrewAI kickoff or execution: {kickoff_error}")
        return f"I'm sorry, an internal error occurred while processing your request with the AI crew. Details: {kickoff_error!s}"

# Announce that the entry point defined above is available to call.
print("`get_travel_recommendation_crewai` function defined and ready for use.")

# Demo run: requires all three agents, the OpenAI API key and the LLM; otherwise it is skipped.
if not all((crewai_retriever_agent, crewai_summarizer_agent, crewai_composer_agent, OPENAI_API_KEY, llm_for_crewai)):
    # Something upstream failed to initialize, so there is nothing to demonstrate.
    print("\nSkipping CrewAI demo run: One or more critical components (agents, OpenAI API key, LLM) are not initialized. Please check previous cells for errors.")
else:
    # First example: a hotel search with location and cleanliness preferences.
    print("\n--- Running CrewAI Demo with Example Query 1 ---")
    example_user_query_crew_1 = (
        "I want to find a quiet, charming boutique hotel in Paris, "
        "preferably in the Latin Quarter or Marais, with good reviews for cleanliness."
    )
    final_response_from_crew_1 = get_travel_recommendation_crewai(example_user_query_crew_1)
    print(
        f"\n--- Final Response for Query: '{example_user_query_crew_1}' ---",
        final_response_from_crew_1,
        sep="\n",
    )

    # Visual divider between the two demo transcripts.
    print(f"\n{'=' * 80}\n")

    # Second example: low-crowd cultural suggestions for a solo traveller.
    print("\n--- Running CrewAI Demo with Example Query 2 ---")
    example_user_query_crew_2 = (
        "Suggest some unique cultural experiences or hidden gems in Rome "
        "for a solo traveler interested in history but wants to avoid huge crowds."
    )
    final_response_from_crew_2 = get_travel_recommendation_crewai(example_user_query_crew_2)
    print(
        f"\n--- Final Response for Query: '{example_user_query_crew_2}' ---",
        final_response_from_crew_2,
        sep="\n",
    )

# --- END OF FILE chatgpt_commented.ipynb (Python Code) ---

/usr/local/bin/python3
2.19.0
3.9.2
2.19.0
OPENAI_API_KEY loaded successfully (from environment or .env file).
NLTK 'punkt' resource found.
NLTK 'punkt_tab' resource found.
Dataset 'tripadvisor_hotel_reviews.csv' loaded successfully. Shape: (20491, 2)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB

First 5 rows of the dataset:
                                              Review  Rating
0  nice hotel expensive parking got good deal sta...       4
1  ok nothing special charge diamond member hilto...       2
2  nice rooms not 4* experience hotel monaco seat...       3
3  unique, great stay, wonderful time hotel monac...       5
4  great stay great stay, went seahawk game aweso...       5

Cleaning reviews...
Cleaned revie

  qdrant_client_instance.recreate_collection(


Batches:   0%|          | 0/641 [00:00<?, ?it/s]

Embeddings generated.
Upserting 20494 points to Qdrant collection 'travel_guide_rag_collection_v2'...


  qdrant_client_instance.upsert(collection_name=qdrant_collection_name, points=points_to_upsert, wait=True)


Upsert complete.
Collection 'travel_guide_rag_collection_v2' now contains 20494 points.
`search_vector_db` function defined for retrieving documents.

--- Testing `search_vector_db` function ---
Test Query: "hotel with excellent spa facilities and city view"


  search_hits = qdrant_client_instance.search(


Found 2 results for test query:
  Result 1: ID: c7dd1a80-e4e2-48c4-8c02-2cccaa66ea4c, Score: 0.6264
    Content (snippet): greatplace,hotelstreetfrancatrainstationgreattravellingtrain,closemaincentrebeachesshopssiteswalkingdistance,hotelcleane...
    Metadata (source): tripadvisor_review_10146
  Result 2: ID: c9505d1e-14d7-49fd-9bb5-5478a4b92d54, Score: 0.6082
    Content (snippet): bestbestreasonaffordablebetterhotelscityseattle,greattimestaygreatservicefriendlyemployees,locationconvenientparkingchea...
    Metadata (source): tripadvisor_review_20473
OpenAI LLM for CrewAI initialized (gpt-3.5-turbo-0125).

`search_vector_db` function from Member A's work is available.
VectorDBQueryToolForCrew (CrewAI tool) created successfully.

--- Testing the CrewAI Tool directly ---
Tool Test Query: 'any good hotels in downtown with a gym and free breakfast?'
Using Tool: Travel Information Vector Database Query Tool
[VectorDBQueryToolForCrew._run] Received query for DB search: 'any good hotels in d

  search_hits = qdrant_client_instance.search(


Tool Retrieved Info (first 600 chars):
Retrieved Information Snippets (Format: [Source, Rating, Relevance Score] Content):
Snippet 1: [tripadvisor_review_4248, Rating: 5, Score: 0.517] greathotellocationstayednightweekaugust,hotelcleanbeautiful,locationgreatrighteatoncentre,easyconnectionsubwayeasywalkdowntown,parkcarforgetit.alittlepriceyvacationerroom240taxesbeatsstayingoutsidecitybuckingtrafficmakingquestionablereservationhotelcitynotfeelsecure.go,...
---
Snippet 2: [tripadvisor_review_2368, Rating: 5, Score: 0.506] greathotel,stayed5daybreak,roomtowersgreatview,hotelstaffpoliteroomscleangreatlocation,...
---
Snippet 3: [tripadv...

Defining CrewAI agents...
CrewAI Retriever Agent defined.
CrewAI Summarizer Agent defined.
CrewAI Composer Agent defined.
`get_travel_recommendation_crewai` function defined and ready for use.

--- Running CrewAI Demo with Example Query 1 ---

--- Initiating CrewAI Process for Query: 'I want to find a quiet, charming boutique hotel in Paris, preferably i

Output()