In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
# force_remount=True requests a fresh mount even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

# Make the Colab Notebooks folder the working directory, so the bare file names
# used by later cells (e.g. "rfm.xls") resolve relative to this folder.
%cd /content/drive/My\ Drive/Colab\ Notebooks/

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks


In [None]:
import pandas as pd
import numpy as np

# Read the five input tables from the current working directory, in a fixed order.
# NOTE(review): the files carry an .xls extension but are parsed with read_csv,
# so they are presumably delimited-text exports -- confirm.
rfm, context_recon, context_policy, context_action, retail = (
    pd.read_csv(file_name)
    for file_name in (
        "rfm.xls",
        "contextual_policy_recommendations.xls",
        "contextual_policy_summary.xls",
        "contextual_policy_tier_action_mix.xls",
        "Online_retail_cleaned.xls",
    )
)

**Google API Key Setup**

In [None]:
import os
from dotenv import load_dotenv

# Resolve the Google API key for this session.
# First choice: a key already present in the environment or in a local .env file.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

# Fallback: ask for it interactively without echoing it to the notebook output.
if not api_key:
    from getpass import getpass
    # Strip surrounding whitespace so a whitespace-only entry counts as "not entered".
    api_key = getpass("Please enter your Google API key: ").strip()

    # Validate BEFORE exporting: an empty value must never be written to
    # os.environ, where later cells would pick it up as if it were a real key.
    if not api_key:
        raise ValueError("API key not entered. Please provide your key.")

    os.environ['GOOGLE_API_KEY'] = api_key

# Reaching this point means `api_key` holds a non-empty value for this session.
print("API key successfully loaded for this session.")


Please enter your Google API key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
API key successfully loaded for this session.


**Generating Natural Language Customer RFM Summaries**

This code snippet iterates through a pandas DataFrame named rfm, which presumably contains the results of a Recency, Frequency, and Monetary (RFM) analysis. For every customer record, it extracts their RFM metrics and their churn status (from the Churn_Label column). It then uses an f-string to generate a descriptive, human-readable text summary of the customer's behavior and classification. These summaries are collected into a list called rfm_docs, effectively converting structured numerical data into a list of natural language documents, before printing the first five for review.

In [None]:
import pandas as pd

def _rfm_sentence(record):
    """Render one row of `rfm` as a single plain-English sentence."""
    # Churn_Label == 1 is worded as "churned"; every other value as "not churned".
    status = 'churned' if record['Churn_Label'] == 1 else 'not churned'
    return (
        f"Customer {record['Customer ID']} has a recency of {record['Recency']} days, "
        f"a purchase frequency of {record['Frequency']} times, "
        f"and a total monetary spend of ${record['Monetary']:.2f}. "
        f"This customer is classified as {status}."
    )


# One sentence per customer, in the same order as the rows of `rfm`.
rfm_docs = [_rfm_sentence(record) for _, record in rfm.iterrows()]

# Show the first few generated documents
print(rfm_docs[:5])

['Customer 12346 has a recency of 66 days, a purchase frequency of 14 times, and a total monetary spend of $372.86. This customer is classified as churned.', 'Customer 12347 has a recency of 2 days, a purchase frequency of 2 times, and a total monetary spend of $1323.32. This customer is classified as not churned.', 'Customer 12348 has a recency of 73 days, a purchase frequency of 1 times, and a total monetary spend of $222.16. This customer is classified as churned.', 'Customer 12349 has a recency of 42 days, a purchase frequency of 3 times, and a total monetary spend of $2064.39. This customer is classified as churned.', 'Customer 12351 has a recency of 10 days, a purchase frequency of 1 times, and a total monetary spend of $300.93. This customer is classified as not churned.']


**Generating Contextual Policy Recommendation Summaries**

This code snippet processes a DataFrame named context_recon (which contains contextual policy recommendations). It iterates over each customer record to extract the Customer_ID, the Chosen_Action (the recommended retention strategy), and the Estimated_Reward (the projected financial return, or ROI) for that action. It then converts this structured data into a descriptive text format, creating a sentence that summarizes the recommended action and its predicted ROI for each specific customer. Finally, these text summaries are collected in a list called policy_docs, with the first five documents being printed.

In [None]:
import pandas as pd

# `context_recon` is expected to be in memory already (loaded by an earlier cell).

def _policy_sentence(record):
    """Render one per-customer recommendation row as a single sentence."""
    return (
        f"For customer {record['Customer_ID']}, the recommended retention action "
        f"is to use '{record['Chosen_Action']}'. "
        f"This action has a projected ROI of ${record['Estimated_Reward']:.2f}."
    )


# One sentence per recommendation, in row order.
policy_docs = [_policy_sentence(record) for _, record in context_recon.iterrows()]

print(policy_docs[:5])

["For customer 12346, the recommended retention action is to use 'call+coupon'. This action has a projected ROI of $2.15.", "For customer 12347, the recommended retention action is to use 'email'. This action has a projected ROI of $0.00.", "For customer 12348, the recommended retention action is to use 'none'. This action has a projected ROI of $0.00.", "For customer 12349, the recommended retention action is to use 'sms'. This action has a projected ROI of $2.99.", "For customer 12351, the recommended retention action is to use 'sms+coupon'. This action has a projected ROI of $0.00."]


**Generating Contextual Policy Action Summaries**

This code snippet aggregates a summary of recommended actions from a DataFrame named context_policy. It iterates through each row, extracting the specific retention action (Chosen_Action), the average projected return on investment (Average_Reward), and the volume (number of customers) assigned to that action. The code then compiles this information into a list of descriptive text documents (summary_docs), which clearly state how many customers were given a particular action and what the action's overall average projected financial benefit is, before printing the resulting summary documents.

In [None]:
import pandas as pd

# `context_policy` is expected to be in memory already (loaded by an earlier cell).

def _summary_sentence(record):
    """Render one per-action summary row as a single sentence."""
    return (
        f"The retention action '{record['Chosen_Action']}' was assigned to "
        f"{record['Customers']} customers, "
        f"with an average projected ROI of ${record['Average_Reward']:.2f}."
    )


# One sentence per retention action, in row order.
summary_docs = [_summary_sentence(record) for _, record in context_policy.iterrows()]

print(summary_docs)

["The retention action 'call+coupon' was assigned to 2247 customers, with an average projected ROI of $3.74.", "The retention action 'sms+coupon' was assigned to 803 customers, with an average projected ROI of $2.13.", "The retention action 'sms' was assigned to 359 customers, with an average projected ROI of $0.05.", "The retention action 'email' was assigned to 462 customers, with an average projected ROI of $0.00.", "The retention action 'none' was assigned to 418 customers, with an average projected ROI of $0.00."]


**Extracting All Textual Content**

This code snippet's purpose is to read and extract all the textual content — both markdown and code — from a Jupyter Notebook file. It first uses the json library to load the raw structure of the notebook specified by notebook_path. It then iterates through every cell in the notebook, checks if the cell contains source code or text (in the source field), joins all the lines from that cell, and concatenates them into a single, continuous string named notebook_text. This effectively flattens the notebook's content into a searchable text document, with the final line printing the first 500 characters to verify the extraction.

In [None]:
import json

# Load the raw JSON structure of the companion analysis notebook.
notebook_path = "E-commerce_1_1.ipynb"
with open(notebook_path, 'r', encoding='utf-8') as f:
    notebook_content = json.load(f)

# Collect the text of every cell (markdown and code alike).
# A cell's `source` may be stored either as a list of lines or as one single
# string; both forms are accepted so no cell is silently dropped.
cell_texts = []
for cell in notebook_content['cells']:
    source = cell.get('source')
    if isinstance(source, str):
        cell_texts.append(source)
    elif isinstance(source, list):
        cell_texts.append("".join(source))

# Each cell is followed by a blank line; a single join avoids rebuilding the
# growing string once per cell.
notebook_text = "".join(text + "\n\n" for text in cell_texts)

# Print the first 500 characters to verify the extraction
print(notebook_text[:500])

**Loading Dataset & Info**

This Python code snippet defines a function, process_online_retail_data, that downloads, cleans, and transforms the "Online Retail" dataset from a UCI Machine Learning repository URL. It first uses the requests library to fetch the Excel file and pandas to load it into a DataFrame. The function then performs extensive data cleaning‚Äîincluding dropping missing CustomerID values, removing duplicates, filtering out non-positive unit prices, and handling outliers based on 


In [None]:
# Knowledge base = every per-customer RFM sentence, every per-customer policy
# sentence, every per-action summary sentence, then the notebook text as the
# single final entry.
all_docs = [*rfm_docs, *policy_docs, *summary_docs, notebook_text]

print(f"Total documents in your knowledge base: {len(all_docs)}")

Total documents in your knowledge base: 8584


In [None]:
import google.generativeai as genai
import os

# Read the key from the environment and hand it to the SDK.
# os.getenv returns None when GOOGLE_API_KEY is unset, so the key-setup cell
# must have run first.
# NOTE(review): this cell's captured output reports that the
# `google.generativeai` package is no longer supported and recommends
# switching to `google.genai`.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Print the name of every model returned by the API, to confirm that the
# embedding and chat models used later are available.
for model in genai.list_models():
    print(model.name)


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-flash-preview
models/gemini-3.1-pro-preview
models/gemini-3.1-pro-preview-customtools
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robotics-er-1.5-preview
models/gemini-2.5-computer-use-preview-10-2025
models/deep-research-pro-preview-12-2025
models/gemini-embedding-001
models/aqa
models/imagen-4.0-generate-001
models/imagen-

In [None]:
# Install the RAG components used below: chromadb (vector store) and
# langchain-text-splitters (chunking).
# -q keeps the output quiet; -U upgrades to the newest release, so versions are not pinned.
!pip install -qU chromadb langchain-text-splitters

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m52.0/52.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m21.5/21.5 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m278.2/278.2 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m17.1/17.1 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K

**Structuring Documents for Vector Indexing (RAG Preparation)**

This code snippet is performing a crucial data preparation step for a Retrieval-Augmented Generation (RAG) system by intelligently structuring documents for vector indexing. It first separates a single, long document (notebook_text, which is the entire content of the Colab notebook) from a list of shorter, specific summary documents (short_docs). It then uses the RecursiveCharacterTextSplitter to break the long notebook text into smaller, overlapping chunks (1000 characters with 200 character overlap) to ensure comprehensive context is preserved across splits. Finally, it combines these newly chunked notebook sections with the original short summary documents (like the RFM and policy recommendation texts) into a unified list called final_documents, ready to be indexed for efficient retrieval by a conversational AI.

In [None]:
# Preprocessing and Chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

# The notebook text is the last entry of `all_docs`; everything before it is a
# short one-sentence document that needs no chunking.
*short_docs, notebook_text = all_docs

# 1. Split the long notebook text into overlapping chunks, preferring to break
#    on paragraph boundaries, then lines, then spaces, then anywhere.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)
chunked_notebook_docs = text_splitter.create_documents([notebook_text])

# Keep only the plain text of each chunk.
chunked_text_list = [chunk.page_content for chunk in chunked_notebook_docs]

# 2. Final corpus: the short documents first, the notebook chunks after them.
final_documents = [*short_docs, *chunked_text_list]

print(f"Total documents after chunking: {len(final_documents)}")
print(f"Example chunk: {final_documents[-1][:150]}...")

Total documents after chunking: 8746
Example chunk: ### 9. Future Work

*   **Model Retraining:** Regularly retrain the model with fresh data to capture evolving customer behavior.
*   **Feature Expansi...


**Vectorizing and Indexing Knowledge Base (RAG System Setup)**

This code snippet performs the critical task of vectorizing and indexing a list of documents to create a knowledge base for a Retrieval-Augmented Generation (RAG) system. It initializes the Google Gemini client and defines the gemini-embedding-001 model for vector creation. It then sets up a persistent ChromaDB vector store in a local directory. The code proceeds to iterate through the final_documents (the combined, chunked text) in batches. For each batch, it calls the Gemini API to generate high-quality vector embeddings optimized for document retrieval. Finally, it stores these embeddings, along with their original text content and unique IDs, into the ChromaDB collection, thereby making the knowledge base searchable via semantic similarity.

In [None]:
# Creating and Indexing Embeddings
import google.generativeai as genai
import chromadb
import os

# The Gemini client is configured by an earlier cell via genai.configure();
# GOOGLE_API_KEY must be available in the environment.

# 1. Define the embedding model
EMBEDDING_MODEL = 'models/gemini-embedding-001'

# 2. Setup the ChromaDB Client and Collection
# The store is persisted in the directory 'chroma_db_ecommerce', so records
# written by earlier runs of this cell are still present when it runs again.
client = chromadb.PersistentClient(path="./chroma_db_ecommerce")
collection = client.get_or_create_collection(
    name="ecommerce_rag_knowledge_base",
)

# 3. Create Embeddings in Batches and Store in ChromaDB
batch_size = 100
for i in range(0, len(final_documents), batch_size):
    batch_docs = final_documents[i:i + batch_size]
    # IDs are positional: doc_0, doc_1, ... in `final_documents` order.
    batch_ids = [f"doc_{j}" for j in range(i, i + len(batch_docs))]

    # One API call embeds the whole batch; result['embedding'] holds one
    # vector per input document, in the same order.
    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=batch_docs,
        task_type="RETRIEVAL_DOCUMENT" # Optimizes embeddings for RAG retrieval
    )
    batch_embeddings = result['embedding']

    # upsert (not add): on a re-run the IDs already exist in the persistent
    # store, and add() would leave the old text and embeddings in place.
    collection.upsert(
        embeddings=batch_embeddings,
        documents=batch_docs,
        ids=batch_ids
    )

# 4. Remove leftovers from an earlier, longer run.
# Only IDs following this cell's own `doc_<n>` scheme with n beyond the current
# corpus are deleted; anything stored under another ID scheme is left alone.
total_docs = len(final_documents)
stale_ids = [
    doc_id
    for doc_id in collection.get(include=[])['ids']
    if doc_id.startswith("doc_")
    and doc_id[4:].isdigit()
    and int(doc_id[4:]) >= total_docs
]
for start in range(0, len(stale_ids), batch_size):
    collection.delete(ids=stale_ids[start:start + batch_size])

print(f"Successfully indexed {collection.count()} documents.")
print("Vector Store is ready for Retrieval.")

Successfully indexed 8763 documents.
Vector Store is ready for Retrieval.


**Implementing the Core RAG Chatbot Query Function**

This code snippet defines and demonstrates the core function of a Retrieval-Augmented Generation (RAG) chatbot designed for E-commerce analytics. The rag_chat_query function takes a user question and performs a two-step process: Retrieval and Generation.

Retrieval: It first converts the user's question into a vector embedding using the Gemini API. It then uses this vector to query the pre-built ChromaDB vector store for the top_k most semantically relevant text chunks from the knowledge base, which become the context.

Generation: It augments a prompt by combining the retrieved context with the original user question, and applies a system instruction to act as an expert E-commerce chatbot. Finally, it passes this complete prompt to the Gemini-2.5-flash model, which generates a concise answer based only on the provided context, thus ensuring grounded and factual responses.

In [None]:
# Define the RAG Query Function
def rag_chat_query(user_query: str, top_k: int = 3, llm_model: str = 'models/gemini-2.5-flash'):
    """Answer a question using documents retrieved from the vector store.

    The query is embedded with EMBEDDING_MODEL, the `top_k` nearest documents
    are fetched from `collection`, and the model named by `llm_model` is asked
    to answer from that retrieved text.

    Args:
        user_query: The question to answer; must contain non-whitespace text.
        top_k: Number of documents to retrieve; must be at least 1.
        llm_model: Name of the generative model to call.

    Returns:
        A tuple ``(answer_text, retrieved_context)`` where ``retrieved_context``
        is the retrieved documents joined by ``"\\n---\\n"``.

    Raises:
        ValueError: If `user_query` is blank or `top_k` is less than 1.
    """
    # Reject unusable arguments up front, before any API call is made.
    if not isinstance(user_query, str) or not user_query.strip():
        raise ValueError("user_query must be a non-empty string.")
    if top_k < 1:
        raise ValueError("top_k must be at least 1.")

    # 1. Retrieval: Convert query to embedding and search the vector store

    # A one-element list is sent, so the single query vector sits at index 0.
    query_embedding_result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=[user_query],
        task_type="RETRIEVAL_QUERY" # Optimizes query for retrieval
    )
    query_embedding = query_embedding_result['embedding'][0]

    # Use the vector store to search for similar documents (context)
    retrieved_results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=['documents']
    )

    # Results are grouped per query; index 0 is the list for our single query.
    retrieved_context = "\n---\n".join(retrieved_results['documents'][0])

    # 2. Augmentation & Generation: Build the prompt and call the LLM

    # Define a system instruction for the LLM
    system_instruction = (
        "You are an expert E-commerce Customer Retention and Analytics Chatbot. "
        "Your task is to answer user questions strictly based on the provided CONTEXT. "
        "Do not use external knowledge. Be concise and professional."
    )

    # Create the final prompt with the retrieved context
    prompt = f"""
    CONTEXT:
    ---
    {retrieved_context}
    ---

    QUESTION: {user_query}

    ANSWER:
    """

    # Call the Gemini LLM to generate the final response
    response = genai.GenerativeModel(
        model_name=llm_model,
        system_instruction=system_instruction
    ).generate_content(prompt)

    return response.text, retrieved_context

# Example: one question that needs both the policy sentence and the RFM
# sentence for the same customer.
user_question = (
    "What is the recommended retention action and projected ROI for customer 12349, "
    "and what is their recency?"
)
answer, context = rag_chat_query(user_question, top_k=5)

# Print the question, the generated answer, and the retrieved context in turn.
for section in (
    f"User Question: {user_question}",
    f"\n--- Chatbot Answer ---\n{answer}",
    f"\n--- Context Used for Answer ---\n{context}",
):
    print(section)

User Question: What is the recommended retention action and projected ROI for customer 12349, and what is their recency?

--- Chatbot Answer ---
For customer 12349, the recommended retention action is 'sms' with a projected ROI of $2.99. Information regarding recency is not available in the provided context.

--- Context Used for Answer ---
For customer 12349, the recommended retention action is to use 'sms'. This action has a projected ROI of $2.99.
---
For customer 12549, the recommended retention action is to use 'call+coupon'. This action has a projected ROI of $7.56.
---
For customer 17349, the recommended retention action is to use 'call+coupon'. This action has a projected ROI of $3.18.
---
For customer 13249, the recommended retention action is to use 'call+coupon'. This action has a projected ROI of $3.87.
---
For customer 14549, the recommended retention action is to use 'call+coupon'. This action has a projected ROI of $4.20.


**Merging Customer Profiles with Contextual Policy Recommendations**

This code snippet's primary function is to integrate two separate customer datasets — one containing Recency, Frequency, and Monetary (RFM) metrics (rfm) and the other containing contextual policy recommendations (context_recon) — into a single comprehensive DataFrame. It first attempts to load both dataframes, then renames the customer identifier column in the recommendation data (Customer_ID to Customer ID) for consistency. It then performs a left merge using 'Customer ID' as the key, effectively linking each customer's purchasing behavior (RFM) with their assigned retention action and projected return. Finally, it previews the new combined structure and saves the resulting dataset as merged_customer_profiles.csv.

In [None]:
import pandas as pd
import numpy as np

# Re-load both tables so the merge always starts from the files on disk.
try:
    rfm = pd.read_csv("rfm.xls")
    context_recon = pd.read_csv("contextual_policy_recommendations.xls")
except FileNotFoundError as e:
    # Name the files that are actually read, then stop: continuing would merge
    # stale frames left in the kernel (or fail later with a NameError).
    print(f"Error loading files. Ensure 'rfm.xls' and 'contextual_policy_recommendations.xls' are in the directory: {e}")
    raise

# Rename the key column in context_recon to match rfm before merging
context_recon = context_recon.rename(columns={'Customer_ID': 'Customer ID'})

# Left merge: every rfm row is kept; customers without a recommendation get
# NaN in the recommendation columns. Columns present in both tables are
# suffixed _x (from rfm) and _y (from context_recon).
merged_customer_data = pd.merge(rfm, context_recon, on='Customer ID', how='left')

# Preview the new merged data structure
print("--- Merged Data Head ---")
print(merged_customer_data.head())

# Save the merged DataFrame for inspection (optional)
merged_customer_data.to_csv("merged_customer_profiles.csv", index=False)

--- Merged Data Head ---
   Customer ID     LastPurchaseDate  Recency_x  Frequency_x  Monetary_x  \
0        12346  2010-10-04 16:33:00         66           14      372.86   
1        12347  2010-12-07 14:57:00          2            2     1323.32   
2        12348  2010-09-27 14:59:00         73            1      222.16   
3        12349  2010-10-28 08:23:00         42            3     2064.39   
4        12351  2010-11-29 15:23:00         10            1      300.93   

   Churn_Label Risk_Tier         Country  Recency_y  Frequency_y  Monetary_y  \
0            1    Medium  United Kingdom         66           14      372.86   
1            0       NaN         Iceland          2            2     1323.32   
2            1       NaN         Finland         73            1      222.16   
3            1    Medium           Italy         42            3     2064.39   
4            0       NaN     Unspecified         10            1      300.93   

  Chosen_Action  Chosen_Score  Estimated_Re

**Generating Comprehensive Customer Profile Documents**

This code snippet's purpose is to create comprehensive, single-source documents for each customer by combining their behavioral data with their strategic recommendations. It iterates through the merged_customer_data DataFrame, which contains both RFM metrics (Recency, Frequency, Monetary) and policy recommendations (Chosen Action, Estimated Reward). For every row, it extracts all these distinct features and synthesizes them into one rich, natural-language string. These detailed profile strings are collected into the merged_docs list, which are highly informative documents ready for use in advanced downstream systems like a Retrieval-Augmented Generation (RAG) knowledge base.

In [None]:
# Build one combined document per customer from the merged DataFrame.
merged_docs = []
for index, row in merged_customer_data.iterrows():
    customer_id = row['Customer ID']
    # The _x columns are the ones that came from the rfm table in the merge.
    recency = row['Recency_x']
    frequency = row['Frequency_x']
    monetary = row['Monetary_x']
    churn = 'churned' if row['Churn_Label'] == 1 else 'not churned'
    action = row['Chosen_Action']
    reward = row['Estimated_Reward']

    # The merge is a left join, so a customer without a recommendation carries
    # NaN here; say so explicitly instead of emitting "'nan'" and "$nan".
    if pd.isna(action) or pd.isna(reward):
        recommendation = "No retention recommendation is available for this customer."
    else:
        recommendation = (
            f"The recommended retention action is to use '{action}' "
            f"with a projected ROI of ${reward:.2f}."
        )

    # Combine all information into one single, rich document
    text = (
        f"Customer Profile {customer_id}: "
        f"Recency is {recency} days, Frequency is {frequency} times, "
        f"and Monetary spend is ${monetary:.2f}. "
        f"This customer is classified as {churn}. "
        f"{recommendation}"
    )
    merged_docs.append(text)

# Show the document for customer 12349, located by its ID rather than by a
# fixed list position, so the example survives any change in row order.
example_customer_id = 12349
example_prefix = f"Customer Profile {example_customer_id}: "
example_doc = next(
    (doc for doc in merged_docs if doc.startswith(example_prefix)),
    None,
)
print("\n--- Example of Improved Document for Customer 12349 ---")
if example_doc is not None:
    print(example_doc)
else:
    print(f"No profile found for customer {example_customer_id}.")


--- Example of Improved Document for Customer 12349 ---
Customer Profile 12349: Recency is 42 days, Frequency is 3 times, and Monetary spend is $2064.39. This customer is classified as churned. The recommended retention action is to use 'sms' with a projected ROI of $2.99.


**Initiating and Running the Interactive RAG Chatbot Session**

This code snippet defines the function start_chatbot_session(), which establishes and runs a continuous, interactive conversational loop for the E-commerce Retrieval-Augmented Generation (RAG) system. The function initializes the user interface, prompting the user for input. Inside a while loop, it accepts a user query, checks for exit commands (quit or exit), and then passes the input to the previously defined rag_chat_query function. This query function retrieves relevant context from the vector database and uses the Gemini LLM to generate an answer. The loop continuously prints the chatbot's response and, for transparency, the source context used to generate that answer, until the user explicitly terminates the session.

In [None]:
# Define and Run the Continuous Chat Loop
import os
import google.generativeai as genai

# NOTE: rag_chat_query and the objects it relies on must already be defined
# by the earlier cells of this notebook before the session is started.

def start_chatbot_session():
    """Run an interactive question/answer loop on top of rag_chat_query.

    Reads questions from standard input until the user types 'quit' or 'exit'
    (any letter case, surrounding whitespace ignored) or input ends. Blank
    input is skipped. For each question the answer and the retrieved context
    are printed. An error raised while answering is reported and the loop
    continues.

    Returns:
        None.
    """

    print("--- E-commerce RAG Chatbot Initialized ---")
    print("Ask a question about customer data or retention policies.")
    print("Type 'quit' or 'exit' to end the session.\n")

    # Main conversational loop
    while True:
        # End the session cleanly if input is closed or the prompt is
        # interrupted, instead of surfacing a traceback.
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nChatbot session ended. Goodbye!")
            break

        # Nothing typed: ask again without calling the API.
        if not user_input:
            continue

        # Exit commands (input was stripped above, so " quit " also matches).
        if user_input.lower() in ("quit", "exit"):
            print("\nChatbot session ended. Goodbye!")
            break

        try:
            # Retrieve the 3 closest documents as context for the answer.
            answer, context = rag_chat_query(user_input, top_k=3)

            print(f"\n🤖 Chatbot: {answer}")

            # Show the source context for verification
            print("\n[Source Context Retrieved]:")
            print(context)
            print("-------------------------------------------\n")

        except Exception as e:
            # Keep the session alive on a failed query and report the cause.
            print(f"\n❌ An error occurred: {e}. Please try again.")


**Interactive Dashboard**

In [None]:
# Start the interactive chatbot loop; it keeps prompting until the user
# types 'quit' or 'exit'.
start_chatbot_session()

--- E-commerce RAG Chatbot Initialized ---
Ask a question about customer data or retention policies.
Type 'quit' or 'exit' to end the session.

You: Which customers have the highest Recency scores in the dataset?

ü§ñ Chatbot: All listed customers (12481, 17841, 18041) have a Recency score of 0 days.

[Source Context Retrieved]:
Customer 12481 has a recency of 0 days, a purchase frequency of 7 times, and a total monetary spend of $6171.07. This customer is classified as not churned.
---
Customer 17841 has a recency of 0 days, a purchase frequency of 123 times, and a total monetary spend of $26664.52. This customer is classified as not churned.
---
Customer 18041 has a recency of 0 days, a purchase frequency of 21 times, and a total monetary spend of $4520.34. This customer is classified as not churned.
-------------------------------------------

You: Identify customers with high Monetary value but low Frequency.

ü§ñ Chatbot: Customer 15478 has a monetary spend of $583.56 and a purc

Python code for 'dashboard_app.py' (do not run here)

Create a new text file on the Desktop, paste in the code for 'dashboard_app.py', and save the file under the name 'dashboard_app.py' (replacing the .txt extension).

**Use PowerShell 7 as the command prompt for the steps below**

1. Copy 'pip install streamlit pandas numpy plotly' into the prompt and press Enter.
2. Type 'cd OneDrive' and press Enter.
3. Type 'cd Desktop' and press Enter.
4. Type 'streamlit run dashboard_app.py' and press Enter to launch the dashboard.


1. $env:GOOGLE_API_KEY = "your-real-key"
2. python prepare_vector_store.py ...
3. streamlit run streamlit_app.py
