In [None]:
from google.colab import files

# Upload kaggle.json. The return value (a dict of filename -> file bytes) is
# bound to a name on purpose: leaving `files.upload()` as the cell's last
# expression makes the notebook echo that dict, which writes the Kaggle API
# key into the saved cell output.
uploaded_files = files.upload()  # Upload kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Create ~/.kaggle if it does not exist yet (-p: no error when already present).
!mkdir -p ~/.kaggle
# Copy the uploaded credentials file from the working directory into ~/.kaggle/.
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!pip install kagglehub pandas langchain langchain-community sentence-transformers faiss-cpu transformers torch

import kagglehub
import pandas as pd
import os
import zipfile
from langchain.schema import Document

def download_dataset():
    """
    Fetch the "Simple Dialogs for Chatbot" dataset through kagglehub and locate dialogs.txt.

    Every file found under the download directory is listed on stdout. Any file
    whose name ends in ``.zip`` is extracted into the directory that contains it
    before the search for dialogs.txt (matched case-insensitively) starts.

    Returns:
        str: Path to the first dialogs.txt encountered while walking the dataset directory.

    Raises:
        FileNotFoundError: If no dialogs.txt exists after extraction.
        Exception: Any other failure is reported on stdout and re-raised unchanged.
    """
    try:
        base_dir = kagglehub.dataset_download("grafstor/simple-dialogs-for-chatbot")
        print("Path to dataset files:", base_dir)

        # Pass 1: report every file and unpack archives next to where they were found.
        print("Files in dataset directory:")
        for folder, _subdirs, names in os.walk(base_dir):
            for name in names:
                print(f" - {name}")
                if not name.endswith('.zip'):
                    continue
                archive_path = os.path.join(folder, name)
                with zipfile.ZipFile(archive_path, 'r') as archive:
                    archive.extractall(folder)
                    print(f"Extracted {archive_path}")

        # Pass 2: a fresh walk so that files produced by the extraction are visible.
        candidates = (
            os.path.join(folder, name)
            for folder, _subdirs, names in os.walk(base_dir)
            for name in names
            if name.lower() == "dialogs.txt"
        )
        dialogs_path = next(candidates, None)
        if dialogs_path is None:
            raise FileNotFoundError("dialogs.txt not found in the downloaded dataset. Check listed files above.")
        return dialogs_path

    except Exception as err:
        print(f"Error downloading dataset: {err}")
        raise

def load_dataset(file_path: str) -> list[Document]:
    """
    Parse the tab-separated dialogs file into LangChain Documents.

    Each usable line has the form ``question<TAB>response``. The response becomes
    the document's ``page_content`` and the question is stored under the
    ``"question"`` metadata key. Lines with fewer than two tab-separated fields,
    or with an empty question or response, are skipped; fields beyond the
    second are ignored.

    Args:
        file_path (str): Path to the TXT file.

    Returns:
        list[Document]: One Document per valid question/response line, in file order.

    Raises:
        ValueError: If the file yields no valid pair.
        Exception: Any other failure is reported on stdout and re-raised unchanged.
    """
    try:
        parsed = []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split('\t')
                # A blank line splits into a single empty field, so this guard
                # also covers the empty-line case.
                if len(fields) < 2:
                    continue
                prompt, reply = fields[:2]
                if not (prompt and reply):
                    continue
                parsed.append(
                    Document(
                        page_content=str(reply),
                        metadata={"question": str(prompt)},
                    )
                )

        if len(parsed) == 0:
            raise ValueError("No valid question-response pairs found in dialogs.txt")

        return parsed

    except Exception as err:
        print(f"Error loading dataset: {err}")
        raise

# Fetch the dataset, parse it into Documents, and preview the first three.
try:
    txt_path = download_dataset()
    documents = load_dataset(txt_path)

    # Sanity check: show the response text (truncated to 100 characters) and
    # the metadata of the first few records.
    for doc in documents[:3]:
        print(
            f"Content: {doc.page_content[:100]}...",
            f"Metadata: {doc.metadata}\n",
            sep="\n",
        )
except Exception as exc:
    print(f"Error in main execution: {exc}")

Path to dataset files: /kaggle/input/simple-dialogs-for-chatbot
Files in dataset directory:
 - dialogs.txt
Content: i'm fine. how about yourself?...
Metadata: {'question': 'hi, how are you doing?'}

Content: i'm pretty good. thanks for asking....
Metadata: {'question': "i'm fine. how about yourself?"}

Content: no problem. so how have you been?...
Metadata: {'question': "i'm pretty good. thanks for asking."}



In [None]:
!pip install kagglehub pandas langchain langchain-community sentence-transformers faiss-cpu transformers torch

import kagglehub
import pandas as pd
import os
import zipfile
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def download_dataset():
    """
    Fetch the "Simple Dialogs for Chatbot" dataset through kagglehub and locate dialogs.txt.

    Every file found under the download directory is listed on stdout. Any file
    whose name ends in ``.zip`` is extracted into the directory that contains it
    before the search for dialogs.txt (matched case-insensitively) starts.

    Returns:
        str: Path to the first dialogs.txt encountered while walking the dataset directory.

    Raises:
        FileNotFoundError: If no dialogs.txt exists after extraction.
        Exception: Any other failure is reported on stdout and re-raised unchanged.
    """
    try:
        base_dir = kagglehub.dataset_download("grafstor/simple-dialogs-for-chatbot")
        print("Path to dataset files:", base_dir)

        # Pass 1: report every file and unpack archives next to where they were found.
        print("Files in dataset directory:")
        for folder, _subdirs, names in os.walk(base_dir):
            for name in names:
                print(f" - {name}")
                if not name.endswith('.zip'):
                    continue
                archive_path = os.path.join(folder, name)
                with zipfile.ZipFile(archive_path, 'r') as archive:
                    archive.extractall(folder)
                    print(f"Extracted {archive_path}")

        # Pass 2: a fresh walk so that files produced by the extraction are visible.
        candidates = (
            os.path.join(folder, name)
            for folder, _subdirs, names in os.walk(base_dir)
            for name in names
            if name.lower() == "dialogs.txt"
        )
        dialogs_path = next(candidates, None)
        if dialogs_path is None:
            raise FileNotFoundError("dialogs.txt not found in the downloaded dataset. Check listed files above.")
        return dialogs_path

    except Exception as err:
        print(f"Error downloading dataset: {err}")
        raise

def load_dataset(file_path: str) -> list[Document]:
    """
    Parse the tab-separated dialogs file into LangChain Documents.

    Each usable line has the form ``question<TAB>response``. The response becomes
    the document's ``page_content`` and the question is stored under the
    ``"question"`` metadata key. Lines with fewer than two tab-separated fields,
    or with an empty question or response, are skipped; fields beyond the
    second are ignored.

    Args:
        file_path (str): Path to the TXT file.

    Returns:
        list[Document]: One Document per valid question/response line, in file order.

    Raises:
        ValueError: If the file yields no valid pair.
        Exception: Any other failure is reported on stdout and re-raised unchanged.
    """
    try:
        parsed = []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split('\t')
                # A blank line splits into a single empty field, so this guard
                # also covers the empty-line case.
                if len(fields) < 2:
                    continue
                prompt, reply = fields[:2]
                if not (prompt and reply):
                    continue
                parsed.append(
                    Document(
                        page_content=str(reply),
                        metadata={"question": str(prompt)},
                    )
                )

        if len(parsed) == 0:
            raise ValueError("No valid question-response pairs found in dialogs.txt")

        return parsed

    except Exception as err:
        print(f"Error loading dataset: {err}")
        raise

def setup_rag_pipeline(documents):
    """
    Set up a RAG pipeline using LangChain with FAISS vector store and a Hugging Face LLM.

    The documents are embedded with ``sentence-transformers/all-MiniLM-L6-v2``
    and indexed in an in-memory FAISS store. The retriever returns the 3 nearest
    documents, which a "stuff" RetrievalQA chain inserts into the prompt of a
    GPT-2 text-generation pipeline (at most 50 new tokens per answer).

    Args:
        documents (list[Document]): List of LangChain Document objects.

    Returns:
        RetrievalQA: Configured RAG chain for question answering. Its result
        dict carries the generated answer under ``"result"`` and the retrieved
        documents under ``"source_documents"``.

    Raises:
        Exception: Any failure during setup is reported on stdout and re-raised unchanged.
    """
    try:
        # Initialize embeddings
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # Create FAISS vector store from documents
        vector_store = FAISS.from_documents(documents, embeddings)

        # Initialize the retriever
        retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # Retrieve top 3 documents

        # Set up the LLM (using Hugging Face's GPT-2 as an example; replace with a better model if needed)
        model_name = "gpt2"  # Consider using "distilgpt2" or another lightweight model for Colab
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        text_generation_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=50,
            truncation=True,
            # Return only the newly generated tokens. By default the pipeline
            # returns prompt + completion, which made the chain's "result"
            # start with the whole "Use the following pieces of context..."
            # prompt template instead of just the answer.
            return_full_text=False,
            # GPT-2 ships without a pad token. Setting it explicitly to EOS is
            # what generate() falls back to anyway and silences the
            # "Setting `pad_token_id` to `eos_token_id`" warning on every call.
            pad_token_id=tokenizer.eos_token_id,
        )
        llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

        # Set up the RAG pipeline
        rag_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",  # Use retrieved documents directly
            retriever=retriever,
            return_source_documents=True
        )

        return rag_chain

    except Exception as e:
        print(f"Error setting up RAG pipeline: {e}")
        raise

# Download and load the dataset, build the RAG chain, and run one sample query.
try:
    txt_path = download_dataset()
    documents = load_dataset(txt_path)

    # Print first few documents to verify
    for doc in documents[:3]:
        print(f"Content: {doc.page_content[:100]}...")
        print(f"Metadata: {doc.metadata}\n")

    # Set up RAG pipeline
    rag_chain = setup_rag_pipeline(documents)

    # Test the RAG pipeline with a sample question.
    # Calling the chain object directly (Chain.__call__) is deprecated in
    # LangChain and emits a deprecation warning; invoke() is the supported
    # entry point and returns the same result dict.
    sample_question = "How are you doing?"
    result = rag_chain.invoke({"query": sample_question})
    print(f"Question: {sample_question}")
    print(f"Answer: {result['result']}")
    print(f"Source Documents: {[doc.page_content for doc in result['source_documents']]}")

except Exception as e:
    # Failures are reported as a one-line message rather than a traceback.
    print(f"Error in main execution: {e}")

Path to dataset files: /kaggle/input/simple-dialogs-for-chatbot
Files in dataset directory:
 - dialogs.txt
Content: i'm fine. how about yourself?...
Metadata: {'question': 'hi, how are you doing?'}

Content: i'm pretty good. thanks for asking....
Metadata: {'question': "i'm fine. how about yourself?"}

Content: no problem. so how have you been?...
Metadata: {'question': "i'm pretty good. thanks for asking."}



  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
  result = rag_chain({"query": sample_question})
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: How are you doing?
Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

how are you doing that?

i'm doing well. how about you?

what were you doing?

Question: How are you doing?
Helpful Answer: Good evening. Your mom is always here to give advice and advice as well as my father, but I have always thought it would be best if both of them could do what they do best while both of us were around. Asking help and guidance
Source Documents: ['how are you doing that?', "i'm doing well. how about you?", 'what were you doing?']


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode an access token in a notebook: it ends up in version control
# and in every shared copy. Any token that was previously pasted into this cell
# must be treated as compromised and revoked at
# https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable when set; otherwise
# it is requested interactively without being echoed or stored in the notebook.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
def run_chatbot(rag_chain):
    """
    Run an interactive chatbot using the RAG pipeline.

    Reads questions from stdin until the user types ``exit`` (case-insensitive,
    surrounding whitespace ignored) or stdin reaches end-of-file. Blank input is
    ignored instead of being sent to the model. For every question the answer
    and the retrieved source documents are printed; an error while answering is
    reported and the loop continues with the next question.

    Args:
        rag_chain: Configured RetrievalQA chain. It is queried through its
            ``invoke`` method when available; otherwise the object itself is
            called. Either way it receives ``{"query": <question>}`` and must
            return a mapping with ``"result"`` and ``"source_documents"``.
    """
    # Chain.__call__ is deprecated in LangChain in favour of invoke(); fall back
    # to a plain call so older chains and simple callables keep working.
    ask = getattr(rag_chain, "invoke", None) or rag_chain

    print("Chatbot is running. Type 'exit' to stop.")
    while True:
        try:
            question = input("Enter your question: ").strip()
        except EOFError:
            # stdin was closed: leave the loop cleanly instead of crashing.
            print("Exiting chatbot.")
            break
        if not question:
            # Nothing to ask; do not spend a model call on an empty query.
            continue
        if question.lower() == 'exit':
            print("Exiting chatbot.")
            break
        try:
            result = ask({"query": question})
            print(f"Answer: {result['result']}")
            print("Source Documents:")
            for doc in result['source_documents']:
                print(f" - {doc.page_content} (Question: {doc.metadata['question']})")
        except Exception as e:
            # Keep the session alive when a single question fails.
            print(f"Error answering question: {e}")

# Main execution: load the dataset, build the RAG chain, then start the chat loop.
try:
    txt_path = download_dataset()
    documents = load_dataset(txt_path)

    # Sanity check: show the response text (truncated to 100 characters) and
    # the metadata of the first few records.
    for doc in documents[:3]:
        print(
            f"Content: {doc.page_content[:100]}...",
            f"Metadata: {doc.metadata}\n",
            sep="\n",
        )

    rag_chain = setup_rag_pipeline(documents)
    run_chatbot(rag_chain)
except Exception as exc:
    print(f"Error in main execution: {exc}")

Path to dataset files: /kaggle/input/simple-dialogs-for-chatbot
Files in dataset directory:
 - dialogs.txt
Content: i'm fine. how about yourself?...
Metadata: {'question': 'hi, how are you doing?'}

Content: i'm pretty good. thanks for asking....
Metadata: {'question': "i'm fine. how about yourself?"}

Content: no problem. so how have you been?...
Metadata: {'question': "i'm pretty good. thanks for asking."}



Device set to use cpu


Chatbot is running. Type 'exit' to stop.
Enter your question: hello bro wassup


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

i'm actually in school right now.

see you soon.

tell her i said hello.

Question: hello bro wassup
Helpful Answer: i did not use my first name in the conversation

In this post i'm using my first name and address. My third name is as opposed to my address and i used it on purpose to remind myself that i did not use your first name
Source Documents:
 - i'm actually in school right now. (Question: i've actually been pretty good. you?)
 - see you soon. (Question: i'll talk to you later.)
 - tell her i said hello. (Question: it's to my mom.)
Enter your question: hey how are you


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

i'm doing well. how about you?

i'm fine. how about yourself?

so how have you been lately?

Question: hey how are you
Helpful Answer: Well I was looking up the new version of the game called Fire Emblem. And here is what I found. I read about it on reddit a few days ago and it was a great read. First up, I said that I am sorry so please
Source Documents:
 - i'm doing well. how about you? (Question: how's it going?)
 - i'm fine. how about yourself? (Question: hi, how are you doing?)
 - so how have you been lately? (Question: never better, thanks.)
Enter your question: exit
Exiting chatbot.
