<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/Chatbot/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade torch torchvision transformers


In [None]:
import pandas as pd
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
import torch
import faiss

# Read the three preprocessed Excel sheets: hotels, attractions, restaurants.
excel_paths = (
    '/content/preprocessed_hotel_data.xlsx',
    '/content/processed_data_Attractions_Colombo.xlsx',
    '/content/preprocessed_colombo_restaurant.xlsx',
)
hotels_df, activities_df, restaurants_df = [pd.read_excel(path) for path in excel_paths]

# Stack the sheets into one frame with a fresh index, then fill the gaps:
# missing descriptions become '' and missing categories become 'unknown'.
# NOTE(review): this assumes every sheet has 'description' and 'category'
# columns (a 'name' column is also expected by the original comment) — confirm
# against the actual spreadsheets; a missing column raises KeyError here.
data_df = pd.concat(
    [hotels_df, activities_df, restaurants_df],
    ignore_index=True,
).assign(
    description=lambda frame: frame['description'].fillna(''),
    category=lambda frame: frame['category'].fillna('unknown'),
)

# One document per row: the description column as a plain Python list.
documents = data_df['description'].to_list()


In [None]:
# A single checkpoint id feeds the tokenizer, the retriever and the generator.
rag_checkpoint = "facebook/rag-token-nq"

# Tokenizer for the RAG checkpoint
tokenizer = RagTokenizer.from_pretrained(rag_checkpoint)

# Retriever over our own documents.
# NOTE(review): the RagRetriever documentation describes `indexed_dataset` as a
# `datasets.Dataset` with "title", "text" and "embeddings" columns plus a
# search index. `documents` is a plain list of description strings — confirm
# this call succeeds, or build the indexed dataset before passing it in.
retriever = RagRetriever.from_pretrained(rag_checkpoint, indexed_dataset=documents)

# Generator model wired to the retriever above
model = RagTokenForGeneration.from_pretrained(rag_checkpoint, retriever=retriever)


In [None]:
from transformers import Trainer, TrainingArguments

# Prepare data for training
class RagDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    `encodings` is a mapping from field name to a per-example sequence
    (anything exposing `.items()`); `labels` is a sequence with one entry per
    example. The dataset length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the labels sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Written last, so it overrides any 'labels' field in the encodings.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Tokenize descriptions (padded to a common length, truncated to the model limit)
train_encodings = tokenizer(documents, truncation=True, padding=True)

# Prepare labels (assuming binary classification for simplicity): 1 for rows
# whose category is exactly 'hotel', 0 for everything else.
# NOTE(review): RagTokenForGeneration is a sequence-generation model whose
# documentation describes `labels` as target token ids — confirm that scalar
# 0/1 class labels are really what this training setup should feed it.
labels = [1 if category == 'hotel' else 0 for category in data_df['category']]

# Create dataset
dataset = RagDataset(train_encodings, labels)

# Hold out a slice for evaluation so that a later `trainer.evaluate()` has an
# eval dataset to work with. The split is seeded to be reproducible on re-run.
EVAL_FRACTION = 0.1
SPLIT_SEED = 42
n_examples = len(dataset)
# Keep at least one eval example when possible; with fewer than two examples
# there is nothing to split, so train on everything and leave eval unset.
n_eval = max(1, int(n_examples * EVAL_FRACTION)) if n_examples > 1 else 0
if n_eval:
    train_dataset, eval_dataset = torch.utils.data.random_split(
        dataset,
        [n_examples - n_eval, n_eval],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )
else:
    train_dataset, eval_dataset = dataset, None

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer with both the training and the held-out evaluation split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


In [None]:
# Train the model
trainer.train()

# Evaluate the model.
# Trainer.evaluate() raises ValueError when no evaluation dataset is available,
# so fall back to the training data if the Trainer was built without one.
# Metrics computed on the training data are optimistic — treat them as a
# smoke test, not as a measure of generalization.
evaluation_data = (
    trainer.eval_dataset if trainer.eval_dataset is not None else trainer.train_dataset
)
eval_results = trainer.evaluate(eval_dataset=evaluation_data)
print(f'Evaluation Results: {eval_results}')


In [None]:
# Example prediction
sample_description = "A luxurious 5-star hotel with a beautiful view"

# Training may have moved the model to an accelerator; put the inputs on the
# same device as the model to avoid a device-mismatch error in generate().
inputs = tokenizer(sample_description, return_tensors='pt').to(model.device)

# Inference mode: disable dropout and other train-time behaviour.
model.eval()

# Generate response. Only the arguments generate() documents are passed, in
# case the tokenizer also returns extra fields (e.g. token_type_ids).
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )

# Decode the generated response
generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f'Generated Response: {generated_response}')
