**Virtual AI Assistant: Project**
A basic chatbot implementation built with the Hugging Face `transformers` library.

In [1]:
!pip install transformers
!pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

**Step 1: Import Libraries**

In [16]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset


**Step 2: Load and Preprocess Dataset**
For this example, we use the `daily_dialog` dataset, loaded through Hugging Face's `datasets` library.

In [23]:
# Tokenizer for the DialoGPT-small checkpoint; the EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token

# Fetch the daily_dialog dataset through the datasets library
dataset = load_dataset("daily_dialog")

# Preprocess the dataset to tokenize the dialog
def preprocess_data(examples):
    """Tokenize every utterance of every dialog in a batch.

    Args:
        examples: A batch mapping whose 'dialog' entry is an iterable of
            dialogs, each dialog being an iterable of utterance strings.

    Returns:
        A dict with 'input_ids' and 'attention_mask'. Each holds one list per
        dialog, which in turn holds one tokenizer result per utterance
        (requested with padding/truncation to a max_length of 128).
    """
    def encode_utterance(text):
        # Every utterance is encoded on its own with fixed-length padding.
        return tokenizer(text, padding='max_length', truncation=True, max_length=128)

    batch_ids, batch_masks = [], []
    for turns in examples['dialog']:
        encoded_turns = [encode_utterance(turn) for turn in turns]
        # One entry per dialog: the per-utterance encodings stay nested.
        batch_ids.append([enc['input_ids'] for enc in encoded_turns])
        batch_masks.append([enc['attention_mask'] for enc in encoded_turns])
    return {'input_ids': batch_ids, 'attention_mask': batch_masks}

# Run preprocess_data over the dataset in batches (batched=True) and remove
# the raw "dialog" column from the result.
tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=["dialog"])


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

**Step 3: Setup the Model**

In [24]:
# Load the pretrained DialoGPT-small causal language model
# (same checkpoint name as the one used for the tokenizer).
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")


config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

**Step 4: Define a Simple Dialogue Flow**

In [25]:
# Function to generate a response
def generate_response(input_text):
    """Generate a single-turn chatbot reply to ``input_text``.

    The user text is terminated with the tokenizer's EOS token and passed to
    the module-level ``model``. Only the tokens generated after the prompt
    are decoded and returned.

    Args:
        input_text: The user's message as a string.

    Returns:
        The decoded reply string with special tokens stripped.
    """
    # Calling the tokenizer (rather than ``encode``) also returns the
    # attention mask. It is passed explicitly because the pad id used below is
    # the EOS id, so ``generate`` may be unable to infer the mask on its own.
    encoded = tokenizer(input_text + tokenizer.eos_token, return_tensors="pt")
    prompt_ids = encoded["input_ids"]
    reply_ids = model.generate(
        prompt_ids,
        attention_mask=encoded["attention_mask"],
        max_length=1000,  # cap on prompt + reply tokens combined
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the echoed prompt; keep only the tokens the model appended.
    prompt_length = prompt_ids.shape[-1]
    return tokenizer.decode(reply_ids[0, prompt_length:], skip_special_tokens=True)


**Step 5: Test the Dialogue Flow**

# Test the chatbot
print("Chatbot: Hello! How can I assist you today?")
while True:
    try:
        # Strip surrounding whitespace so inputs like " bye " still match.
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted kernel: end the chat cleanly
        # instead of surfacing a traceback.
        print("\nChatbot: Goodbye!")
        break
    if not user_input:
        # Nothing typed; ask again rather than generating from an empty prompt.
        continue
    if user_input.lower() in ["exit", "quit", "bye"]:
        print("Chatbot: Goodbye!")
        break
    response = generate_response(user_input)
    print(f"Chatbot: {response}")


In [29]:
# Test the chatbot
print("Chatbot: Hello! How can I assist you today?")
while True:
    try:
        # Strip surrounding whitespace so inputs like " bye " still match.
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted kernel: end the chat cleanly
        # instead of surfacing a traceback.
        print("\nChatbot: Goodbye!")
        break
    if not user_input:
        # Nothing typed; ask again rather than generating from an empty prompt.
        continue
    if user_input.lower() in ["exit", "quit", "bye"]:
        print("Chatbot: Goodbye!")
        break
    response = generate_response(user_input)
    print(f"Chatbot: {response}")


Chatbot: Hello! How can I assist you today?
You: fine
Chatbot: I'm not sure what you're trying to say.
You: exit
Chatbot: Goodbye!


**Putting It All Together**

In [28]:
# Step 1: Install necessary libraries
# !pip install transformers
# !pip install datasets

# Step 2: Import Libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Step 3: Load and Preprocess Dataset
# Tokenizer for the DialoGPT-small checkpoint; the EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token

# Fetch the daily_dialog dataset through the datasets library
dataset = load_dataset("daily_dialog")

# Preprocess the dataset to tokenize the dialog
def preprocess_data(examples):
    """Tokenize every utterance of every dialog in a batch.

    Args:
        examples: A batch mapping whose 'dialog' entry is an iterable of
            dialogs, each dialog being an iterable of utterance strings.

    Returns:
        A dict with 'input_ids' and 'attention_mask'. Each holds one list per
        dialog, which in turn holds one tokenizer result per utterance
        (requested with padding/truncation to a max_length of 128).
    """
    def encode_utterance(text):
        # Every utterance is encoded on its own with fixed-length padding.
        return tokenizer(text, padding='max_length', truncation=True, max_length=128)

    batch_ids, batch_masks = [], []
    for turns in examples['dialog']:
        encoded_turns = [encode_utterance(turn) for turn in turns]
        # One entry per dialog: the per-utterance encodings stay nested.
        batch_ids.append([enc['input_ids'] for enc in encoded_turns])
        batch_masks.append([enc['attention_mask'] for enc in encoded_turns])
    return {'input_ids': batch_ids, 'attention_mask': batch_masks}

# Run preprocess_data over the dataset in batches (batched=True) and remove
# the raw "dialog" column from the result.
tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=["dialog"])

# Step 4: Setup the Model
# Pretrained DialoGPT-small causal language model (same checkpoint name as the tokenizer).
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

# Step 5: Define a Simple Dialogue Flow
def generate_response(input_text):
    """Generate a single-turn chatbot reply to ``input_text``.

    The user text is terminated with the tokenizer's EOS token and passed to
    the module-level ``model``. Only the tokens generated after the prompt
    are decoded and returned.

    Args:
        input_text: The user's message as a string.

    Returns:
        The decoded reply string with special tokens stripped.
    """
    # Calling the tokenizer (rather than ``encode``) also returns the
    # attention mask. It is passed explicitly because the pad id used below is
    # the EOS id, so ``generate`` may be unable to infer the mask on its own.
    encoded = tokenizer(input_text + tokenizer.eos_token, return_tensors="pt")
    prompt_ids = encoded["input_ids"]
    reply_ids = model.generate(
        prompt_ids,
        attention_mask=encoded["attention_mask"],
        max_length=1000,  # cap on prompt + reply tokens combined
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the echoed prompt; keep only the tokens the model appended.
    prompt_length = prompt_ids.shape[-1]
    return tokenizer.decode(reply_ids[0, prompt_length:], skip_special_tokens=True)

# Step 6: Test the Dialogue Flow
print("Chatbot: Hello! How can I assist you today?")
while True:
    try:
        # Strip surrounding whitespace so inputs like " bye " still match.
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted kernel: end the chat cleanly
        # instead of surfacing a traceback.
        print("\nChatbot: Goodbye!")
        break
    if not user_input:
        # Nothing typed; ask again rather than generating from an empty prompt.
        continue
    if user_input.lower() in ["exit", "quit", "bye"]:
        print("Chatbot: Goodbye!")
        break
    response = generate_response(user_input)
    print(f"Chatbot: {response}")


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Chatbot: Hello! How can I assist you today?
You: predict my name
Chatbot: I'm in
You: good
Chatbot: I'm not sure if you're serious or not but I'm not sure if you're serious.
You: bye
Chatbot: Goodbye!
