In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from ipywidgets import widgets
from IPython.display import display, clear_output

# Checkpoint shared by the tokenizer and the model; naming it once keeps the
# two loads below pointing at the same weights and vocabulary.
MODEL_NAME = "microsoft/DialoGPT-medium"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


### Define chat function 

In [2]:
def chat(user_input, chat_history_ids=None, debug=False):
    """Generate one reply to ``user_input`` and return the updated conversation.

    Args:
        user_input: Text of the user's new message.
        chat_history_ids: Token ids of the conversation so far, as returned by
            a previous call, or None to start a new conversation.
        debug: When True, print the encoded and decoded conversation history
            after generation. Off by default so normal use stays quiet.

    Returns:
        A ``(chat_history_ids, response)`` tuple: the token ids of the whole
        conversation including the new reply, and the reply decoded to text.
    """
    # Encode the new message; the EOS token marks the end of the user's turn.
    new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    # Append the new message to the running history, if there is one.
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Nothing in the input is padding, so every position is attended to.
    attention_mask = torch.ones_like(bot_input_ids)

    # NOTE: max_length bounds prompt + reply together, so a history that
    # approaches 1000 tokens leaves little or no room for a new reply.
    chat_history_ids = model.generate(
        bot_input_ids,
        attention_mask=attention_mask,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8
    )

    # The reply is whatever was generated after the tokens we passed in.
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

    if debug:
        # Full conversation (user turns and bot replies) as ids and as text.
        print(f"Encoded History: {chat_history_ids.tolist()}")
        print(f"Decoded History: {tokenizer.decode(chat_history_ids[0], skip_special_tokens=True)}")

    return chat_history_ids, response

### Create interactive UI 

In [3]:
# Set up the interface elements
user_input = widgets.Text(value='', placeholder='Type something', description='You:', disabled=False)
send_button = widgets.Button(description="Send")
output = widgets.Output()
# Token ids of the conversation so far; stays None until the first message is sent.
dialogue = None

# Define event when the Send button is clicked
def on_send_button_clicked(b):
    """Send the text box contents to the chatbot and show the latest exchange.

    Args:
        b: The button instance that fired the click (unused).

    Updates the module-level ``dialogue`` with the history returned by
    ``chat`` so the next message continues the same conversation.
    """
    global dialogue
    # Read the widget once so the echoed text and the text sent to the model match.
    message = user_input.value
    # Ignore clicks on an empty box: they would wipe the displayed exchange and
    # add an empty turn to the conversation history.
    if not message.strip():
        return
    with output:
        # Only the most recent exchange is shown.
        clear_output()
        # Output the user's message
        print(f"You: {message}")
        # Call the chat function with user input and existing chat history
        dialogue, bot_response = chat(message, dialogue)
        # Output the model's response
        print(f"DialoGPT: {bot_response}")
        # Clear the input for the next message
        user_input.value = ''

send_button.on_click(on_send_button_clicked)

# Display elements
display(user_input, send_button, output)

Text(value='', description='You:', placeholder='Type something')

Button(description='Send', style=ButtonStyle())

Output()

In [16]:
# Preview the all-ones attention mask for the current history. `dialogue` is
# None until a message has been sent, so guard against running this cell first.
torch.ones_like(dialogue) if dialogue is not None else None

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])

## Chatbot Context Truncation

Language models have a maximum sequence length they can process. For example, GPT-2's maximum context size is 1,024 tokens (DialoGPT is built on GPT-2 and inherits this limit), and other models have different limits. If we keep appending user inputs and bot responses to the conversation history indefinitely, we will eventually exceed the model's maximum input size.

To avoid exceeding the maximum length, we must truncate the conversation history, keeping only the most recent part of the conversation and dropping the oldest tokens. The code snippet below shows one way to do this truncation so that the conversation stays within the model's limits.

In [17]:
def chat_w_truncation(user_input, chat_history_ids=None, reserved_response_tokens=128):
    """Generate a reply while keeping the conversation inside the context window.

    The oldest tokens are dropped when the history plus the new message would
    not leave ``reserved_response_tokens`` positions free for the reply.

    Args:
        user_input: Text of the user's new message.
        chat_history_ids: Token ids of the conversation so far, as returned by
            a previous call, or None to start a new conversation.
        reserved_response_tokens: Number of context positions kept free for
            the generated reply. Must be positive and smaller than the
            model's context size.

    Returns:
        A ``(chat_history_ids, response)`` tuple: the token ids of the
        (possibly truncated) conversation including the new reply, and the
        reply decoded to text.

    Raises:
        ValueError: If ``reserved_response_tokens`` is not between 1 and the
            model's context size minus 1.
    """
    # Tokenize new user input; the EOS token marks the end of the user's turn.
    new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    # Combine new input with chat history (if available)
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Context size of the model (e.g., 1024 for GPT-2). Not every config
    # defines `n_ctx`, so fall back to `max_position_embeddings`.
    max_length = getattr(model.config, "n_ctx", None) or model.config.max_position_embeddings

    if not 0 < reserved_response_tokens < max_length:
        raise ValueError(
            f"reserved_response_tokens must be between 1 and {max_length - 1}, "
            f"got {reserved_response_tokens}"
        )

    # `max_length` in generate() counts prompt + reply, so the prompt must be
    # strictly shorter than it; otherwise there is no room left for a reply.
    max_input_length = max_length - reserved_response_tokens
    if bot_input_ids.shape[-1] > max_input_length:
        # Keep the most recent tokens; the oldest part of the history is dropped.
        bot_input_ids = bot_input_ids[:, -max_input_length:]

    # Nothing in the input is padding, so every position is attended to.
    attention_mask = torch.ones_like(bot_input_ids)

    # Generate response using model.generate
    chat_history_ids = model.generate(
        bot_input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id
        # Include other generate parameters as needed...
    )

    # The reply is whatever was generated after the tokens we passed in.
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

    return chat_history_ids, response


### Chatbot Old Context Summarization

To avoid losing potentially relevant information through truncation, we can summarize the older parts of the conversation history instead. Summarization compresses the extended dialogue into a shorter form, retaining the key points. This can be helpful for maintaining context without running into the token limit of the model.

Implementing summarization typically involves using a dedicated summarization model that condenses the older context into a shorter version and using this summarized context as a base for further generation. The summarization model should be capable of understanding the narrative of a dialogue and extracting the main topics or intents.