In [None]:
!pip install transformers torch pandas nltk rouge-score tqdm

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c00509a11c05f45af2324e53e49812cdac0fb745b186b9c760f05d7a9599520a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    GPT2Config
)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import logging
import time
from collections import defaultdict

In [None]:
class ConversationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.conversations = self.process_conversations(data)
        self.max_length = max_length

    def process_conversations(self, data):
        processed = []
        current_conv = []
        current_id = None

        # Sort by conversation_id to group messages
        data = data.sort_values(['conversation_id'])

        for _, row in data.iterrows():
            if current_id != row['conversation_id']:
                if current_conv:
                    processed.append(self.format_conversation(current_conv))
                current_conv = []
                current_id = row['conversation_id']
            current_conv.append(row['message'])

        # Add last conversation
        if current_conv:
            processed.append(self.format_conversation(current_conv))

        return processed

    def format_conversation(self, messages):
        # Format conversation with special tokens
        formatted = ""
        for i, message in enumerate(messages):
            if i % 2 == 0:
                formatted += f"<|user|>{message}<|endoftext|>"
            else:
                formatted += f"<|assistant|>{message}<|endoftext|>"
        return formatted

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        conversation = self.conversations[idx]

        # Encode the conversation
        encodings = self.tokenizer(
            conversation,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': encodings['input_ids'].squeeze()
        }

In [None]:
class TopicalChatbot(nn.Module):
    def __init__(self, model_name='gpt2'):
        super(TopicalChatbot, self).__init__()
        self.config = GPT2Config.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        # Add special tokens
        self.special_tokens = {
            'additional_special_tokens': ['<|user|>', '<|assistant|>', '<|endoftext|>']
        }

    def forward(self, input_ids, attention_mask=None, labels=None):
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

In [None]:

class ChatbotMetrics:
    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.reset_metrics()

    def reset_metrics(self):
        self.metrics = {
            'response_accuracy': [],
            'bleu_scores': [],
            'rouge_scores': defaultdict(list),
            'response_time': [],
            'completion_rate': 0,
            'fallback_rate': 0,
            'total_conversations': 0,
            'successful_conversations': 0
        }

    def update_metrics(self, predicted_response, actual_response, response_time):
        # BLEU score
        bleu = sentence_bleu([actual_response.split()], predicted_response.split())
        self.metrics['bleu_scores'].append(bleu)

        # ROUGE scores
        rouge_scores = self.rouge_scorer.score(predicted_response, actual_response)
        for key, score in rouge_scores.items():
            self.metrics['rouge_scores'][key].append(score.fmeasure)

        # Response time
        self.metrics['response_time'].append(response_time)

        # Update conversation counts
        self.metrics['total_conversations'] += 1
        if bleu > 0.1:  # Simple threshold for "successful" response
            self.metrics['successful_conversations'] += 1

    def get_metrics_summary(self):
        completion_rate = (self.metrics['successful_conversations'] /
                         max(1, self.metrics['total_conversations']))

        return {
            'avg_bleu': np.mean(self.metrics['bleu_scores']),
            'avg_rouge1': np.mean(self.metrics['rouge_scores']['rouge1']),
            'avg_rouge2': np.mean(self.metrics['rouge_scores']['rouge2']),
            'avg_rougeL': np.mean(self.metrics['rouge_scores']['rougeL']),
            'avg_response_time': np.mean(self.metrics['response_time']),
            'completion_rate': completion_rate,
            'fallback_rate': 1 - completion_rate
        }

def train_chatbot(model, train_loader, val_loader, tokenizer, device,
                 num_epochs=3, learning_rate=2e-5, max_grad_norm=1.0):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    metrics = ChatbotMetrics()
    best_val_loss = float('inf')

    # Create scheduler
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        # Training loop
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            start_time = time.time()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            response_time = time.time() - start_time

            loss = outputs.loss
            total_train_loss += loss.item()

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

            # Update metrics
            if 'logits' in outputs:
                predicted_tokens = torch.argmax(outputs.logits, dim=-1)
                predicted_text = tokenizer.decode(predicted_tokens[0])
                actual_text = tokenizer.decode(labels[0])
                metrics.update_metrics(predicted_text, actual_text, response_time)

            # Update progress bar
            progress_bar.set_postfix({'train_loss': loss.item()})

        # Validation loop
        model.eval()
        total_val_loss = 0
        val_metrics = ChatbotMetrics()

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                start_time = time.time()
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                response_time = time.time() - start_time

                loss = outputs.loss
                total_val_loss += loss.item()

                # Update validation metrics
                if 'logits' in outputs:
                    predicted_tokens = torch.argmax(outputs.logits, dim=-1)
                    predicted_text = tokenizer.decode(predicted_tokens[0])
                    actual_text = tokenizer.decode(labels[0])
                    val_metrics.update_metrics(predicted_text, actual_text, response_time)

        # Print epoch summary
        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)
        train_metrics = metrics.get_metrics_summary()
        validation_metrics = val_metrics.get_metrics_summary()

        print(f'\nEpoch {epoch+1} Summary:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')
        print('\nTraining Metrics:')
        for metric, value in train_metrics.items():
            print(f'{metric}: {value:.4f}')
        print('\nValidation Metrics:')
        for metric, value in validation_metrics.items():
            print(f'{metric}: {value:.4f}')

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
                'metrics': validation_metrics
            }, 'best_topical_chatbot.pth')

    return metrics

def setup_and_train():
    # Load data
    print("Loading dataset...")
    df = pd.read_csv('/content/topical_chat.csv')

    # Split data
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # Initialize tokenizer and model
    print("Initializing model and tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    model = TopicalChatbot()

    # Add special tokens
    tokenizer.add_special_tokens(model.special_tokens)
    model.model.resize_token_embeddings(len(tokenizer))

    # Create datasets
    train_dataset = ConversationDataset(train_df, tokenizer)
    val_dataset = ConversationDataset(val_df, tokenizer)
    test_dataset = ConversationDataset(test_df, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=4)
    test_loader = DataLoader(test_dataset, batch_size=4)

    # Training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    print("\nStarting training...")
    metrics = train_chatbot(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        tokenizer=tokenizer,
        device=device
    )

    return model, tokenizer, test_loader, metrics

def generate_response(model, tokenizer, user_input, max_length=100):
    # Format input
    formatted_input = f"<|user|>{user_input}<|endoftext|><|assistant|>"

    # Encode input
    input_ids = tokenizer.encode(formatted_input, return_tensors='pt')

    # Generate response
    start_time = time.time()
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.encode('<|endoftext|>')[0]
    )
    response_time = time.time() - start_time

    # Decode response
    response = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

    return response, response_time



In [None]:
# Train the model
model, tokenizer, test_loader, metrics = setup_and_train()



Loading dataset...
Initializing model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`



Starting training...


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 1/3: 100%|██████████| 2157/2157 [25:11<00:00,  1.43it/s, train_loss=1.88]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Validation: 100%|██████████| 2090/2090 [09:21<00:00,  3.72it/s]



Epoch 1 Summary:
Average training loss: 2.5133
Average validation loss: 0.9349

Training Metrics:
avg_bleu: 0.0326
avg_rouge1: 0.6769
avg_rouge2: 0.3964
avg_rougeL: 0.5300
avg_response_time: 0.0126
completion_rate: 0.0130
fallback_rate: 0.9870

Validation Metrics:
avg_bleu: 0.0213
avg_rouge1: 0.8707
avg_rouge2: 0.8142
avg_rougeL: 0.8478
avg_response_time: 0.0096
completion_rate: 0.0474
fallback_rate: 0.9526


Epoch 2/3: 100%|██████████| 2157/2157 [25:06<00:00,  1.43it/s, train_loss=2.52]
Validation: 100%|██████████| 2090/2090 [09:20<00:00,  3.73it/s]



Epoch 2 Summary:
Average training loss: 2.3200
Average validation loss: 1.0368

Training Metrics:
avg_bleu: 0.0414
avg_rouge1: 0.6861
avg_rouge2: 0.4041
avg_rougeL: 0.5372
avg_response_time: 0.0122
completion_rate: 0.0389
fallback_rate: 0.9611

Validation Metrics:
avg_bleu: 0.0250
avg_rouge1: 0.8628
avg_rouge2: 0.8062
avg_rougeL: 0.8404
avg_response_time: 0.0092
completion_rate: 0.0732
fallback_rate: 0.9268


Epoch 3/3: 100%|██████████| 2157/2157 [25:07<00:00,  1.43it/s, train_loss=2.19]
Validation: 100%|██████████| 2090/2090 [09:20<00:00,  3.73it/s]


Epoch 3 Summary:
Average training loss: 2.2731
Average validation loss: 0.9782

Training Metrics:
avg_bleu: 0.0464
avg_rouge1: 0.6911
avg_rouge2: 0.4093
avg_rougeL: 0.5424
avg_response_time: 0.0120
completion_rate: 0.0626
fallback_rate: 0.9374

Validation Metrics:
avg_bleu: 0.0279
avg_rouge1: 0.8729
avg_rouge2: 0.8165
avg_rougeL: 0.8505
avg_response_time: 0.0092
completion_rate: 0.0885
fallback_rate: 0.9115





In [None]:
def generate_response(model, tokenizer, user_input, max_length=100):
    # Format input
    formatted_input = f"<|user|>{user_input}<|endoftext|><|assistant|>"

    # Encode input
    input_ids = tokenizer.encode(formatted_input, return_tensors='pt')

    # Ensure input tensor is on the same device as the model
    device = next(model.parameters()).device
    input_ids = input_ids.to(device)

    # Generate response
    start_time = time.time()
    output_sequences = model.model.generate(  # Use `model.model` to access GPT2's `generate`
        input_ids=input_ids,
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.encode('<|endoftext|>')[0]
    )
    response_time = time.time() - start_time

    # Decode response
    response = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

    return response, response_time


In [None]:
import torch
from transformers import GPT2Tokenizer

def load_chatbot(model_path, tokenizer_name='gpt2'):
    # Load the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Initialize the model
    model = TopicalChatbot()

    # Add special tokens to the tokenizer
    tokenizer.add_special_tokens(model.special_tokens)

    # Resize token embeddings to accommodate special tokens
    model.model.resize_token_embeddings(len(tokenizer))

    # Load the saved model state
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])

    # Move the model to the appropriate device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    return model, tokenizer

def generate_response(model, tokenizer, user_input, max_length=100):
    # Format input
    formatted_input = f"<|user|>{user_input}<|endoftext|><|assistant|>"

    # Encode input
    input_ids = tokenizer.encode(formatted_input, return_tensors='pt')

    # Ensure input tensor is on the same device as the model
    device = next(model.parameters()).device
    input_ids = input_ids.to(device)

    # Generate response
    start_time = time.time()
    output_sequences = model.model.generate(  # Use `model.model` to access GPT2's `generate`
        input_ids=input_ids,
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.encode('<|endoftext|>')[0]
    )
    response_time = time.time() - start_time

    # Decode response
    response = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

    return response, response_time

def test_chatbot(model, tokenizer):
    model.eval()
    test_inputs = [
        "What do you think about artificial intelligence?",
        "Tell me about climate change.",
        "What's your favorite book?",
        "How does machine learning work?"
    ]

    print("\nTesting chatbot with example inputs:")
    for test_input in test_inputs:
        response, response_time = generate_response(model, tokenizer, test_input)
        print(f"\nUser: {test_input}")
        print(f"Assistant: {response}")
        print(f"Response time: {response_time:.2f} seconds")

# Load the model and tokenizer
model_path = "best_topical_chatbot.pth"  # Path to the saved model checkpoint
model, tokenizer = load_chatbot(model_path)

# Run the test
test_chatbot(model, tokenizer)


  checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Testing chatbot with example inputs:

User: What do you think about artificial intelligence?
Assistant: What do you think about artificial intelligence? I think it's a good idea. I think it's a good idea to have a computer that can read and understand the language of the human race.
Response time: 0.54 seconds

User: Tell me about climate change.
Assistant: Tell me about climate change. I think it's a good idea. I think it's a good idea to have a backup plan.
Response time: 0.20 seconds

User: What's your favorite book?
Assistant: What's your favorite book? I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it's a good book. I think it
Response time: 0.77 seconds

User: How does machine learning work?
Assistant: How does machine learning work? I think it's a good idea to 