In [1]:
%pip install transformers



In [22]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [23]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
class Chatbot:
    def __init__(self, model_name, tokenizer_name, data_file):
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.data_file = data_file
        self.model = None
        self.tokenizer = None
        self.dataset = None
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def load_dataset(self):
        self.dataset = pd.read_csv(self.data_file)

    def preprocess_text(self, text):
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()
        text = ' '.join([word for word in text.split() if word not in self.stop_words])
        text = ' '.join([self.lemmatizer.lemmatize(word) for word in text.split()])
        return text

    def load_model(self):
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.tokenizer_name)
        self.model = GPT2LMHeadModel.from_pretrained(self.model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def train(self):
        self.load_dataset()
        self.load_model()

        inputs = self.dataset['question'].tolist()
        responses = self.dataset['answer'].tolist()

        input_tokens = self.tokenizer.batch_encode_plus([self.preprocess_text(text) for text in inputs], padding=True, truncation=True, return_tensors='pt')
        response_tokens = self.tokenizer.batch_encode_plus([self.preprocess_text(text) for text in responses], padding=True, truncation=True, return_tensors='pt')

        input_ids = input_tokens['input_ids']
        attention_mask = input_tokens['attention_mask']
        target_ids = response_tokens['input_ids']

        self.model.train()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)

        num_epochs = 5
        for epoch in range(num_epochs):
            total_loss = 0
            for i in range(len(input_ids)):
                input_batch = input_ids[i].unsqueeze(0).to(device)
                attention_mask_batch = attention_mask[i].unsqueeze(0).to(device)
                target_batch = target_ids[i].unsqueeze(0).to(device)

                optimizer.zero_grad()

                outputs = self.model(input_ids=input_batch, attention_mask=attention_mask_batch, labels=target_batch)
                loss = outputs.loss

                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            avg_loss = total_loss / len(input_ids)
            print(f"Epoch {epoch+1}: Average Loss = {avg_loss}")

    def respond(self, user_input):
        try:
            input_text = self.preprocess_text(user_input)
            input_ids = self.tokenizer.encode(input_text, return_tensors='pt')
            output_ids = self.model.generate(
                    input_ids,
                    max_length=300,
                    num_beams=5,
                    no_repeat_ngram_size=3,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    bos_token_id=self.tokenizer.bos_token_id,
                )

            response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            return response
        except:
            return "I'm sorry, I don't have an answer for that."

In [14]:
if __name__ == '__main__':
    chatbot = Chatbot('microsoft/DialoGPT-medium', 'microsoft/DialoGPT-medium', 'chatbot.csv')
    chatbot.train()

Epoch 1: Average Loss = 2.392216528869115
Epoch 2: Average Loss = 1.9270132882583062
Epoch 3: Average Loss = 1.7246114112426771
Epoch 4: Average Loss = 1.4437235058654074
Epoch 5: Average Loss = 1.2070866096959818


In [28]:
print("Hello! I'm Бот. How can I help you?")

while True:
    user_input = input("Пользователь: ")

    if user_input.lower() == 'выход':
        break
    else:
        response = chatbot.respond(user_input)
        print(f"Бот: {response}")

Hello! I'm Бот. How can I help you?
Пользователь: What is the meaning of life?
Бот: I'm sorry, I don't have an answer for that.
Пользователь: выход
