In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
class Chatbot:
    def __init__(self, model_name, tokenizer_name, data_file):
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.data_file = data_file
        self.model = None
        self.tokenizer = None
        self.dataset = None
        self.vectorizer = None
        self.tfidf_matrix = None

    def load_dataset(self):
        self.dataset = pd.read_csv(self.data_file)
        self.dataset.fillna('', inplace=True)

    def load_model(self):
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.tokenizer_name)
        self.model = GPT2LMHeadModel.from_pretrained(self.model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def train(self):
        self.load_dataset()
        self.load_model()

        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.dataset['question'])

        inputs = self.dataset['question'].tolist()
        responses = self.dataset['answer'].tolist()

        input_tokens = self.tokenizer.batch_encode_plus(inputs, padding=True, truncation=True, return_tensors='pt')
        response_tokens = self.tokenizer.batch_encode_plus(responses, padding=True, truncation=True, return_tensors='pt')

        input_ids = input_tokens['input_ids']
        attention_mask = input_tokens['attention_mask']
        target_ids = response_tokens['input_ids']

        self.model.train()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)

        for epoch in range(3):
            total_loss = 0
            for i in range(len(input_ids)):
                input_batch = input_ids[i].unsqueeze(0).to(device)
                attention_mask_batch = attention_mask[i].unsqueeze(0).to(device)
                target_batch = target_ids[i].unsqueeze(0).to(device)

                optimizer.zero_grad()

                outputs = self.model(input_ids=input_batch, attention_mask=attention_mask_batch, labels=target_batch)
                loss = outputs.loss

                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch+1}: Average Loss = {total_loss / len(input_ids)}")

    def find_match(self, user_input):
        user_tfidf = self.vectorizer.transform([user_input])
        cosine_similar = cosine_similarity(user_tfidf, self.tfidf_matrix)

        match_index = np.argmax(cosine_similar)
        max_similarity = cosine_similar[0, match_index]

        if max_similarity < 0.7:
            return None
        else:
            return self.dataset.loc[match_index, 'answer']

    def respond(self, user_input):
        match = self.find_match(user_input)
        if match:
            return match

        self.model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

        input_tokens = self.tokenizer.encode(user_input, add_special_tokens=True, return_tensors='pt').to_device(device)

        with torch.no_grad():
            output = self.model.generate(input_tokens, max_length=100, num_return_sequences=1)

        response = self.tokenizer.decode(output[0], skip_special_tokens=True)
        return response

In [3]:
chatbot = Chatbot('microsoft/DialoGPT-medium', 'microsoft/DialoGPT-medium', 'chatbot.csv')
chatbot.train()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1: Average Loss = 2.3139398997422034
Epoch 2: Average Loss = 1.9176429616524868
Epoch 3: Average Loss = 1.4357763202518425


In [5]:
while True:
    user_input = input("Пользователь: ")

    if user_input.lower() == 'выход':
        break
    else:
        response = chatbot.respond(user_input)
        print("Бот: " + response)

Пользователь: What is your email?
Бот: it's bluedog123.
Пользователь: What is up?
Бот: nothing, how about you?
Пользователь: What do you like doing?
Бот: working.
Пользователь: выход
