In [1]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

Натренированный трансформер

In [2]:
# Load the trained tokenizer and GPT-2 language model from their local checkpoint directories.
my_tokenizer = GPT2Tokenizer.from_pretrained("/content/chatbot_tokenizer")
my_model = GPT2LMHeadModel.from_pretrained("/content/chatbot_model")

# Use the end-of-sequence token id as the padding token id in the model config.
my_model.config.pad_token_id = my_model.config.eos_token_id

TF-IDF

In [3]:
# Load the training split of the prompt/response corpus as a DataFrame.
df = load_dataset('alespalla/chatbot_instruction_prompts')['train'].to_pandas()

# Lowercase both text columns in place.
for text_column in ('response', 'prompt'):
    df[text_column] = df[text_column].str.lower()

# Fit the TF-IDF vocabulary on the prompts and keep their vectorized form
# so user queries can later be compared against every stored prompt.
vector = TfidfVectorizer()
tfidf_matrix = vector.fit_transform(df['prompt'])

Модель для семантического сходства. <br>
В отличие от TF-IDF, который сравнивает тексты по частоте совпадающих слов,<br>
семантическое сходство сравнивает смысл слов и контекст: косинусное сходство считается между эмбеддингами предложений.

In [4]:
# Sentence-embedding model; get_semantic_similarity() encodes texts with it
# before comparing them.
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [5]:
def get_semantic_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with ``semantic_model`` and compared with
    ``util.pytorch_cos_sim``.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The similarity score as a Python float.
    """
    vec_a, vec_b = (
        semantic_model.encode(sentence, convert_to_tensor=True)
        for sentence in (text1, text2)
    )
    return util.pytorch_cos_sim(vec_a, vec_b).item()

In [6]:
def get_response_tf_idf(user_input):
    """Return the stored response whose prompt is closest to ``user_input``.

    The lowercased input is vectorized with the fitted TF-IDF ``vector`` and
    compared by cosine similarity against every row of ``tfidf_matrix``; the
    response at the highest-scoring row of ``df`` is returned.

    Args:
        user_input: Text typed by the user.

    Returns:
        The value of ``df['response']`` at the best-matching row.
    """
    query_vec = vector.transform([user_input.lower()])

    # cosine_similarity yields a single row of scores, one per stored prompt.
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]
    best_row = scores.argmax()

    return df['response'].iloc[best_row]

In [7]:
def get_response_transformer(user_input, max_new_tokens=32):
    """Generate a reply to ``user_input`` with the loaded GPT-2 model.

    The lowercased prompt is tokenized and continued with beam search. Only
    the tokens produced after the prompt are decoded, so the reply never
    depends on string-matching the prompt inside the decoded text.

    Args:
        user_input: Text typed by the user.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. The prompt length does not reduce this budget.

    Returns:
        The generated continuation with surrounding whitespace stripped, or
        an empty string when ``user_input`` is blank.
    """
    text = user_input.lower()
    if not text.strip():
        # A blank prompt gives the model nothing to continue.
        return ''

    # Cap the prompt so prompt + reply fits in the model's positional window.
    prompt_budget = max(1, my_model.config.n_positions - max_new_tokens)
    # A single sequence needs no padding, so no pad token is required here.
    input_ids = my_tokenizer.encode(
        text, return_tensors="pt", truncation=True, max_length=prompt_budget
    )
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output = my_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            no_repeat_ngram_size=2,
        )

    # The generated sequence starts with the prompt tokens; skip them by
    # position instead of removing the prompt text from the decoded string.
    reply_ids = output[0][input_ids.shape[-1]:]
    return my_tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

In [13]:
def get_better_response(user_input):
    """Pick the better of the transformer and TF-IDF responses.

    Both candidate responses are scored by semantic similarity to
    ``user_input``. The name of the winning method is printed (without a
    trailing newline) and the winning response is returned. The TF-IDF
    response is chosen unless the transformer scores strictly higher.

    Args:
        user_input: Text typed by the user.

    Returns:
        The selected response.
    """
    generated = get_response_transformer(user_input)
    retrieved = get_response_tf_idf(user_input)

    generated_score = get_semantic_similarity(user_input, generated)
    retrieved_score = get_semantic_similarity(user_input, retrieved)

    transformer_wins = generated_score > retrieved_score
    label, answer = (
        ('transformer: ', generated) if transformer_wins else ('tf-idf: ', retrieved)
    )
    print(label, end='')
    return answer

In [14]:
# Interactive console chat; the dialogue ends when the user types "bye".
BOT_PREFIX = '\033[92m' + 'ChatBot: ' + '\033[0m'
USER_PREFIX = '\033[91m' + 'User: ' + '\033[0m'

print(BOT_PREFIX, end='')
print('To end the dialogue, print "bye"')
while True:
    try:
        # Strip so that input such as "bye " is still recognised.
        user_input = input(USER_PREFIX).strip()
    except (EOFError, KeyboardInterrupt):
        # Treat a closed input stream or an interrupt like an explicit "bye".
        user_input = 'bye'
    if not user_input:
        # Skip blank lines instead of sending an empty prompt to the models.
        continue
    if user_input.lower() == 'bye':
        print(BOT_PREFIX, end='')
        print('Goodbye!')
        break
    print(BOT_PREFIX, end='')
    print(get_better_response(user_input))

[92mChatBot: [0mTo end the dialogue, print "bye"
[91mUser: [0mwhat are your hobbies?
[92mChatBot: [0mtransformer: I like to play video games, read books and listen to classical music.
[91mUser: [0mwhere do you work?
[92mChatBot: [0mtransformer: I work in a restaurant.
[91mUser: [0mhow can i cook a pizza?
[92mChatBot: [0mtf-idf: step 1: preheat the oven to 375°f (190°c).
step 2: spread a thin layer of tomato sauce on the pizza dough.
step 3: sprinkle your favorite cheese on top of the sauce.
step 4: add toppings of your choice—veggies, pepperoni, sausage, etc. 
step 5: place the pizza in the preheated oven.
step 6: bake for 15-20 minutes or until the cheese is melted and the crust is golden brown. 
step 7: allow the pizza to cool slightly before serving. enjoy.
[91mUser: [0mhow can i stay healthy in winter?
[92mChatBot: [0mtf-idf: 1. make sure to dress warmly in layers when going out in cold weather.
2. avoid going outdoors when the temperature drops significantly.
3. 

Сначала выводит, каким методом был получен ответ, потом сам ответ.