<a href="https://colab.research.google.com/github/noir976/Quora-question-answer/blob/main/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files under it can be read and written by later cells.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#! pip install transformers datasets accelerate peft

In [None]:
import pandas as pd
from datasets import Dataset,load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [None]:
# Download the Quora question/answer dataset and hold out 20% of its "train"
# rows as a test set, then convert both parts to pandas for text cleaning.
# The split shuffles the rows; a fixed seed keeps the same rows in each part
# every time the notebook is restarted and re-run.
SPLIT_SEED = 42
data = load_dataset("toughdata/quora-question-answer-dataset")
data = data["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
train = data["train"].to_pandas()
test = data["test"].to_pandas()

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by the cleaning cells below.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases make
# word_tokenize look up the tabular Punkt data; without it the stop-word
# cells fail with a LookupError on those versions.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
# Drop English stop words from the training questions and answers.
stop_words = set(stopwords.words('english'))


def _without_stop_words(text):
    """Tokenize `text` and re-join, with single spaces, the tokens that are not stop words."""
    kept = [token for token in word_tokenize(text) if token.lower() not in stop_words]
    return ' '.join(kept)


for column in ('question', 'answer'):
    train[column] = train[column].apply(_without_stop_words)

In [None]:
# Drop English stop words from the test questions and answers.
# Each text is tokenized, tokens whose lower-cased form is in `stop_words`
# are discarded, and the remainder is re-joined with single spaces.
for column in ('question', 'answer'):
    test[column] = test[column].apply(
        lambda text: ' '.join(
            token for token in word_tokenize(text) if token.lower() not in stop_words
        )
    )

In [None]:
def _normalize_cell(cell):
    """Lower-case a string cell, then replace every character outside a-z / 0-9 with a space.

    Non-string cells are returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    return re.sub('[^a-z0-9]', ' ', cell.lower())


# Apply the normalisation to every cell of the training frame.
train = train.map(_normalize_cell)

# tokenizing columns
#train = train.map(lambda i: word_tokenize(i) if isinstance(i, str) else i)

In [None]:
# Normalise every string cell of the test frame in two element-wise passes:
#   1. lower-case the text,
#   2. replace each character outside a-z / 0-9 with a space.
# Non-string cells pass through both steps untouched.
test = (
    test
    .map(lambda cell: cell.lower() if isinstance(cell, str) else cell)
    .map(lambda cell: re.sub('[^a-z0-9]', ' ', cell) if isinstance(cell, str) else cell)
)

# tokenizing columns
#test = test.map(lambda i: word_tokenize(i) if isinstance(i, str) else i)

In [None]:
# Convert the cleaned pandas frames back into `datasets.Dataset` objects
# (train first, then test) for tokenization.
train_data, test_data = (Dataset.from_pandas(frame) for frame in (train, test))

In [None]:
# Checkpoint that both the tokenizer and the seq2seq model are loaded from.
model_id = "google/flan-t5-large"

# The tuple is evaluated left to right: tokenizer first, then the model.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_id),
    AutoModelForSeq2SeqLM.from_pretrained(model_id),
)

In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of question/answer pairs into model inputs and labels.

    Args:
        sample: Mapping with "question" and "answer" entries, each passed
            straight to the module-level `tokenizer`.
        padding: Padding strategy forwarded to the tokenizer. When it is
            "max_length", every pad token id in the labels is replaced by
            -100.

    Returns:
        The tokenizer output for the questions (truncated to 256 tokens) with
        an extra "labels" entry holding the answer token ids.
    """
    encoded = tokenizer(sample["question"], max_length=256, padding=padding, truncation=True)
    targets = tokenizer(sample["answer"], max_length=256, padding=padding, truncation=True)

    label_ids = targets["input_ids"]
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        # Swap padding positions for -100 in each label sequence.
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    encoded["labels"] = label_ids
    return encoded

In [None]:
# Tokenize both splits in batches. The original columns are dropped so only
# what `preprocess_function` returns remains.
train_tokenized_dataset = train_data.map(
    preprocess_function,
    batched=True,
    remove_columns=train_data.column_names,
)
test_tokenized_dataset = test_data.map(
    preprocess_function,
    batched=True,
    remove_columns=test_data.column_names,
)

tokenized_keys = list(train_tokenized_dataset.features)
print(f"Keys of tokenized dataset: {tokenized_keys}")

In [None]:
# LoRA adapter settings for the seq2seq model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    # Modules named "q" and "v" receive adapters.
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the loaded model with the LoRA configuration and report how many
# parameters are trainable.
# NOTE(review): `model` is rebound to the wrapped model, so running this cell
# a second time would pass the already-wrapped model back into
# get_peft_model -- re-run the model-loading cell first.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 787,868,672 || trainable%: 0.5989


In [None]:
# Label padding value; it matches the -100 that preprocess_function writes
# over pad tokens in the labels.
label_pad_token_id = -100

# Collator that batches tokenized examples for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
# Log in to the Hugging Face Hub from the notebook; later cells request
# push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Directory (relative to the working directory) that receives checkpoints and logs.
output_dir = "bot"

training_args = Seq2SeqTrainingArguments(
    # Output locations and reporting
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    report_to="tensorboard",
    push_to_hub=True,
    # Optimisation schedule
    learning_rate=1e-3,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    # Log and checkpoint once per epoch
    logging_strategy="epoch",
    save_strategy="epoch",
)

In [None]:
# Build the trainer from the PEFT-wrapped model and the tokenized training
# split. No evaluation dataset is passed here.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_tokenized_dataset,
    data_collator=data_collator,
)

In [None]:
# Turn off the model's generation cache flag before training.
# NOTE(review): presumably done to avoid cache-related warnings during
# training -- confirm, and re-enable it before inference if needed.
model.config.use_cache = False

In [None]:
# Run the fine-tuning loop.
trainer.train()

# Save into "bot" (and push to the Hub), in this order: the trained PEFT
# model, the tokenizer, and finally the wrapped base model.
# NOTE(review): the third save writes into the same "bot" directory as the
# adapter files -- confirm that saving `base_model` there is intended.
for artifact in (trainer.model, tokenizer, trainer.model.base_model):
    artifact.save_pretrained("bot", push_to_hub=True)

In [None]:
! cp -r /content/bot/content/drive/MyDrive/Chatbot/

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Read the adapter's PEFT config from "bot" to find the checkpoint it was trained on.
peft_model_id = "bot"
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

# Load that base model and its tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Attach the saved adapter on CUDA device 0 and switch to evaluation mode.
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0}).cuda()
model.eval()

# Generate one reply with nucleus sampling (top_p=0.9) for a fixed prompt.
sample = "Mortal: \nExplain Artificial Sentience. \nImmortal: "
encoded = tokenizer(sample, return_tensors="pt", truncation=True, max_length=256)
input_ids = encoded.input_ids.cuda()
outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_length=256)
decoded = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)

# Show the prompt followed by the first decoded sequence.
print(sample)
print(decoded[0])

In [None]:
import streamlit as st
from streamlit_chat import message
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Page heading of the Streamlit app.
st.title("Chatbot")
@st.cache_resource(show_spinner=True)
def load_model_tokenizer():
    """Load the fine-tuned PEFT model and its tokenizer on the CPU.

    The adapter in the local "bot" directory names the base checkpoint; the
    base model and tokenizer are loaded from it, the adapter is attached, and
    the model is put into evaluation mode. The function is wrapped in
    `st.cache_resource`.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    adapter_dir = "bot"
    base_checkpoint = PeftConfig.from_pretrained(adapter_dir).base_model_name_or_path

    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)
    base_tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

    peft_model = PeftModel.from_pretrained(base_model, adapter_dir).to("cpu")
    peft_model.eval()
    return peft_model, base_tokenizer

# Obtain the model and tokenizer from the cached loader.
model, tokenizer = load_model_tokenizer()

def inference(model, tokenizer, input_sent):
    """Generate a reply to `input_sent` and return it as plain text.

    Args:
        model: Model exposing `generate(input_ids=..., ...)`.
        tokenizer: Tokenizer used both to encode the prompt (truncated to 256
            tokens) and to decode the generated ids.
        input_sent: Prompt string.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    input_ids = tokenizer(
        input_sent, return_tensors="pt", truncation=True, max_length=256
    ).input_ids.to("cpu")
    # top_p only takes effect when sampling is enabled; without do_sample=True
    # the nucleus setting is ignored. This matches the generate() call used in
    # the notebook's GPU inference cell.
    outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_length=256)
    decoded = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
    return decoded[0]

# Opening bot message shown on every run of the script.
message("Whats on your mind?", is_user=False)

# `placeholder` is the slot the conversation is drawn into; the text box
# collects the user's message.
placeholder = st.empty()
input_ = st.text_input("Mortal")

# On "Generate": show the user's text, wrap it in the "Mortal: ... Immortal: "
# prompt format, and show the model's reply while a spinner is displayed.
# NOTE(review): both messages are drawn through the same `placeholder`;
# confirm whether the reply is meant to replace the user's message or to
# appear beneath it.
if st.button("Generate"):
    with placeholder.container():
        message(input_, is_user=True)
    input_ = "Mortal: " + input_ + ". Immortal: "
    with st.spinner(text="Generating Response.....  "):
        with placeholder.container():
            message(inference(model, tokenizer, input_), is_user=False)