In [7]:
import sys
import os
from pathlib import Path
import pandas as pd
import json5
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)

import numpy as np
import evaluate

In [8]:
__file__ = "Untitled-1.ipynb"
project_root = Path(__file__).resolve().parents[1]
sys.path.append(str(project_root))
from utils.nlp_utils import tokenize, stem, bag_of_words
from utils.question_generate import generate_text

paths = {
    "labels": os.path.abspath(f"{project_root}/intents/labels.json"),
    "questions": os.path.abspath(f"{project_root}/intents/questions.csv"),
    "model": os.path.abspath(f"{project_root}/models/t5-base"),
    "tokenizer": os.path.abspath(f"{project_root}/models/tokenizer"),
}

os.makedirs(paths["model"], exist_ok=True)
os.makedirs(paths["tokenizer"], exist_ok=True)


In [9]:
spec = pd.read_csv(f"{os.path.abspath(project_root)}/data/storage/processed/final_cleaning.csv")
txt = json5.load(open(paths["labels"]))

In [10]:
df = generate_text(1000)

In [None]:
def preprocess_data(row):
    return {
        "input_text": f"{row['question']}",
        "target_text": f"""
            DISPLAY: {row['display']}; 
            RAM: {row['ram']}; 
            GPU: {row['gpu']}; 
            CPU: {row['cpu']}; 
            REFRESH_RATE: {row['refresh rate']}; 
            BRAND: {row['brand']}; 
            PRICE: {row['price']}
            """,
    }

processed = df.apply(preprocess_data, axis=1)

df_processed = pd.DataFrame(list(processed))
dataset = Dataset.from_pandas(df_processed)

model_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    return {
        "rouge1": rouge_output["rouge1"].mid.fmeasure,
        "rouge2": rouge_output["rouge2"].mid.fmeasure,
        "rougeL": rouge_output["rougeL"].mid.fmeasure,
    }

tokenized_dataset = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    do_eval=False,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=2,
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=93, training_loss=3.628292288831485, metrics={'train_runtime': 52.0566, 'train_samples_per_second': 57.63, 'train_steps_per_second': 1.787, 'total_flos': 396280794710016.0, 'train_loss': 3.628292288831485, 'epoch': 2.9206349206349205})

In [12]:
trainer.save_model(paths["model"])
tokenizer.save_pretrained(paths["tokenizer"])

('C:\\Users\\Onsra\\OneDrive - camann\\Documents\\GitHub - Repository\\Astorine\\src\\chatbot\\models\\tokenizer\\tokenizer_config.json',
 'C:\\Users\\Onsra\\OneDrive - camann\\Documents\\GitHub - Repository\\Astorine\\src\\chatbot\\models\\tokenizer\\special_tokens_map.json',
 'C:\\Users\\Onsra\\OneDrive - camann\\Documents\\GitHub - Repository\\Astorine\\src\\chatbot\\models\\tokenizer\\spiece.model',
 'C:\\Users\\Onsra\\OneDrive - camann\\Documents\\GitHub - Repository\\Astorine\\src\\chatbot\\models\\tokenizer\\added_tokens.json')

In [13]:
# import torch
# tokenizer = T5Tokenizer.from_pretrained("trained_tokenizer")
# model = T5ForConditionalGeneration.from_pretrained("trained_model")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# input_text = "Find a laptops have core i7 10th, ram 12gb, screen resolution quad hd, nvidia geforce gtx 1650 ti"
# inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
# inputs = {key: value.to(device) for key, value in inputs.items()}
# outputs = model.generate(**inputs, max_length=128)
# decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
# decoded_output

In [14]:
# def gpu_mapping():
#     gpu_text = spec["GPU"].unique().tolist()
#     gpu_mapping = {}
#     for gpu in gpu_text.copy():
#         if gpu.startswith("Nvidia"):
#             text_1 = re.sub(r"^Nvidia\s+GeForce\s+", "", gpu).strip()
#             text_2 = re.sub(r"^Nvidia\s+", "", gpu).strip()
#             text_3 = re.sub(r"GeForce\s+", "", gpu).strip()
#         else:
#             text_1 = re.sub(r"^AMD\s+Radeon\s+", "", gpu).strip()
#             text_2 = re.sub(r"^AMD\s+", "", gpu).strip()
#             text_3 = re.sub(r"Radeon\s+", "", gpu).strip()
#         gpu_mapping[gpu] = [text_1.lower(), text_2.lower(), text_3.lower()]
#     return gpu_mapping

In [15]:
# def tokenize_function(examples):
#     model_inputs = tokenizer(examples["input_text"], max_length=512, truncation=True)
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(examples["target_text"], max_length=128, truncation=True)
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

In [16]:
# from sentence_transformers import SentenceTransformer, util

# model = SentenceTransformer("all-MiniLM-L6-v2")

# known_questions = ["What is Astorine?", "Tell me about Astorine", "Explain Astorine"]
# known_embeddings = model.encode(known_questions, convert_to_tensor=True)

# new_question = "Can you describe Astorine?"
# new_embedding = model.encode(new_question, convert_to_tensor=True)

# similarity = util.pytorch_cos_sim(new_embedding, known_embeddings)
# best_match = similarity.argmax().item()

# print(f"Câu gần nhất: {known_questions[best_match]}")

In [17]:
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# texts = [
#     "recommend me some laptops have RTX 3080, 32GB RAM, cpu Intel core i9 12th",
#     "give me some recommendations for laptops with RTX 4060 , 32GB RAM, cpu Intel core i7 13th",
#     "Find high-performance laptops that include RTX 3080 Ti, 64GB RAM, CPU Intel Core i9 12th.",
#     "Suggest powerful laptops with RTX 4070, 32GB RAM, CPU Intel Core i7 13th.",
# ]

# # Bag-of-Words
# vectorizer = CountVectorizer()
# bow_matrix = vectorizer.fit_transform(texts)
# print("BoW feature names:", vectorizer.get_feature_names_out())
# print("BoW matrix:\n", bow_matrix.toarray())

In [18]:
# # TF-IDF
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
# print("TF-IDF feature names:", tfidf_vectorizer.get_feature_names_out())
# print("TF-IDF matrix:\n", tfidf_matrix.toarray())

In [19]:
# import torch
# import os


In [20]:
# encoders_dir = "../../models/encoders"
# paths = {
#     "gpu": os.path.abspath(os.path.join(encoders_dir, "gpu.pkl")),
    
# }


# gpu_encoder = torch.load(paths["gpu"], map_location={"cuda:1": "cuda:0"}, weights_only=False)

In [21]:
# gpu_encoder.classes_