In [None]:
import pandas as pd
# The CSV was saved together with its row index, which otherwise comes back as
# a meaningless "Unnamed: 0" data column; read that first column as the index.
df = pd.read_csv("faq_data.csv", index_col=0)
# Preview only the first rows instead of rendering the whole frame.
df.head()

Unnamed: 0,question,button
0,ذخیره ارزش چیست,گردش حساب
1,آیا محاسبه سود صندوق به صورت روز شمار است؟,پروفایل کاربری
2,مفهوم FIFO در صندوق لوتوس پارسیان چیست؟,تماس با ما
3,منظور از ابطال چیست؟,ثبت نام حقیقی
4,در یک روز، سقف ابطال در صندوق چه‌قدر است؟,سود
...,...,...
369,روش صدور در صندوق لوتوس چگونه است؟,صدور
370,چجوری پول بریزم تو صندوق ؟,صدور
371,راه افزایش سرمایه,صدور
372,به چه نحوی دارایی ام را افزایش دهم؟,صدور


In [None]:
# Class balance: how many questions map to each target button.
button_counts = df["button"].value_counts()
print(button_counts)

button
صدور              96
ثبت نام حقیقی     58
دارایی            57
ابطال             43
سود               28
پروفایل کاربری    25
تماس با ما        25
تمکن              23
گردش حساب         19
Name: count, dtype: int64


In [None]:
# Install the hazm library (pip also installs its pinned dependencies, e.g. numpy==1.24.3)
!pip install hazm



In [None]:
from hazm import Normalizer, word_tokenize

# One shared hazm normalizer instance, reused for every question.
norm = Normalizer()


def normalize(text):
    """Return `text` after running it through the shared hazm `Normalizer`."""
    return norm.normalize(text)


# Overwrite the raw questions with their normalized form.
df["question"] = df["question"].map(normalize)

In [None]:
# Alphabetically ordered list of the distinct button names; a name's position
# in this list is its integer class id.
labels = sorted(df["button"].unique())
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))
# Integer target column derived from the button name.
df["label"] = df["button"].map(label2id)


In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 15% of all rows as the test set, stratified by label.
train_df, test_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df["label"],
)
# Stage 2: carve 10% of the remaining rows off as the validation set,
# again stratified by label.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.10,
    random_state=42,
    stratify=train_df["label"],
)


In [None]:
!pip install datasets



In [None]:
from datasets import Dataset, disable_caching
# Turn off the `datasets` library's caching for this session.
disable_caching()

# After train_test_split the frames carry a shuffled, non-default index, which
# `from_pandas` would otherwise store as an extra `__index_level_0__` column.
# `preserve_index=False` keeps only the two columns selected here.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df[["question", "label"]], preserve_index=False)
    for split_df in (train_df, val_df, test_df)
)


In [None]:
# Pretrained checkpoint identifier; the tokenizer and the classifier below are both loaded from it.
model_ckpt = "HooshvareLab/bert-base-parsbert-uncased"

In [None]:
from transformers import AutoTokenizer
# Tokenizer that belongs to the chosen checkpoint.
tok = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize the `question` entries of a batch.

    Every entry is truncated and padded to a fixed length of 96 tokens.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=96)
    return tok(batch["question"], **encode_options)


# Apply the tokenizer to each split in batches and drop the raw text column.
train_ds, val_ds, test_ds = (
    split.map(tokenize, batched=True, remove_columns=["question"])
    for split in (train_ds, val_ds, test_ds)
)


Map:   0%|          | 0/285 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

In [None]:
!pip install -U numpy

Collecting numpy
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.3
    Uninstalling numpy-1.24.3:
      Successfully uninstalled numpy-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.5 which is incompatible.
hazm 0.10.0 requires numpy==1.24.3, but you have numpy 2.2.5 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.5


In [None]:
!pip install -U transformers

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

# Load the pretrained encoder with a classification head sized to the number
# of button classes; the id/label maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Training configuration: evaluate, log and checkpoint once per epoch, then
# reload the checkpoint with the best validation macro-F1 when training ends.
args = TrainingArguments(
    output_dir="parsbert-faq",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    seed=42,
    # Disable external experiment trackers. Without this, an installed wandb
    # integration prompts for an API key interactively during training, which
    # blocks an unattended "Restart & Run All".
    report_to="none",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install evaluate

In [None]:
import evaluate, numpy as np
# Load the "f1" metric from the `evaluate` library once, up front.
metric_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    `eval_pred` unpacks into (logits, reference labels); the predicted class
    for each example is the arg-max over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric_f1.compute(
        average="macro",
        references=references,
        predictions=predicted,
    )


Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [None]:
from transformers import Trainer

# Wire the model, training configuration, data splits and metric function
# into a single Trainer; the validation split is used for evaluation.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_ds,
    train_dataset=train_ds,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmsghik[0m ([33mmsghik-Apple[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1
1,2.0586,1.831032,0.225443
2,1.5859,1.480556,0.351852
3,1.2091,1.268143,0.430159
4,0.9638,1.155186,0.483899
5,0.825,1.132024,0.475485


TrainOutput(global_step=180, training_loss=1.3284811231825087, metrics={'train_runtime': 227.4582, 'train_samples_per_second': 6.265, 'train_steps_per_second': 0.791, 'total_flos': 70304403470400.0, 'train_loss': 1.3284811231825087, 'epoch': 5.0})

In [None]:
# Final evaluation on the held-out test split; metric keys get the "test" prefix.
trainer.evaluate(eval_dataset=test_ds, metric_key_prefix="test")

{'test_loss': 1.2053061723709106,
 'test_f1': 0.4903926482873852,
 'test_runtime': 0.4412,
 'test_samples_per_second': 129.194,
 'test_steps_per_second': 18.132,
 'epoch': 5.0}