In [None]:
!pip install scikit-multilearn

In [None]:
import pandas as pd, numpy as np, joblib, re
# from hazm import Normalizer, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score, hamming_loss
from skmultilearn.model_selection import iterative_train_test_split   # multi-label stratified split


In [None]:
# Load the FAQ dataset; later cells read its "question" and "button" columns.
df = pd.read_csv(filepath_or_buffer="faq_data.csv")

In [None]:
# Every row carries a single "button" value; wrap it in a one-element list so the
# binarizer receives one label set per row and returns a 0/1 indicator matrix.
button_label_sets = [[button] for button in df["button"]]

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(button_label_sets)   # indicator matrix, shape (N, #classes)

In [None]:
# Reproducibility: iterative_train_test_split takes no random_state argument, so
# seed NumPy's global RNG (which the splitter is expected to draw from when it
# breaks ties) before splitting. Without this every run yields different
# train/val/test partitions and therefore different metrics.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

X = df["question"].values.reshape(-1, 1)   # needs 2-D for the splitter

# First carve out 15 % of the rows as the held-out test set ...
X_train_val, y_train_val, X_test, y_test = iterative_train_test_split(
    X, y, test_size=0.15)

# ... then take 10 % of the remainder as the validation set used for threshold tuning.
X_train, y_train, X_val, y_val = iterative_train_test_split(
    X_train_val, y_train_val, test_size=0.10)

# flatten X back to 1-D arrays of strings, as expected by the text vectorizer
X_train, X_val, X_test = X_train.ravel(), X_val.ravel(), X_test.ravel()

# Sanity check: the three splits must partition the dataset exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(df), "split lost or duplicated rows"


In [None]:
# Text features: TF-IDF over unigrams and bigrams, keeping only terms that occur
# in at least two documents, with sublinear term-frequency scaling and L2-normalised rows.
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    norm="l2",
    min_df=2,
    ngram_range=(1, 2),
)

# Base binary learner, fitted once per label by the one-vs-rest wrapper below.
# class_weight="balanced" reweights the classes inside each binary problem.
balanced_logreg = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=2500,
    n_jobs=-1,
)

# Full model: vectorizer followed by one independent classifier per label.
pipe = Pipeline(steps=[
    ("tfidf", tfidf_vectorizer),
    ("clf", OneVsRestClassifier(balanced_logreg, n_jobs=-1)),
])


In [None]:
# Fit the vectorizer and the per-label classifiers on the training split only.
pipe.fit(X_train, y=y_train)

In [None]:
# Per-label probabilities on the validation split, shape (rows, classes).
probs_val = pipe.predict_proba(X_val)


def evaluate(threshold):
    """Score the validation predictions obtained by cutting ``probs_val`` at ``threshold``.

    Reads the module-level ``probs_val`` and ``y_val``; a label is predicted
    whenever its probability is greater than or equal to ``threshold``.

    Returns
    -------
    tuple
        ``(macro-averaged F1, Hamming loss)`` against ``y_val``.
    """
    binarized = np.greater_equal(probs_val, threshold).astype(int)
    macro_f1 = f1_score(y_val, binarized, average="macro")
    hamming = hamming_loss(y_val, binarized)
    return macro_f1, hamming

# Sweep candidate decision thresholds (0.20 up to, but excluding, 0.60 in steps
# of 0.05) and report both validation metrics for each one.
threshold_grid = np.arange(0.2, 0.6, 0.05)
for τ in threshold_grid:
    f1, ham = evaluate(τ)
    report_line = f"τ={τ:.2f}  macro-F1={f1:.3f}  hamming={ham:.3f}"
    print(report_line)


τ=0.20  macro-F1=0.273  hamming=0.497
τ=0.25  macro-F1=0.320  hamming=0.368
τ=0.30  macro-F1=0.402  hamming=0.250
τ=0.35  macro-F1=0.437  hamming=0.188
τ=0.40  macro-F1=0.479  hamming=0.146
τ=0.45  macro-F1=0.523  hamming=0.101
τ=0.50  macro-F1=0.474  hamming=0.101
τ=0.55  macro-F1=0.441  hamming=0.108


In [None]:
# Select τ from the validation sweep instead of hard-coding it. The previous
# hard-coded 0.30 was not the best candidate: in the recorded sweep it reached
# macro-F1 0.402 while 0.45 reached 0.523. Ties resolve to the lowest threshold.
candidate_thresholds = np.arange(0.2, 0.6, 0.05)
τ = float(max(candidate_thresholds, key=lambda t: evaluate(t)[0]))
print(f"selected τ={τ:.2f} (highest validation macro-F1)")

# Apply the tuned threshold once to the held-out test split.
y_pred_test = (pipe.predict_proba(X_test) >= τ).astype(int)

print(classification_report(y_test, y_pred_test, target_names=mlb.classes_))
print("Hamming loss:", hamming_loss(y_test, y_pred_test))


                precision    recall  f1-score   support

         ابطال       0.19      0.83      0.31         6
    تماس با ما       0.12      0.25      0.17         4
          تمکن       0.11      0.33      0.17         3
 ثبت نام حقیقی       0.25      0.89      0.39         9
        دارایی       0.38      0.67      0.48         9
           سود       0.25      0.75      0.38         4
          صدور       0.36      0.86      0.51        14
پروفایل کاربری       0.18      0.75      0.29         4
     گردش حساب       0.20      0.67      0.31         3

     micro avg       0.25      0.73      0.37        56
     macro avg       0.23      0.67      0.33        56
  weighted avg       0.27      0.73      0.39        56
   samples avg       0.30      0.73      0.41        56

Hamming loss: 0.2718253968253968


In [None]:
def predict_labels(text, k=None, threshold=0.30):
    """Return the labels predicted for a single question, best first.

    Uses the module-level ``pipe`` (for ``predict_proba``) and ``mlb`` (for the
    label names in ``classes_``).

    Parameters
    ----------
    text : str
        The question to classify; it is scored as a one-element batch.
    k : int or None, default None
        When given, return the ``k`` highest-probability labels and ignore
        ``threshold``. ``k=0`` yields an empty list.
    threshold : float, default 0.30
        When ``k`` is None, return every label whose probability is greater
        than or equal to this value.

    Returns
    -------
    list of (label, probability) tuples
        Sorted by descending probability.

    Raises
    ------
    ValueError
        If ``k`` is negative (a negative slice bound would otherwise silently
        drop labels from the low end instead of limiting the result size).
    """
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    probs = pipe.predict_proba([text])[0]

    if k is None:
        # choose by threshold
        selected = [(lbl, p) for lbl, p in zip(mlb.classes_, probs) if p >= threshold]
    else:
        # choose by top-k; the threshold is not consulted in this mode
        topk_idx = np.argsort(probs)[::-1][:k]
        selected = [(mlb.classes_[i], probs[i]) for i in topk_idx]

    return sorted(selected, key=lambda x: x[1], reverse=True)

predict_labels("در یک روز سقف ابطال چه‌قدر است؟", k=3)
# ➜ the 3 highest-probability (label, probability) pairs, best first.
# NOTE(review): an earlier hand-written expectation here put 'ابطال' first (≈0.81),
# but the recorded run ranks 'سود' first (≈0.78) and does not return 'ابطال' at all;
# treat this query as a known misclassification to investigate.

[('سود', np.float64(0.7823693874834365)),
 ('دارایی', np.float64(0.30419081549075117)),
 ('تماس با ما', np.float64(0.28444649888615314))]

In [None]:
# Example: word + character TF-IDF features feeding a one-vs-rest LinearSVC,
# trained on oversampled data; the raw margins are meant for per-label τ tuning.
from sklearn.pipeline import FeatureUnion
from sklearn.svm import LinearSVC   # was never imported -> NameError when building the classifier

# NOTE(review): `hazm_tokenizer`, `X_train_res` and `y_train_res` are not defined in
# any cell visible in this notebook (the hazm import is commented out). They must be
# created by an earlier cell for this one to survive Restart Kernel -> Run All.
#
# NOTE(review): this cell rebinds `pipe` and `probs_val`. `predict_labels` and
# `evaluate` read those globals and expect probabilities; after this cell they would
# see an SVM pipeline / raw margins instead, so do not re-run them afterwards.

# Word-level n-grams (1-3), tokenised with the external tokenizer.
word = TfidfVectorizer(
    tokenizer=hazm_tokenizer,
    ngram_range=(1, 3),
    min_df=2,
    sublinear_tf=True,
)
# Character n-grams (3-6) built inside word boundaries.
char = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    min_df=3,
)

pipe = Pipeline([
    ('feats', FeatureUnion([('word', word), ('char', char)])),
    ('clf', OneVsRestClassifier(
        LinearSVC(loss='squared_hinge', C=2.0), n_jobs=-1)),
])
pipe.fit(X_train_res, y_train_res)        # after oversampling
probs_val = pipe.decision_function(X_val) # raw margins, NOT probabilities


NameError: name 'LinearSVC' is not defined