In [1]:
import torch
import os
import glob
import pandas as pd
import numpy as np
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


MODEL_PATH = "./my_bert_model"
DATA_PATH = "dataset"
TEXT_COLUMN = "selftext"
LABEL_COLUMN = "subreddit"

def load_and_filter_dataset(data_path, text_column, label_column, s_size=None):

    allowed_labels = ['suicidewatch', 'depression', 'lonely', 'mentalhealth', 'anxiety']

    csv_files = glob.glob(os.path.join(data_path, "**", "*.csv"), recursive=True)
    if not csv_files:
        raise FileNotFoundError(f"هیچ فایل CSV در مسیر '{data_path}' پیدا نشد.")
    
    all_dataframes = [pd.read_csv(file) for file in csv_files]
    all_data = pd.concat(all_dataframes, ignore_index=True)
    
    clear_data = all_data.dropna(subset=[text_column, label_column]).copy()
    clear_data[text_column] = clear_data[text_column].astype(str)
    clear_data = clear_data[~clear_data[text_column].str.lower().isin(['[removed]', '[deleted]'])]
    clear_data = clear_data[clear_data[text_column].str.strip() != '']
    
    df_filtered = clear_data[clear_data[label_column].str.lower().isin(allowed_labels)].copy()

    df_filtered.loc[:, label_column] = df_filtered[label_column].str.lower()

    print(f"تعداد ردیف‌ها پس از فیلتر کردن بر اساس لیبل: {len(df_filtered)}")
    if df_filtered.empty:
        return df_filtered

    if s_size is not None:
        print(f"انجام نمونه‌برداری با s_size={s_size} برای هر کلاس...")
        df_sampled = df_filtered.groupby(label_column, group_keys=False).apply(lambda x: x.sample(min(len(x), s_size), random_state=42))
        df_sampled = df_sampled.reset_index(drop=True)
        print(f"تعداد ردیف‌ها پس از نمونه‌برداری: {len(df_sampled)}")
    else:
        df_sampled = df_filtered

    print("\nتوزیع نهایی داده‌ها بر اساس کلاس:")
    print(df_sampled[label_column].value_counts())
    
    return df_sampled

df = load_and_filter_dataset(DATA_PATH, text_column=TEXT_COLUMN, label_column=LABEL_COLUMN, s_size=8000)

  from .autonotebook import tqdm as notebook_tqdm


تعداد ردیف‌ها پس از فیلتر کردن بر اساس لیبل: 1611011
انجام نمونه‌برداری با s_size=8000 برای هر کلاس...


  df_sampled = df_filtered.groupby(label_column, group_keys=False).apply(lambda x: x.sample(min(len(x), s_size), random_state=42))


تعداد ردیف‌ها پس از نمونه‌برداری: 40000

توزیع نهایی داده‌ها بر اساس کلاس:
subreddit
anxiety         8000
depression      8000
lonely          8000
mentalhealth    8000
suicidewatch    8000
Name: count, dtype: int64


In [2]:
df.head()

Unnamed: 0.1,score,selftext,subreddit,title,Label,CAT 1,Unnamed: 0,author,created_utc,timestamp
0,1.0,"I was diagnosed,about 10 years back with sever...",anxiety,Severe anxiety,,,6824.0,dogheritage0,1656847000.0,2022-07-03 21:12:54
1,1.0,I don't want to wake up anyone but my whole lo...,anxiety,"Please Help me, I feel like something bad is g...",,,1820.0,the_beast69,1603334000.0,2020-10-22 13:26:43
2,1.0,This is my first time posting in this subreddi...,anxiety,Anxiety And Attachment Issues Ruining Relation...,,,6147.0,normalvibezonly,1638653000.0,2021-12-05 08:20:03
3,1.0,I started taking medication for my anxiety (Pr...,anxiety,"Anxiety meds making me depressed, what should ...",,,5053.0,Ghostinthemachinima,1604996000.0,2020-11-10 19:12:19
4,1.0,I survived a massive attack which lasted for t...,anxiety,I survived a massive attack,,,4349.0,horror_haller,1623566000.0,2021-06-13 16:37:01


In [2]:
df = load_and_filter_dataset(DATA_PATH, text_column=TEXT_COLUMN, label_column=LABEL_COLUMN, s_size=8000)

if not df.empty and 'title' in df.columns:
    print("در حال ترکیب ستون‌های 'title' و 'selftext'...")
    df['title'] = df['title'].astype(str)
    df[TEXT_COLUMN] = df['title'] + " [SEP] " + df[TEXT_COLUMN]
    print("ترکیب با موفقیت انجام شد.")

تعداد ردیف‌ها پس از فیلتر کردن بر اساس لیبل: 1611011
انجام نمونه‌برداری با s_size=8000 برای هر کلاس...


  df_sampled = df_filtered.groupby(label_column, group_keys=False).apply(lambda x: x.sample(min(len(x), s_size), random_state=42))


تعداد ردیف‌ها پس از نمونه‌برداری: 40000

توزیع نهایی داده‌ها بر اساس کلاس:
subreddit
anxiety         8000
depression      8000
lonely          8000
mentalhealth    8000
suicidewatch    8000
Name: count, dtype: int64
در حال ترکیب ستون‌های 'title' و 'selftext'...
ترکیب با موفقیت انجام شد.


In [3]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {k: v.to(model.device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

0        False
1        False
2        False
3        False
4        False
         ...  
39996    False
39997    False
39998    False
39999    False
40000    False
Name: labels, Length: 40001, dtype: bool

In [4]:

le = LabelEncoder()
df['labels'] = le.fit_transform(df[LABEL_COLUMN])
num_labels = len(le.classes_)
id2label = {i: label for i, label in enumerate(le.classes_)}
label2id = {label: i for i, label in id2label.items()}
print(f"\nکلاس‌های شناسایی شده برای آموزش: {list(le.classes_)}")

if not df.empty:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['labels'])

    raw_datasets = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "eval": Dataset.from_pandas(eval_df)
    })

    def tokenize_function(examples):
        return tokenizer(examples[TEXT_COLUMN], truncation=True, max_length=256)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    required_columns = ['input_ids', 'attention_mask', 'labels']
    all_columns = tokenized_datasets["train"].column_names
    columns_to_remove = [col for col in all_columns if col not in required_columns]
    if columns_to_remove:
        tokenized_datasets = tokenized_datasets.remove_columns(columns_to_remove)

    accuracy_metric = evaluate.load("accuracy")
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy_metric.compute(predictions=predictions, references=labels)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_PATH,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    model.to(device)

    training_args = TrainingArguments(
        output_dir='./results_filtered',
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        fp16=True,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["eval"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    print("\nشروع فرآیند fine-tuning مدل...")
    trainer.train()
    print("\nآموزش با موفقیت به پایان رسید. ")
else:
    print("\nدیتافریم خالی است. فرآیند آموزش متوقف شد.")


کلاس‌های شناسایی شده برای آموزش: ['anxiety', 'depression', 'lonely', 'mentalhealth', 'suicidewatch']


Map: 100%|██████████████████████████████████████████████████████████████| 32000/32000 [00:17<00:00, 1809.97 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 8000/8000 [00:02<00:00, 2701.45 examples/s]
Some weights of the model checkpoint at ./my_bert_model were not used when initializing BertForSequenceClassification: ['fit_denses.6.weight', 'fit_denses.2.bias', 'cls.seq_relationship.weight', 'fit_denses.4.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'fit_denses.3.bias', 'fit_denses.2.weight', 'fit_denses.5.weight', 'cls.predictions.transform.LayerNorm.weight', 'fit_denses.0.bias', 'fit_denses.1.bias', 'cls.predictions.transform.dense.weight', 'fit_denses.6.bias', 'fit_denses.0.weight', 'cls.predictions.bias', 'fit_denses.3.weight', 'cls.predictions.transform.LayerNorm.bias', 'fit_denses.1.weight', 'fit_denses.4.weight', 'fit_denses.5.bias']
- This IS expected if you are initializing


شروع فرآیند fine-tuning مدل...


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3338,1.246762,0.47375
2,1.1176,1.215304,0.483375
3,1.0261,1.127699,0.57
4,0.9374,1.115006,0.583375
5,0.9479,1.101449,0.59225
6,0.8138,1.15117,0.583625
7,0.6958,1.204097,0.58725
8,0.7805,1.217763,0.584375


***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8
Saving model checkpoint to ./results_filtered\checkpoint-4000
Configuration saved in ./results_filtered\checkpoint-4000\config.json
Model weights saved in ./results_filtered\checkpoint-4000\pytorch_model.bin
tokenizer config file saved in ./results_filtered\checkpoint-4000\tokenizer_config.json
Special tokens file saved in ./results_filtered\checkpoint-4000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8
Saving model checkpoint to ./results_filtered\checkpoint-8000
Configuration saved in ./results_filtered\checkpoint-8000\config.json
Model weights saved in ./results_filtered\checkpoint-8000\pytorch_model.bin
tokenizer config file saved in ./results_filtered\checkpoint-8000\tokenizer_config.json
Special tokens file saved in ./results_filtered\checkpoint-8000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8
Saving model checkpoin


آموزش با موفقیت به پایان رسید. ✅
