In [None]:
!pip install transformers datasets scikit-learn evaluate

In [3]:
import pandas as pd

In [None]:
df = pd.read_csv('/kaggle/input/dataset/train_dataset.csv')
df

Unnamed: 0,text,status,from,translated
0,oh my gosh,Anxiety,df1,โอ้พระเจ้า!
1,trouble sleeping confused mind restless heart ...,Anxiety,df1,นอนไม่หลับ วุ่นวายใจ กระวนกระวายใจ ทุกอย่างดูผ...
2,all wrong back off dear forward doubt stay in ...,Anxiety,df1,ทุกอย่างไม่ถูกต้อง ถอยไปเถอะ อย่าก้าวไปข้างหน้...
3,i have shifted my focus to something else but ...,Anxiety,df1,ฉันพยายามเบนความสนใจไปเรื่องอื่นแล้ว แต่ก็ยังก...
4,i am restless and restless it is been a month ...,Anxiety,df1,ฉันกระวนกระวายใจมาก มันเป็นแบบนี้มาเป็นเดือนแล...
...,...,...,...,...
104995,low testosterone after discontinuing rispredon...,schizophrenia,df3,-
104996,how did you finally accept your diagnosis i am...,schizophrenia,df3,-
104997,constantly feel like i am in a competition wit...,schizophrenia,df3,-
104998,has anyone switched over to an entirely differ...,schizophrenia,df3,-


In [5]:
from sklearn.preprocessing import LabelEncoder

# Load and encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['status'])

# Get label names and mapping
label_names = label_encoder.classes_
num_labels = len(label_names)

# Optional: Check label mappings
label_mapping = dict(zip(label_names, range(num_labels)))
print(label_mapping)

{'Anxiety': 0, 'BPD': 1, 'Normal': 2, 'bipolar': 3, 'depression': 4, 'mentalillness': 5, 'schizophrenia': 6}


In [6]:
from datasets import Dataset

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df[['text', 'label']])
dataset = dataset.train_test_split(test_size=0.2)  # Split into train/test

In [None]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"  # Use bart-base for faster fine-tuning than bart-large-mnli
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [9]:
from transformers import TrainingArguments, Trainer
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.006,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [10]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
500,1.1652,1.039024,0.643619
1000,1.0043,0.981171,0.667667
1500,0.9145,0.934789,0.674667
2000,0.8842,0.915524,0.682952
2500,0.9799,0.880412,0.69619
3000,0.6777,0.884442,0.694952
3500,0.7865,0.862642,0.701476
4000,0.8184,0.846433,0.707905
4500,0.7785,0.847287,0.703667
5000,0.805,0.827995,0.71219




TrainOutput(global_step=15750, training_loss=0.6872419414823018, metrics={'train_runtime': 4639.6023, 'train_samples_per_second': 54.315, 'train_steps_per_second': 3.395, 'total_flos': 8346190261248000.0, 'train_loss': 0.6872419414823018, 'epoch': 3.0})

In [11]:
from sklearn.metrics import classification_report

predictions = trainer.predict(tokenized_datasets["test"])  # or ["validation"]
preds = predictions.predictions.argmax(axis=-1)
labels = predictions.label_ids



In [13]:
print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.76      0.79      0.77      3076
           1       0.70      0.69      0.70      3016
           2       0.91      0.91      0.91      2952
           3       0.72      0.69      0.70      2951
           4       0.73      0.73      0.73      3063
           5       0.53      0.52      0.52      3013
           6       0.75      0.78      0.76      2929

    accuracy                           0.73     21000
   macro avg       0.73      0.73      0.73     21000
weighted avg       0.73      0.73      0.73     21000



In [14]:
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')

In [15]:
!zip -r my_model.zip my_model

  adding: my_model/ (stored 0%)
  adding: my_model/tokenizer.json (deflated 71%)
  adding: my_model/config.json (deflated 53%)
  adding: my_model/special_tokens_map.json (deflated 42%)
  adding: my_model/vocab.txt (deflated 53%)
  adding: my_model/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: my_model/tokenizer_config.json (deflated 75%)


In [18]:
from IPython.display import FileLink
FileLink('my_model.zip')

In [22]:
import shutil

shutil.make_archive('my_model', 'zip', '/kaggle/working/my_model')

'/kaggle/working/my_model.zip'

In [23]:
from IPython.display import FileLink
FileLink(r'my_model.zip')