In [1]:
!pip install transformers datasets scikit-learn evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 req

In [2]:
import pandas as pd

In [4]:
df = pd.read_csv('/kaggle/input/train-dataset2-csv/train_dataset2.csv')
df

Unnamed: 0,text,status
0,oh my gosh,Anxiety
1,trouble sleeping confused mind restless heart ...,Anxiety
2,all wrong back off dear forward doubt stay in ...,Anxiety
3,i have shifted my focus to something else but ...,Anxiety
4,i am restless and restless it is been a month ...,Anxiety
...,...,...
89995,tw strong arm abuse my dad was screaming at me...,Stress
89996,hi i cannot think clearly today i know i have ...,Stress
89997,my chest give a dissimilar aroma before it wou...,Stress
89998,he was going to choke the name out of me i am ...,Stress


In [5]:
from sklearn.preprocessing import LabelEncoder

# Load and encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['status'])

# Get label names and mapping
label_names = label_encoder.classes_
num_labels = len(label_names)

# Optional: Check label mappings
label_mapping = dict(zip(label_names, range(num_labels)))
print(label_mapping)

{'Anxiety': 0, 'BPD': 1, 'Normal': 2, 'Stress': 3, 'Suicidal': 4, 'bipolar': 5, 'depression': 6, 'mentalillness': 7, 'schizophrenia': 8}


In [6]:
from datasets import Dataset

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df[['text', 'label']])
dataset = dataset.train_test_split(test_size=0.2)  # Split into train/test

In [None]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"  # Use bart-base for faster fine-tuning than bart-large-mnli
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments, Trainer
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.006,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [12]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
500,1.2872,1.174998,0.609278
1000,1.2658,1.09109,0.634889
1500,0.9299,1.039384,0.649389
2000,1.2148,1.001145,0.662
2500,1.0461,0.960062,0.675056
3000,0.8929,0.946662,0.679611
3500,1.0378,0.942225,0.679389
4000,1.0177,0.925471,0.678722
4500,0.9192,0.909515,0.688
5000,0.7894,0.928519,0.689833




TrainOutput(global_step=13500, training_loss=0.7952181755348489, metrics={'train_runtime': 3755.7947, 'train_samples_per_second': 57.511, 'train_steps_per_second': 3.594, 'total_flos': 7154132502528000.0, 'train_loss': 0.7952181755348489, 'epoch': 3.0})

In [14]:
from sklearn.metrics import classification_report

predictions = trainer.predict(tokenized_datasets["test"])  # or ["validation"]
preds = predictions.predictions.argmax(axis=-1)
labels = predictions.label_ids



In [15]:
print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2102
           1       0.67      0.69      0.68      1999
           2       0.88      0.89      0.88      1995
           3       0.90      0.89      0.89      1998
           4       0.66      0.65      0.66      2003
           5       0.69      0.70      0.69      1980
           6       0.57      0.53      0.55      2032
           7       0.47      0.48      0.47      1937
           8       0.71      0.71      0.71      1954

    accuracy                           0.70     18000
   macro avg       0.70      0.70      0.70     18000
weighted avg       0.70      0.70      0.70     18000



In [16]:
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')

In [17]:
!zip -r my_model.zip my_model

  adding: my_model/ (stored 0%)
  adding: my_model/config.json (deflated 55%)
  adding: my_model/tokenizer_config.json (deflated 75%)
  adding: my_model/vocab.txt (deflated 53%)
  adding: my_model/special_tokens_map.json (deflated 42%)
  adding: my_model/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: my_model/tokenizer.json (deflated 71%)


In [18]:
import shutil

shutil.make_archive('my_model', 'zip', '/kaggle/working/my_model')

'/kaggle/working/my_model.zip'

In [19]:
from IPython.display import FileLink
FileLink(r'my_model.zip')