In [1]:
!pip install transformers datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
# Seed data for the toy fine-tuning run. Every record is a dict with a single
# "text" field that holds one instruction/response pair rendered with the
# "### Instruction: ... / ### Response: ..." prompt layout.
_PROMPT_TEMPLATE = "### Instruction: {instruction}\n### Response: {response}"

_QA_PAIRS = (
    (
        "I feel anxious all the time. What should I do?",
        "It's okay to feel anxious. Try breathing exercises, talk to someone you trust, or consult a mental health professional.",
    ),
    (
        "I can't sleep due to stress.",
        "Try a relaxing routine before bed, avoid screens, and practice mindfulness or journaling.",
    ),
    (
        "I'm afraid to talk to people because of social anxiety.",
        "You're not alone. Start small, be kind to yourself, and seek support if you need it.",
    ),
)

examples = [
    {"text": _PROMPT_TEMPLATE.format(instruction=question, response=answer)}
    for question, answer in _QA_PAIRS
]

# Build a `datasets.Dataset` from the list of example dicts so the records can
# later be transformed with `dataset.map(...)`.
dataset = Dataset.from_list(examples)

def tokenize(example, tok=None, max_length=128):
    """Tokenize one example (or a batch of examples) and attach causal-LM labels.

    Args:
        example: Mapping with a "text" entry. Either a single string, or a list
            of strings when called through ``Dataset.map(..., batched=True)``.
        tok: Optional tokenizer callable. When omitted, the notebook-level
            ``tokenizer`` is used, so that name must be defined before the
            first call.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        The tokenizer output with an added "labels" entry. The labels are an
        independent copy of "input_ids" (not an alias of the same list) in
        which every padded position (attention mask 0) is replaced by -100.
    """
    # Only touch the global when no tokenizer was passed in explicitly.
    active_tokenizer = tokenizer if tok is None else tok
    tokens = active_tokenizer(
        example["text"], truncation=True, padding="max_length", max_length=max_length
    )

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"] if "attention_mask" in tokens else None

    def _labels_for(ids, mask):
        # -100 is the ignore index of the cross-entropy loss, so padding does
        # not contribute to the loss when these labels are consumed directly.
        if mask is None:
            return list(ids)
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    # A batched call yields a list of sequences instead of one flat sequence.
    is_batched = bool(input_ids) and isinstance(input_ids[0], (list, tuple))
    if is_batched:
        masks = attention_mask if attention_mask is not None else [None] * len(input_ids)
        tokens["labels"] = [_labels_for(ids, mask) for ids, mask in zip(input_ids, masks)]
    else:
        tokens["labels"] = _labels_for(input_ids, attention_mask)
    return tokens


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
# --- Model and tokenizer ----------------------------------------------------
model_id = "sshleifer/tiny-gpt2"  # small, CPU-friendly
tokenizer = AutoTokenizer.from_pretrained(model_id)
# `tokenize` pads every sequence to a fixed length, so a pad token must be set;
# the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id)

# --- Data --------------------------------------------------------------------
# Apply `tokenize` to every record of the example dataset.
tokenized_data = dataset.map(tokenize)

# --- Training configuration --------------------------------------------------
training_options = dict(
    output_dir="./mh-chatbot",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_steps=1,       # log the loss at every optimisation step
    save_strategy="no",    # no intermediate checkpoints are written
    report_to="none",      # no external experiment tracker
)
training_args = TrainingArguments(**training_options)

# mlm=False: masked-language-modeling is switched off in the collator.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    data_collator=causal_lm_collator,
)

# Kept as the last expression so the notebook displays the returned TrainOutput.
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


model.safetensors:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Step,Training Loss
1,10.5723
2,10.555
3,10.4944


TrainOutput(global_step=3, training_loss=10.540579160054525, metrics={'train_runtime': 1.797, 'train_samples_per_second': 1.669, 'train_steps_per_second': 1.669, 'total_flos': 350208.0, 'train_loss': 10.540579160054525, 'epoch': 1.0})

In [3]:
from transformers import pipeline

# Wrap the in-memory `model` / `tokenizer` in a text-generation pipeline.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Same "### Instruction / ### Response" layout as the training examples; the
# response part is left open for the model to complete.
prompt = "### Instruction: I've been feeling really down lately.\n### Response:"

# Generate up to 50 new tokens and read the text of the first result.
generations = pipe(prompt, max_new_tokens=50)
first_generation = generations[0]
response = first_generation["generated_text"]

print("🧠 Chatbot Response:\n", response)


Device set to use cpu


🧠 Chatbot Response:
 ### Instruction: I've been feeling really down lately.
### Response: PocketMini Television Tre Medic mutual representations predatorsMost linedGy Tre perhaps rubbing Medic Television perhaps Medic soy factors� lined workshops courtyard praying TreGy skillet representations praying boils Medic BooneMini 236 courtyardacious653Mostived representationsacious soy courtyard skillet membership praying Boone Boone deflect


In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Baseline for comparison: the stock "gpt2" checkpoint, loaded as-is with no
# fine-tuning applied in this notebook.
#
# Everything in this cell deliberately uses `base_*` names. Rebinding `model`,
# `tokenizer`, `pipe`, `prompt` or `response` here would replace the objects
# created by the fine-tuning cells, and any later cell that queries or saves
# `model` / `tokenizer` would then silently operate on stock GPT-2 instead.
base_model_id = "gpt2"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id)

base_pipe = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)

# Same prompt layout as the fine-tuning examples so the outputs are comparable.
base_prompt = "### Instruction: I feel really anxious and can't sleep well.\n### Response:"
base_response = base_pipe(base_prompt, max_new_tokens=60)[0]['generated_text']

print(base_response)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### Instruction: I feel really anxious and can't sleep well.
### Response: I'm not on the computer.
### Response: I feel weird going by random questions all the time. It feels like I don't understand what I'm doing.
## Response: Can you help me find the correct time? I'm running around the house with the time on my computer,


In [5]:

model.save_pretrained("mentalhealth-model")
tokenizer.save_pretrained("mentalhealth-model")


!zip -r mentalhealth-model.zip mentalhealth-model
from google.colab import files
files.download("mentalhealth-model.zip")


  adding: mentalhealth-model/ (stored 0%)
  adding: mentalhealth-model/tokenizer.json (deflated 82%)
  adding: mentalhealth-model/model.safetensors (deflated 7%)
  adding: mentalhealth-model/merges.txt (deflated 53%)
  adding: mentalhealth-model/config.json (deflated 51%)
  adding: mentalhealth-model/vocab.json (deflated 59%)
  adding: mentalhealth-model/generation_config.json (deflated 24%)
  adding: mentalhealth-model/special_tokens_map.json (deflated 52%)
  adding: mentalhealth-model/tokenizer_config.json (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>