In [None]:

# Mental Health Chatbot Training Notebook
import os
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import pytorch
from sklearn.model_selection import train_test_split

# Set device
# Force selection of the NVIDIA GPU (assumed as device 0)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [2]:
# Load datasets from HF
print("Loading datasets...")

# Dataset 1: Mental Health FAQ CSV
try:
    mh_faq = load_dataset('csv', data_files='hf://datasets/tolu07/Mental_Health_FAQ/Mental_Health_FAQ.csv')
    print('MH FAQ dataset loaded')
except Exception as e:
    print(f"Error loading MH FAQ dataset: {e}")
    # Try alternative loading method
    try:
        mh_faq = load_dataset('tolu07/Mental_Health_FAQ')
        print('MH FAQ dataset loaded via direct HF path')
    except:
        print("Could not load MH FAQ dataset")
        mh_faq = None

Loading datasets...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 98 examples [00:00, 5242.48 examples/s]

MH FAQ dataset loaded





In [3]:
# Dataset 2: Mental Health Counseling Conversations
try:
    mh_counseling = load_dataset('Amod/mental_health_counseling_conversations')
    print('Mental Health Counseling Conversations loaded')
except Exception as e:
    print(f"Error loading Mental Health Counseling dataset: {e}")
    mh_counseling = None

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 3512/3512 [00:00<00:00, 219355.73 examples/s]

Mental Health Counseling Conversations loaded





In [4]:
# Dataset 3: AI Medical Chatbot Dialogues parquet
try:
    ai_medical = load_dataset('parquet', data_files='hf://datasets/ruslanmv/ai-medical-chatbot/dialogues.parquet')
    print('AI Medical Chatbot dialogues loaded')
except Exception as e:
    print(f"Error loading AI Medical dataset: {e}")
    # Try alternative loading method
    try:
        ai_medical = load_dataset('ruslanmv/ai-medical-chatbot')
        print('AI Medical dataset loaded via direct HF path')
    except:
        print("Could not load AI Medical dataset")
        ai_medical = None

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 256916 examples [00:00, 608711.21 examples/s]

AI Medical Chatbot dialogues loaded





In [5]:
# Dataset 4: ChatDoctor-HealthCareMagic (parquet)
try:
    chatdoctor = load_dataset('parquet', 
                             data_files='hf://datasets/lavita/ChatDoctor-HealthCareMagic-100k/data/train-00000-of-00001-5e7cb295b9cff0bf.parquet')
    print('ChatDoctor dataset loaded')
except Exception as e:
    print(f"Error loading ChatDoctor dataset: {e}")
    # Try alternative loading method
    try:
        chatdoctor = load_dataset('lavita/ChatDoctor-HealthCareMagic-100k')
        print('ChatDoctor dataset loaded via direct HF path')
    except:
        print("Could not load ChatDoctor dataset")
        chatdoctor = None

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 112165 examples [00:00, 499993.21 examples/s]

ChatDoctor dataset loaded





In [6]:
# Dataset 5: Mental Health Chatbot Dataset (parquet)
try:
    mh_chatbot = load_dataset('parquet', 
                             data_files='hf://datasets/heliosbrahma/mental_health_chatbot_dataset/data/train-00000-of-00001-01391a60ef5c00d9.parquet')
    print('Mental Health Chatbot dataset loaded')
except Exception as e:
    print(f"Error loading Mental Health Chatbot dataset: {e}")
    # Try alternative loading method
    try:
        mh_chatbot = load_dataset('heliosbrahma/mental_health_chatbot_dataset')
        print('Mental Health Chatbot dataset loaded via direct HF path')
    except:
        print("Could not load Mental Health Chatbot dataset")
        mh_chatbot = None

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 172 examples [00:00, 57301.06 examples/s]

Mental Health Chatbot dataset loaded





In [7]:
# Import csv file from local directory


In [8]:

# We'll define a function to convert each dataset's sample into a common conversation format
# Here we try to form a prompt in the style: 'User: <question>\nAssistant: <answer>'

def format_sample(example, source):
    # For FAQ data, assume question and answer columns
    if source == 'mh_faq':
        # Check for columns like 'question' and 'answer'
        if 'question' in example and 'answer' in example:
            return 'User: ' + example['question'] + '\nAssistant: ' + example['answer']
        elif 'Question' in example and 'Answer' in example:
            return 'User: ' + example['Question'] + '\nAssistant: ' + example['Answer']
        else:
            # Fallback: use entire row as text
            return str(example)
    elif source == 'mh_counseling':
        # For counseling conversations, assume a conversation text column
        if 'conversation' in example:
            return example['conversation']
        else:
            return str(example)
    elif source == 'ai_medical':
        # For medical dialogues, assume columns might be 'dialogue' or similar
        if 'dialogue' in example:
            return example['dialogue']
        elif 'text' in example:
            return example['text']
        else:
            return str(example)
    elif source == 'chatdoctor':
        # For ChatDoctor, assume conversation as text or QA pair
        if 'text' in example:
            return example['text']
        elif 'instruction' in example and 'output' in example:
            return 'User: ' + example['instruction'] + '\nAssistant: ' + example['output']
        else:
            return str(example)
    elif source == 'mh_chatbot':
        # Assume similar conversation text
        if 'text' in example:
            return example['text']
        elif 'conversation' in example:
            return example['conversation']
        elif 'question' in example and 'answer' in example:
            return 'User: ' + example['question'] + '\nAssistant: ' + example['answer']
        else:
            return str(example)
    else:
        return str(example)

In [9]:
# Function to apply formatting to a dataset
def preprocess_dataset(dataset, source):
    if dataset is None:
        return None
    
    # Each dataset is loaded with a split key, e.g., 'train'
    # We'll process the 'train' split if available, otherwise the default
    split = 'train' if 'train' in dataset else list(dataset.keys())[0]
    data = dataset[split]
    
    # Print a sample to understand the structure
    print(f"Sample from {source}:")
    for key in list(data[0].keys())[:5]:  # Show first 5 keys
        print(f"  {key}")
    
    formatted = data.map(lambda x: {'text': format_sample(x, source)})
    return formatted

In [11]:
print('Preprocessing datasets...')
datasets_to_process = [
    (mh_faq, 'mh_faq'),
    (mh_counseling, 'mh_counseling'),
    (ai_medical, 'ai_medical'),
    (chatdoctor, 'chatdoctor'),
    (mh_chatbot, 'mh_chatbot')
]

formatted_datasets = []
for dataset, source in datasets_to_process:
    if dataset is not None:
        formatted = preprocess_dataset(dataset, source)
        if formatted is not None:
            formatted_datasets.append(formatted)
            print(f"Added {source} to formatted datasets")

print('Datasets preprocessed.')

Preprocessing datasets...
Sample from mh_faq:
  Question_ID
  Questions
  Answers


Map: 100%|██████████| 98/98 [00:00<00:00, 4082.49 examples/s]


Added mh_faq to formatted datasets
Sample from mh_counseling:
  Context
  Response


Map: 100%|██████████| 3512/3512 [00:00<00:00, 12841.37 examples/s]


Added mh_counseling to formatted datasets
Sample from ai_medical:
  Description
  Patient
  Doctor


Map: 100%|██████████| 256916/256916 [00:17<00:00, 14875.68 examples/s]


Added ai_medical to formatted datasets
Sample from chatdoctor:
  instruction
  input
  output


Map: 100%|██████████| 112165/112165 [00:06<00:00, 17258.57 examples/s]


Added chatdoctor to formatted datasets
Sample from mh_chatbot:
  text


Map: 100%|██████████| 172/172 [00:00<00:00, 17195.92 examples/s]

Added mh_chatbot to formatted datasets
Datasets preprocessed.





In [12]:
# Concatenate all datasets into one
if formatted_datasets:
    combined_dataset = concatenate_datasets(formatted_datasets)
    print('Combined dataset length:', len(combined_dataset))
    print('Sample combined text:')
    print(combined_dataset[0]['text'][:500])  # print first 500 chars
else:
    print("No datasets were successfully formatted. Please check the dataset loading.")

Combined dataset length: 372863
Sample combined text:
{'Question_ID': 1590140, 'Questions': 'What does it mean to have a mental illness?', 'Answers': 'Mental illnesses are health conditions that disrupt a personâ€™s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the 


In [13]:
# Load a pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Resize token embeddings if new tokens are added
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [14]:
# Tokenize function
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=512)

print('Tokenizing dataset...')
combined_dataset = combined_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

print('Tokenization complete.')
print('Dataset size:', len(combined_dataset))
print('Sample tokenized data:')
print(combined_dataset[0])


Tokenizing dataset...


Map: 100%|██████████| 372863/372863 [01:46<00:00, 3515.18 examples/s]

Tokenization complete.
Dataset size: 372863
Sample tokenized data:
{'Question_ID': 1590140, 'Questions': 'What does it mean to have a mental illness?', 'Answers': 'Mental illnesses are health conditions that disrupt a personâ€™s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a personâ€™s character, intelligence, or willpower. Just as diabetes 




In [15]:

# Create training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    report_to='none'
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [16]:
print('Starting training...')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    data_collator=data_collator,
)

trainer.train()

print('Training complete.')

# Save the model
model_save_path = './fine_tuned_mental_health_chatbot'
os.makedirs(model_save_path, exist_ok=True)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print('Model saved to ' + model_save_path)

Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


KeyboardInterrupt: 

In [None]:

# Test the model with a sample question
def generate_response(prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    
    # Generate response
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Test with a sample mental health question
test_prompt = "User: I've been feeling really anxious lately. What can I do to manage my anxiety?\nAssistant:"
response = generate_response(test_prompt)
print("Sample response:")
print(response)


In [None]:

print('Notebook execution complete. Fine-tuned model is ready.')
