In [None]:
from google.colab import drive

# Mounting Drive makes Colab show a pop-up asking for permission.
drive_mount_point = '/content/drive'
print("Connecting to Google Drive...")
drive.mount(drive_mount_point)
print("✅ Drive mounted successfully.")

Connecting to Google Drive...
Mounted at /content/drive
✅ Drive mounted successfully.


In [None]:
# Install transformers (good practice, though often pre-installed)
!pip install transformers

import os
import zipfile
from google.colab import files

def _first_uploaded_filename(uploaded, expected_name):
    """Return the name of the first file in an upload result, or fail clearly.

    Args:
        uploaded: The mapping returned by `files.upload()`; its keys are the
            saved filenames.
        expected_name: Human-readable name of the file that was requested,
            used only in the error message.

    Raises:
        RuntimeError: If the mapping is empty (presumably a cancelled upload
            dialog -- TODO confirm), instead of a bare IndexError.
    """
    if not uploaded:
        raise RuntimeError(
            f"No file was uploaded for '{expected_name}'. "
            "Re-run this cell and select the file."
        )
    return next(iter(uploaded))


print("Please upload your 'total_df.csv' file:")
uploaded_csv = files.upload()

print("\nPlease upload your 'DAICWOZ_MAIN.zip' file:")
uploaded_zip = files.upload()

# --- Configuration ---
# Get the filenames you just uploaded (taken from the upload result, not assumed)
total_df_path = _first_uploaded_filename(uploaded_csv, "total_df.csv")
zip_path = _first_uploaded_filename(uploaded_zip, "DAICWOZ_MAIN.zip")

# Define paths for our data
extract_path = "/content/DAICWOZ_DATA" # We will unzip here
dataset_path = os.path.join(extract_path, "DAICWOZ_MAIN") # <-- IMPORTANT: Change 'DAICWOZ_MAIN' if the folder inside your zip has a different name
model_name = "distilbert-base-uncased"
# --- End Configuration ---

# Unzip the data
print(f"\nUnzipping {zip_path} to {extract_path}...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Check the configured folder now, so a mismatch is reported here rather than
# in a later cell that reads from dataset_path.
if os.path.isdir(dataset_path):
    print("Unzip complete. Files are ready.")
else:
    print(f"WARNING: Unzip finished, but the expected folder '{dataset_path}' was not found.")
    print(f"Top-level entries in {extract_path}: {sorted(os.listdir(extract_path))}")
    print("Update 'dataset_path' in the configuration above to match the folder inside your zip.")

Please upload your 'total_df.csv' file:


Saving total_df.csv to total_df.csv

Please upload your 'DAICWOZ_MAIN.zip' file:


In [None]:
import os
import re
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import warnings

# Ignore every warning whose category is UserWarning (or a subclass of it).
warnings.simplefilter("ignore", UserWarning)

# Report the library versions this run is using.
tf_version = tf.__version__
np_version = np.__version__
print(f"Using TensorFlow version: {tf_version}")
print(f"Using NumPy version: {np_version}")

In [None]:
def parse_daicwoz_transcript(filepath):
    """
    Parses the DAIC-WOZ transcript file into a DataFrame.

    The first line of the file is treated as a header and skipped. Every other
    line is split on tabs or runs of two or more spaces:

    * exactly four fields  -> used as-is;
    * more than four       -> the extra fields are re-joined with a single
                              space and become the 'value' column;
    * fewer than four      -> the line is re-split on any whitespace with at
                              most three splits, so the whole remainder of the
                              line is kept as 'value'. Lines that still have
                              fewer than four fields (e.g. blank lines) are
                              dropped.

    Args:
        filepath: Path to a UTF-8 encoded transcript file.

    Returns:
        pandas.DataFrame with string columns
        ['start_time', 'stop_time', 'speaker', 'value'], one row per kept line.
    """
    header = ['start_time', 'stop_time', 'speaker', 'value']
    rows = []

    with open(filepath, 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header row in file (no-op for an empty file)
        for line in f:
            stripped = line.strip()
            fields = re.split(r'\t| {2,}', stripped)
            if len(fields) == 4:
                rows.append(fields)
            elif len(fields) > 4:
                rows.append(fields[:3] + [' '.join(fields[3:])])
            else:
                # maxsplit=3 keeps the full utterance in the last field;
                # slicing an unlimited split to four items would keep only
                # its first word.
                fallback = re.split(r'\s+', stripped, maxsplit=3)
                if len(fallback) == 4:
                    rows.append(fallback)

    return pd.DataFrame(rows, columns=header)

def load_all_transcripts(parent_dir):
    """
    Iterates through all session folders, parses transcripts,
    and extracts only 'Participant' text.

    A session folder is any sub-directory of `parent_dir` whose name ends with
    '_P'; the part of the name before the first underscore is used as the
    participant id. Inside each session folder, the first file (in sorted
    order) whose name contains 'TRANSCRIPT' and ends with '.csv' is parsed,
    and all rows spoken by 'participant' (case-insensitive) are joined with
    single spaces into one text.

    Folders are visited in sorted order because os.listdir returns entries in
    arbitrary order; sorting keeps the resulting row order the same between
    runs.

    Args:
        parent_dir: Directory that contains the session folders.

    Returns:
        pandas.DataFrame with columns ['participant_id', 'text'], one row per
        successfully processed session. The frame is empty when `parent_dir`
        is not a directory or no session could be processed.
    """
    print(f"Loading all transcripts from: {parent_dir}")
    texts, participant_ids = [], []

    # Check if the path exists (and is a directory we can list)
    if not os.path.isdir(parent_dir):
        print(f"ERROR: The directory '{parent_dir}' does not exist.")
        print("Please check the 'dataset_path' variable in Step 1.")
        print("It must match the top-level folder name inside your .zip file.")
        return pd.DataFrame({"participant_id": [], "text": []})

    session_list = sorted(
        entry for entry in os.listdir(parent_dir)
        if entry.endswith('_P') and os.path.isdir(os.path.join(parent_dir, entry))
    )

    for session_folder in session_list:
        session_path = os.path.join(parent_dir, session_folder)
        participant_id = session_folder.split('_')[0]

        transcript_candidates = sorted(
            name for name in os.listdir(session_path)
            if 'TRANSCRIPT' in name and name.endswith('.csv')
        )
        if not transcript_candidates:
            print(f"Warning: No '...TRANSCRIPT.csv' file found in {session_folder}. Skipping.")
            continue
        transcript_filepath = os.path.join(session_path, transcript_candidates[0])

        # Best effort: one unreadable transcript must not abort the whole load.
        try:
            df_t = parse_daicwoz_transcript(transcript_filepath)
            is_participant = df_t['speaker'].str.strip().str.lower() == 'participant'
            utterances = df_t[is_participant]['value'].dropna().astype(str).tolist()

            if not utterances:
                print(f"Warning: No participant utterances found for {participant_id}. Skipping.")
                continue

            texts.append(" ".join(utterances))
            participant_ids.append(participant_id)

        except Exception as e:
            print(f"Error parsing transcript for {participant_id}: {e}")

    print(f"Successfully loaded and processed {len(texts)} transcripts.")
    return pd.DataFrame({
        "participant_id": participant_ids,
        "text": texts
    })

# --- Execute Data Loading ---
# Builds `df` with columns ['participant_id', 'text', 'label'] by joining the
# parsed transcripts with the PHQ8_Binary column of the uploaded label CSV.
text_df = load_all_transcripts(dataset_path)

if text_df.empty:
    # Stop here with a clear message; otherwise `df` is never defined and the
    # next cell fails with an unrelated NameError.
    raise RuntimeError("Data loading failed. Please check your paths and zip file structure.")

print(f"Loading labels from {total_df_path}...")
labels_df = pd.read_csv(total_df_path)

print(f"Merging {len(text_df)} transcripts with {len(labels_df)} labels.")
# Compare ids as strings on both sides so the join keys have the same dtype.
text_df['participant_id'] = text_df['participant_id'].astype(str)
labels_df['Participant_ID'] = labels_df['Participant_ID'].astype(str)

merged_df = pd.merge(text_df, labels_df, left_on='participant_id', right_on='Participant_ID')

# The inner join silently drops transcripts that have no label row; report them.
unmatched_count = len(text_df) - merged_df['participant_id'].nunique()
if unmatched_count:
    print(f"Warning: {unmatched_count} transcript(s) had no matching Participant_ID in the labels file.")

df = (
    merged_df[['participant_id', 'text', 'PHQ8_Binary']]
    .rename(columns={'PHQ8_Binary': 'label'})
    .dropna(subset=['label'])
    .astype({'label': int})
)

if df.empty:
    raise RuntimeError(
        "No labelled transcripts remained after merging. Check that 'Participant_ID' "
        "values in the labels file match the session folder names and that "
        "'PHQ8_Binary' is populated."
    )

print("\nData loading complete. DataFrame head:")
print(df.head())
print(f"\nTotal records loaded: {len(df)}")
print("\nData distribution:")
print(df['label'].value_counts())

In [None]:
print("\nPreprocessing data...")

# Tunable settings for this cell.
SEED = 42              # shared by the split, the shuffle and the framework RNGs
TEST_FRACTION = 0.2
MAX_TOKENS = 256       # longer texts are truncated to this many tokens
SHUFFLE_BUFFER = 1000
BATCH_SIZE = 8
LEARNING_RATE = 3e-5

# Seed the random number generators before anything stochastic runs
# (dataset shuffling, weight initialisation), so results are repeatable.
tf.keras.utils.set_random_seed(SEED)

# Split the data (stratified so both splits keep the label proportions)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=TEST_FRACTION,
    stratify=df["label"],
    random_state=SEED
)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the text
train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_TOKENS)
test_enc = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_TOKENS)

# Create TensorFlow datasets (only the training set is shuffled)
train_ds = (
    tf.data.Dataset.from_tensor_slices((dict(train_enc), train_labels))
    .shuffle(SHUFFLE_BUFFER, seed=SEED)
    .batch(BATCH_SIZE)
)
test_ds = tf.data.Dataset.from_tensor_slices((dict(test_enc), test_labels)).batch(BATCH_SIZE)
print("Data preprocessing complete.")

print("\nBuilding and training model...")
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# The loss is configured with from_logits=True, i.e. it expects unnormalised scores.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[acc_metric])

epochs = 5
# NOTE: the held-out test split doubles as validation data here, so the
# 'val_*' curves in `history` are measured on the test set.
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=epochs
)

print("Model training complete.")

In [None]:
print("\n--- Model Performance ---")
# One figure, two side-by-side panels: accuracy on the left, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# (axis, training key, validation key, training label, validation label, title, y-label)
panel_specs = [
    (accuracy_ax, 'accuracy', 'val_accuracy',
     'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'val_loss',
     'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
]
for ax, train_key, val_key, train_label, val_label, title, y_label in panel_specs:
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend()
    ax.grid(True)
plt.show()

print("\n--- Test Set Evaluation ---")
predictions = model.predict(test_ds)
# Turn the logits into class probabilities, then take the most probable class.
class_probabilities = tf.nn.softmax(predictions.logits, axis=-1)
y_pred = np.argmax(class_probabilities, axis=1)
# Gather the labels batch by batch from test_ds so they line up with the predictions.
label_batches = [batch_labels for _, batch_labels in test_ds]
y_true = np.concatenate(label_batches, axis=0)

target_names = ['NO (Not Depressed)', 'YES (Depressed)']
report_text = classification_report(y_true, y_pred, target_names=target_names)
print(report_text)
print("--- Confusion Matrix ---")
matrix = confusion_matrix(y_true, y_pred)
print(matrix)

In [None]:
# Define a path to save the model
save_directory = "/content/my_depression_model"

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

# Optional: Zip the model for easy download
!zip -r /content/my_depression_model.zip /content/my_depression_model

print("\nModel zipped. You can download 'my_depression_model.zip' from the file panel on the left.")