In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from transformers import BertModel

2025-07-25 09:02:04.446361: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753434124.646236      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753434124.706121      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
class MultimodalSuicideRiskModel(nn.Module):
    """Regression network that fuses BERT text features with structured inputs.

    The text branch is a pretrained BERT encoder whose pooled output is used
    as the text embedding. The structured branch is a small MLP that maps
    ``n_structured_features`` values to a 16-dimensional vector. Both vectors
    are concatenated and passed through a two-layer head that emits a single
    value per sample.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        n_structured_features: Width of the structured feature vector.
    """

    def __init__(self, bert_model_name='bert-base-uncased', n_structured_features=4):
        super().__init__()

        # Text branch: pretrained encoder; its hidden size sets the text width.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.bert_output_size = self.bert.config.hidden_size

        # Structured branch: n_structured_features -> 32 -> 16.
        self.mlp_output_size = 16
        structured_layers = [
            nn.Linear(n_structured_features, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, self.mlp_output_size),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        self.structured_mlp = nn.Sequential(*structured_layers)

        # Fusion head: concatenated features -> 64 -> 1.
        head_layers = [
            nn.Linear(self.bert_output_size + self.mlp_output_size, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
        ]
        self.prediction_head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, structured_data):
        """Return one prediction per sample, shaped ``(batch, 1)``.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.
            structured_data: Structured features fed to the MLP branch.
        """
        pooled_text = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        structured_repr = self.structured_mlp(structured_data)
        # Concatenate along the feature axis before the regression head.
        fused = torch.cat([pooled_text, structured_repr], dim=1)
        return self.prediction_head(fused)

In [3]:
class SuicideDataset(Dataset):
    """Dataset pairing tokenized user text with structured features and a target.

    Each item holds the tokenizer output for one row's ``user_text``, that
    row's structured features (scaled age followed by the three gender
    indicator columns) and its ``intention_score``.

    Args:
        dataframe: Frame providing the columns ``user_text``,
            ``intention_score``, ``age``, ``gender_male``, ``gender_female``
            and ``gender_non_binary``. It is not modified, and its index may
            be arbitrary (e.g. straight out of ``train_test_split``).
        tokenizer: Callable Hugging Face-style tokenizer.
        max_len: Length every text is padded or truncated to.
        scaler: Already-fitted object whose ``transform`` scales the ``age``
            column.
    """

    # Column order defines the layout of every ``structured_data`` vector.
    STRUCTURED_COLUMNS = ['age_scaled', 'gender_male', 'gender_female', 'gender_non_binary']

    def __init__(self, dataframe, tokenizer, max_len, scaler):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Work on a private copy with a fresh 0..n-1 index so that the
        # caller's frame is never mutated and positional lookups are valid
        # whatever index the caller passed in.
        self.data = dataframe.reset_index(drop=True).copy()
        # Scale the age column
        self.data['age_scaled'] = scaler.transform(self.data[['age']])
        self.text = self.data['user_text']
        self.targets = self.data['intention_score']
        # A float array avoids an object-dtype result when the indicator
        # columns are stored as booleans.
        self.structured = self.data[self.STRUCTURED_COLUMNS].to_numpy(dtype=float)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the model inputs and target for the row at position ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors from
            the tokenizer), ``structured_data`` (float tensor of the
            structured features) and ``targets`` (float scalar tensor).
        """
        # Positional access: ``index`` is a row position, not an index label.
        text = str(self.text.iloc[index])

        # Tokenize the text; the tokenizer's __call__ supersedes the
        # deprecated ``encode_plus`` and accepts the same options.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Return a dictionary of all the required data
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'structured_data': torch.tensor(self.structured[index], dtype=torch.float),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }


In [4]:
# --- 2. Load and Prepare the Data ---
df = pd.read_csv('/kaggle/input/sucide-intention-dataset/suicide_intention_dataset.csv')

# 70/15/15 split: hold out 30% of the rows, then cut that hold-out in half
# to obtain the validation and test sets.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# The splits keep the shuffled row labels of `df`. Renumber every split from 0
# so that row lookups by integer inside SuicideDataset do not raise KeyError.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

# Standardise age with statistics taken from the training split only, so that
# nothing about the validation/test rows leaks into preprocessing.
age_scaler = StandardScaler().fit(train_df[['age']])

# --- 3. Set Up Tokenizer and DataLoaders ---
MAX_LEN = 160
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')

# One dataset per split, all sharing the tokenizer and the fitted age scaler.
train_dataset, val_dataset, test_dataset = (
    SuicideDataset(split, TOKENIZER, MAX_LEN, age_scaler)
    for split in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=VAL_BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=VAL_BATCH_SIZE)

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Train set size: 5600
Validation set size: 1200
Test set size: 1200


In [5]:
# --- 4. Define the Training and Validation Functions ---
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``structured_data`` keyword arguments.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask``, ``structured_data`` and
            ``targets``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a
            scalar tensor.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean of the per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model = model.train()
    total_loss = 0
    num_batches = 0

    for d in data_loader:
        # Move data to the selected device (GPU/CPU)
        input_ids = d["input_ids"].to(device, dtype=torch.long)
        attention_mask = d["attention_mask"].to(device, dtype=torch.long)
        structured_data = d["structured_data"].to(device, dtype=torch.float)
        targets = d["targets"].to(device, dtype=torch.float)

        # Get model outputs
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            structured_data=structured_data
        )

        # Calculate loss. Only the trailing axis is squeezed: a bare
        # squeeze() would also drop the batch axis of a one-sample batch and
        # leave a 0-d prediction that no longer matches the targets' shape.
        loss = loss_fn(outputs.squeeze(-1), targets)
        total_loss += loss.item()
        num_batches += 1

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Counting batches (instead of len(data_loader)) also supports loaders
    # that have no length, and gives a clear error for an empty one.
    if num_batches == 0:
        raise ValueError("data_loader yielded no batches")

    return total_loss / num_batches

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``structured_data`` keyword arguments.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask``, ``structured_data`` and
            ``targets``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a
            scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean of the per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model = model.eval()
    total_loss = 0
    num_batches = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device, dtype=torch.long)
            attention_mask = d["attention_mask"].to(device, dtype=torch.long)
            structured_data = d["structured_data"].to(device, dtype=torch.float)
            targets = d["targets"].to(device, dtype=torch.float)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                structured_data=structured_data
            )

            # Only the trailing axis is squeezed: a bare squeeze() would also
            # drop the batch axis of a one-sample batch and leave a 0-d
            # prediction that no longer matches the targets' shape.
            loss = loss_fn(outputs.squeeze(-1), targets)
            total_loss += loss.item()
            num_batches += 1

    # Counting batches (instead of len(data_loader)) also supports loaders
    # that have no length, and gives a clear error for an empty one.
    if num_batches == 0:
        raise ValueError("data_loader yielded no batches")

    return total_loss / num_batches


In [6]:
# --- 5. Run the Training ---
EPOCHS = 10
SEED = 42  # same value as the random_state used for the data split
BEST_MODEL_PATH = 'best_model_state.bin'

# Seed PyTorch before the model is built so that the initial weights of the
# new layers, dropout masks and the training shuffle order repeat between runs.
torch.manual_seed(SEED)

model = MultimodalSuicideRiskModel().to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss() # Mean Squared Error is good for regression

best_val_loss = float('inf')

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, DEVICE)
    print(f'Train loss: {train_loss:.4f}')

    val_loss = eval_model(model, val_loader, loss_fn, DEVICE)
    print(f'Validation loss: {val_loss:.4f}')

    # Save the model if it has the best validation loss so far.
    # NOTE: `model` itself keeps the last epoch's weights after the loop;
    # load BEST_MODEL_PATH to get the best checkpoint back.
    if val_loss < best_val_loss:
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        best_val_loss = val_loss

print("\nTraining finished!")
print(f"Best validation loss: {best_val_loss:.4f}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/10
----------
Train loss: 9.1991
Validation loss: 2.0610
Epoch 2/10
----------
Train loss: 1.9965
Validation loss: 0.9972
Epoch 3/10
----------
Train loss: 1.5987
Validation loss: 0.9813
Epoch 4/10
----------
Train loss: 1.5671
Validation loss: 0.9870
Epoch 5/10
----------
Train loss: 1.4980
Validation loss: 1.0025
Epoch 6/10
----------
Train loss: 1.4686
Validation loss: 0.9885
Epoch 7/10
----------
Train loss: 1.4390
Validation loss: 0.9836
Epoch 8/10
----------
Train loss: 1.4960
Validation loss: 0.9809
Epoch 9/10
----------
Train loss: 1.4475
Validation loss: 0.9686
Epoch 10/10
----------
Train loss: 1.4441
Validation loss: 0.9982

Training finished!
Best validation loss: 0.9686
