In [3]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [4]:
# Load embeddings from pickle files
with open('./mental_health_datasets/ner_embeddings.pkl', 'rb') as f:
    ner_embeddings = pickle.load(f)  # Adjust the loading method if necessary

with open('./mental_health_datasets/sentiment_embeddings.pkl', 'rb') as f:
    sentiment_embeddings = pickle.load(f)  # Adjust the loading method if necessary

ner_embeddings = torch.tensor(ner_embeddings)
sentiment_embeddings = torch.tensor(sentiment_embeddings)
# Combine embeddings
embeddings = [torch.cat((ner, sentiment), dim=0) for ner, sentiment in zip(ner_embeddings, sentiment_embeddings)]
embeddings = torch.stack(embeddings)

# Assuming you have a CSV file containing the labels
data = pd.read_csv('./mental_health_datasets/labels_data.csv')  # Contains the 'Category' column

# Extract labels
# labels = data['Category'].values  # Assuming 'Category' is your target column
labels = ["Positive", "Negative", "Neutral"] * (len(embeddings) // 3)  # Adjust as necessary

# Train-test split
train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(embeddings, labels, test_size=0.2, random_state=42)


In [5]:
# Define your dataset class
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.embeddings[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets
train_dataset = EmbeddingDataset(train_embeddings, train_labels)
test_dataset = EmbeddingDataset(test_embeddings, test_labels)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialize BERT model for classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(labels)))

# Set up optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def train_model(model, train_dataloader, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            labels = batch['labels']
            outputs = model(input_ids=input_ids.unsqueeze(0), labels=labels)  # Adjust for batch size
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print(f"Loss: {loss.item()}")

# Evaluation function
def evaluate_model(model, test_dataloader):
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids']
            labels = batch['labels']
            outputs = model(input_ids=input_ids.unsqueeze(0))  # Adjust for batch size
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    accuracy = correct_predictions / total_predictions
    print(f"Accuracy on the test set: {accuracy:.4f}")


In [7]:
# Start training
train_model(model, train_dataloader)

Epoch 1/3


TypeError: new(): invalid data type 'str'

In [None]:
# Evaluate the model
evaluate_model(model, test_dataloader)