In [1]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pandas as pd

# Load Mental BERT

In [3]:
# Load model and tokenizer
model_name = "mental/mental-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else 
                    "mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def get_embeddings(texts, batch_size=64):
    """Extract CLS token embeddings in batches"""
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        
        # Tokenize and convert to tensors
        inputs = tokenizer(
            batch, 
            padding=True, 
            truncation=True, 
            max_length=512, 
            return_tensors="pt"
        ).to(device)
        
        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use CLS token as sentence embedding
        batch_embeddings = outputs.last_hidden_state[:,0,:].cpu().numpy()
        embeddings.append(batch_embeddings)
    
    return np.vstack(embeddings)

# Batch preprocessing of text

In [25]:
# Load preprocessed data
train_df = pd.read_csv('data/train_preprocessed_fill_missing.csv')
test_df = pd.read_csv('data/test_preprocessed_fill_missing.csv')


In [26]:
# Check test data before processing
print("Missing values in test cleaned_text:", test_df['cleaned_text'].isna().sum())
print("Non-string values:", test_df[~test_df['cleaned_text'].apply(lambda x: isinstance(x, str))].shape[0])

# Force convert to strings
test_df['cleaned_text'] = test_df['cleaned_text'].astype(str)

Missing values in test cleaned_text: 0
Non-string values: 0


In [17]:
train_df.head()

Unnamed: 0,id,cleaned_text,target
0,11098,post remove request member hi welcome immediat...,suicidal-thoughts-and-self-harm
1,116,hi nmtb thank post think lot people terrify st...,anxiety
2,7189,hello cas fair anxiety depression work lot com...,anxiety
3,4350,hey everyone discover another mum 's sister de...,anxiety
4,9749,hi everyone guess title say really .. 28 year ...,depression


In [27]:

# Generate embeddings (preserve order)
print("Generating training embeddings...")
train_embeddings = get_embeddings(train_df['cleaned_text'].tolist())

print("Generating test embeddings...")
test_embeddings = get_embeddings(test_df['cleaned_text'].tolist())

Generating training embeddings...
Generating test embeddings...


100%|██████████| 39/39 [01:21<00:00,  2.08s/it]


# Save embeddings for later reuse

In [28]:
# Save embeddings with IDs for reference
np.save('data/train_embeddings.npy', train_embeddings)
np.save('data/test_embeddings.npy', test_embeddings)

In [29]:
test_embeddings.shape

(2462, 768)

# Enriching features

In [1]:
# Add metadata features to embeddings
from textstat import flesch_reading_ease
from transformers import pipeline

In [2]:


sentiment_analyzer = pipeline("sentiment-analysis", device='cuda:0')  # GPU

def enrich_features(text):
    return [
        len(text.split()),  # Word count
        flesch_reading_ease(text),  # Readability score
        sentiment_analyzer(text[:512])[0]['score']  # Sentiment score
    ]


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


AttributeError: module 'torch' has no attribute 'frombuffer'

## TF-IDF features

In [18]:
# TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz

In [9]:
train = pd.read_csv('data/train_preprocessed_fill_missing.csv')
test = pd.read_csv('data/test_preprocessed_fill_missing.csv')

In [None]:

# Defining our vectorizer with total words of 5000 and with bigram model


In [None]:
TF_IDF = TfidfVectorizer(max_features = 4096, ngram_range = (2, 2))
X = TF_IDF.fit_transform(train["cleaned_text"])

In [14]:
X.shape

(22151, 4096)

In [20]:
save_npz('data/train_tfidf_embeddings.npz', X)

In [16]:
X_test = TF_IDF.transform(test['cleaned_text'])


In [21]:
save_npz('data/test_tfidf_embeddings.npz', X_test)