In [1]:
import pandas as pd

REDDIT = '/kaggle/input/stress-detection-from-social-media-articles/Reddit_Combi.csv'
USECOLS = ['Body_Title', 'label']
df = pd.read_csv(filepath_or_buffer=REDDIT, sep=';', usecols=USECOLS)
df['label'] = df['label'] == 1
df.head()

Unnamed: 0,Body_Title,label
0,Envy to other is swallowing me Im from develop...,True
1,Nothin outta the ordinary. Paradise. Job stres...,True
2,Almost 49 and the chasm of emptiness has never...,True
3,I’m happy again After my closest friend left m...,False
4,Is it possible to recover from such a traumati...,True


Are our classes balanced?

In [2]:
df['label'].value_counts(normalize=True).to_frame().T

label,True,False
proportion,0.878963,0.121037


No they are not; the False/0 class outnumbers the True/1 class about eight to one.

In [3]:
from plotly import express
from plotly.io import renderers

express.histogram(x=df['Body_Title'].str.len(), log_y=True)

Most of the time our content is relatively short as measured in characters.

In [4]:
express.histogram(x=df['Body_Title'].str.split().str.len(), log_y=True)

Let's first try a doc2vec model to transform our documents into vectors we can use for classification. We need to first turn our documents into a gensim corpus.

In [5]:
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_tags
CUSTOM_FILTERS = [lambda x: x.lower(), 
                  remove_stopwords, 
                  strip_multiple_whitespaces, 
                  strip_numeric,
                  strip_punctuation,
                  strip_short,
                  strip_tags, 
                 ]
documents = df['Body_Title'].values.tolist()
texts = [preprocess_string(s=document, filters=CUSTOM_FILTERS) for document in documents]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(dictionary)

Dictionary<14372 unique tokens: ['afford', 'age', 'beetwen', 'better', 'big']...>


Now we are ready to train our doc2vec model.

In [6]:
from arrow import now
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
doc2vec_start = now()
doc2vec_model = Doc2Vec(vector_size=100, min_count=20, epochs=40)
corpus_iterable = [TaggedDocument(item, [index]) for index, item in enumerate(corpus) ]
doc2vec_model.build_vocab(corpus_iterable=corpus_iterable)
doc2vec_model.train(corpus_iterable=corpus_iterable, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs,)
df['vectors'] = doc2vec_model.dv.vectors.tolist()
print('doc2vec training time: {}'.format(now() - doc2vec_start))

doc2vec training time: 0:00:18.230343


Let's use dimension reduction to see if our document vectors contain a signal that will be easy for a model to find; i.e. do our labels cluster in a UMAP projection?

In [7]:
from umap import UMAP

doc2vec_umap_start = now()
doc2vec_umap_model = UMAP(n_components=2, random_state=2024, verbose=1, init='pca', n_jobs=1)
df[['x', 'y']] = doc2vec_umap_model.fit_transform(X=df['vectors'].apply(func=pd.Series),)
df['short document'] = df['Body_Title'].str[:80]
print('doc2vec umap time: {}'.format(now() - doc2vec_umap_start))

2025-07-07 16:14:59.940674: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-07 16:14:59.940816: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-07 16:15:00.109129: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(init='pca', n_jobs=1, random_state=2024, verbose=1)
Mon Jul  7 16:15:13 2025 Construct fuzzy simplicial set
Mon Jul  7 16:15:19 2025 Finding Nearest Neighbors
Mon Jul  7 16:15:23 2025 Finished Nearest Neighbor Search
Mon Jul  7 16:15:26 2025 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Jul  7 16:15:33 2025 Finished embedding
doc2vec umap time: 0:00:20.704430


In [8]:
express.scatter(data_frame=df, x='x', y='y', color='label', height=800, hover_name='short document')





This is not particularly encouraging. Doc2vec does not appear to cluster our documents according to their labels.

In [9]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['vectors'].apply(func=pd.Series), df['label'], test_size=0.25, random_state=2024, stratify=df['label'])

time_start = arrow.now()
regression = LogisticRegression(max_iter=100000, tol=1e-12).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(regression.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=regression.predict(X=X_test))))
print('model done in {}'.format(now() - time_start))

model fit in 57 iterations took 0:00:00.058061
accuracy: 0.8886
model done in 0:00:00.068969


An accuracy of nearly 0.9 seems encouraging, but our classes are unbalanced to the point that we can get an accuracy of nearly 0.9 with a dummy model that labels every document 1. Let's look at the classification report.

In [10]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=regression.predict(X=X_test)))

              precision    recall  f1-score   support

       False       0.68      0.16      0.26        95
        True       0.89      0.99      0.94       686

    accuracy                           0.89       781
   macro avg       0.79      0.57      0.60       781
weighted avg       0.87      0.89      0.86       781



Our doc2vec/logistic regression solution does poorly with the 0 class.

In [11]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

gauss = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=2024)
gauss.fit(X=X_train, y=y_train)

print(classification_report(y_true=y_test, y_pred=gauss.predict(X=X_test)))

              precision    recall  f1-score   support

       False       0.61      0.24      0.35        95
        True       0.90      0.98      0.94       686

    accuracy                           0.89       781
   macro avg       0.75      0.61      0.64       781
weighted avg       0.87      0.89      0.87       781



Our Gaussian Process model doesn't do much better.


Can we do better with BERT embeddings? Let's find out.

In [12]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false



os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



pip install keybert complete.


In [13]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 4
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
DOCS = df['Body_Title'].values.tolist()

model_start = now()
bert = KeyBERT(model=MODEL,)
bert.max_seq_length = 512
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF, )
document_embeddings, word_embeddings = bert.extract_embeddings(docs=DOCS, vectorizer=vectorizer, )
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding time: 0:03:05.822236
we have 3123 documents and 4663 words.


In [14]:
df['embedding'] = document_embeddings.tolist()
embedding_umap_model = UMAP(n_components=2, random_state=2024, verbose=1, init='pca', n_jobs=1)
df[['ex', 'ey']] = embedding_umap_model.fit_transform(X=df['embedding'].apply(func=pd.Series),)
express.scatter(data_frame=df, x='ex', y='ey', color='label', height=800, hover_name='short document')

UMAP(init='pca', n_jobs=1, random_state=2024, verbose=1)
Mon Jul  7 16:20:59 2025 Construct fuzzy simplicial set
Mon Jul  7 16:21:05 2025 Finding Nearest Neighbors
Mon Jul  7 16:21:06 2025 Finished Nearest Neighbor Search
Mon Jul  7 16:21:06 2025 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Jul  7 16:21:12 2025 Finished embedding


This is much better. Let's see how our models do now.

In [15]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Xe_train, Xe_test, ye_train, ye_test = train_test_split(df['embedding'].apply(func=pd.Series), df['label'], test_size=0.25, random_state=2024, stratify=df['label'])

time_start = arrow.now()
embedding_regression = LogisticRegression(max_iter=100000, tol=1e-12).fit(X=Xe_train, y=ye_train)
print('model fit in {} iterations took {}'.format(embedding_regression.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=ye_test, y_pred=embedding_regression.predict(X=Xe_test))))
print('model done in {}'.format(now() - time_start))

model fit in 25 iterations took 0:00:00.040424
accuracy: 0.9449
model done in 0:00:00.066001


In [16]:
print(classification_report(y_true=ye_test, y_pred=embedding_regression.predict(X=Xe_test)))

              precision    recall  f1-score   support

       False       0.92      0.60      0.73        95
        True       0.95      0.99      0.97       686

    accuracy                           0.94       781
   macro avg       0.93      0.80      0.85       781
weighted avg       0.94      0.94      0.94       781



Logistic Regression performs much better on the 0 class with embeddings than with document vectors.

In [17]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(alpha=1.0, max_iter=100000, random_state=2024).fit(X=Xe_train, y=ye_train)
print('score: {:5.4f}'.format(mlp.score(X=Xe_test, y=ye_test)))

print(classification_report(y_true=ye_test, y_pred=mlp.predict(X=Xe_test)))

score: 0.9462
              precision    recall  f1-score   support

       False       0.92      0.61      0.73        95
        True       0.95      0.99      0.97       686

    accuracy                           0.95       781
   macro avg       0.93      0.80      0.85       781
weighted avg       0.95      0.95      0.94       781



We can do ever so slightly better with an sklearn neural net model.