In [1]:
import pandas as pd

REDDIT = '/kaggle/input/stress-detection-from-social-media-articles/Reddit_Combi.csv'
USECOLS = ['Body_Title', 'label']
df = pd.read_csv(filepath_or_buffer=REDDIT, sep=';', usecols=USECOLS)
df.head()

Unnamed: 0,Body_Title,label
0,Envy to other is swallowing me Im from develop...,1
1,Nothin outta the ordinary. Paradise. Job stres...,1
2,Almost 49 and the chasm of emptiness has never...,1
3,I’m happy again After my closest friend left m...,0
4,Is it possible to recover from such a traumati...,1


In [2]:
from plotly import express
express.pie(data_frame=df, names='label', color='label')

Our classes are unbalanced.

In [3]:
express.histogram(x=df['Body_Title'].str.len(), log_y=True)

Most of the time our content is relatively short as measured in characters.

In [4]:
express.histogram(x=df['Body_Title'].str.split().str.len(), log_y=True)

In [5]:
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_tags
CUSTOM_FILTERS = [lambda x: x.lower(), 
                  remove_stopwords, 
                  strip_multiple_whitespaces, 
                  strip_numeric,
                  strip_punctuation,
                  strip_short,
                  strip_tags, 
                 ]
documents = df['Body_Title'].values.tolist()
texts = [preprocess_string(s=document, filters=CUSTOM_FILTERS) for document in documents]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(dictionary)

Dictionary<14372 unique tokens: ['afford', 'age', 'beetwen', 'better', 'big']...>


In [6]:
from arrow import now
# this determines our runtime and also helps determine how many low-frequency tokens we keep
MAX_VOCAB_SIZE = 5000
from gensim.models import Word2Vec
time_start = now()
word2vec_model = Word2Vec(sentences=texts, vector_size=100, window=5, workers=4, seed=2023, max_vocab_size=MAX_VOCAB_SIZE)
print('vocabulary size: {}'.format(len(word2vec_model.wv)))
print('word2vec time: {}'.format(now() - time_start))

vocabulary size: 2025
word2vec time: 0:00:01.001803


In [7]:
from math import log10
from pandas import DataFrame
from sklearn.manifold import TSNE
time_start = now()
init = ['pca', 'random'][0] # choose this to see different shapes
N_COMPONENTS = 3 # we get more diffusion if we get 3 t-sne components
tsne = TSNE(random_state=2023, n_iter=10000, verbose=1, init=init, n_components=N_COMPONENTS,)
tsne_result = tsne.fit_transform(X=word2vec_model.wv.vectors)
tsne_df = DataFrame(data=tsne_result, columns=['x', 'y', 'z'])
tsne_df['word'] = list(word2vec_model.wv.key_to_index.keys())
tsne_df['weight'] = tsne_df['word'].apply(func=lambda x: word2vec_model.wv[x].sum())
tsne_df['count'] = tsne_df['word'].apply(func=lambda x: log10(word2vec_model.wv.get_vecattr(key=x, attr='count')))
print(now() - time_start)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2025 samples in 0.001s...
[t-SNE] Computed neighbors for 2025 samples in 0.149s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2025
[t-SNE] Computed conditional probabilities for sample 2000 / 2025
[t-SNE] Computed conditional probabilities for sample 2025 / 2025
[t-SNE] Mean sigma: 0.059485
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.946880
[t-SNE] KL divergence after 4850 iterations: 0.643210
0:02:27.910442


In [8]:
express.scatter(data_frame=tsne_df, x='x', y='y', hover_name='word', color='weight')

In [9]:
from pandas import DataFrame
from plotly.express import scatter
from umap import UMAP

init = ['pca',  'random', 'spectral', 'tswspectral'][0]
columns = ['u0', 'u1', 'u2']
umap_model = UMAP(n_components=3, random_state=2023, verbose=1, init=init, n_jobs=1)
umap_df = DataFrame(data=umap_model.fit_transform(X=word2vec_model.wv.vectors,), columns=columns)
umap_df['word'] = list(word2vec_model.wv.key_to_index.keys())
umap_df['weight'] = umap_df['word'].apply(func=lambda x: word2vec_model.wv[x].sum())
umap_df['count'] = umap_df['word'].apply(func=lambda x: log10(word2vec_model.wv.get_vecattr(key=x, attr='count')))

scatter(data_frame=umap_df, x='u0', y='u1', hover_name='word', color='weight')

2024-03-23 17:48:26.084219: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-23 17:48:26.084351: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-23 17:48:26.269179: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(init='pca', n_components=3, n_jobs=1, random_state=2023, verbose=1)
Sat Mar 23 17:48:39 2024 Construct fuzzy simplicial set
Sat Mar 23 17:48:45 2024 Finding Nearest Neighbors
Sat Mar 23 17:48:49 2024 Finished Nearest Neighbor Search
Sat Mar 23 17:48:53 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Mar 23 17:48:57 2024 Finished embedding


In [10]:
from arrow import now
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
doc2vec_start = now()
doc2vec_model = Doc2Vec(vector_size=100, min_count=20, epochs=40)
corpus_iterable = [TaggedDocument(item, [index]) for index, item in enumerate(corpus) ]
doc2vec_model.build_vocab(corpus_iterable=corpus_iterable)
doc2vec_model.train(corpus_iterable=corpus_iterable, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs,)
print('doc2vec training time: {}'.format(now() - doc2vec_start))

doc2vec training time: 0:00:20.406624


In [11]:
df['vectors'] = doc2vec_model.dv.vectors.tolist()

In [12]:
df.shape

(3123, 3)

In [13]:
from umap import UMAP

doc2vec_umap_start = now()
doc2vec_umap_model = UMAP(n_components=2, random_state=2024, verbose=1, init='pca', n_jobs=1)
df[['x', 'y']] = doc2vec_umap_model.fit_transform(X=df['vectors'].apply(func=pd.Series),)
df['short document'] = df['Body_Title'].str[:80]
print('doc2vec umap time: {}'.format(now() - doc2vec_umap_start))

UMAP(init='pca', n_jobs=1, random_state=2024, verbose=1)
Sat Mar 23 17:49:19 2024 Construct fuzzy simplicial set
Sat Mar 23 17:49:29 2024 Finding Nearest Neighbors
Sat Mar 23 17:49:30 2024 Finished Nearest Neighbor Search
Sat Mar 23 17:49:30 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Mar 23 17:49:37 2024 Finished embedding
doc2vec umap time: 0:00:18.775720


In [14]:
# color=['u2', 'language'][0]
scatter(data_frame=df, x='x', y='y', color='label', height=800, hover_name='short document')

This is not particularly encouraging. Doc2vec does not appear to cluster our documents according to their labels.

In [15]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['vectors'].apply(func=pd.Series), df['label'], test_size=0.25, random_state=2024, stratify=df['label'])

time_start = arrow.now()
model = LogisticRegression(max_iter=100000, tol=1e-12).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))
print('model done in {}'.format(now() - time_start))

model fit in 59 iterations took 0:00:00.062676
accuracy: 0.8860
model done in 0:00:00.074141


In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.64      0.15      0.24        95
           1       0.89      0.99      0.94       686

    accuracy                           0.89       781
   macro avg       0.76      0.57      0.59       781
weighted avg       0.86      0.89      0.85       781



Our doc2vec/logistic regression solution does poorly with the 0 class.

In [17]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

gauss = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=2024)
gauss.fit(X=X_train, y=y_train)

print(classification_report(y_true=y_test, y_pred=gauss.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.64      0.22      0.33        95
           1       0.90      0.98      0.94       686

    accuracy                           0.89       781
   macro avg       0.77      0.60      0.63       781
weighted avg       0.87      0.89      0.87       781



Our Gaussian Process model doesn't do much better.
