In [None]:
import pandas as pd

# Path of the preprocessed CSV file that the rest of the notebook works on.
data_path = '/content/drive/MyDrive/Colab Notebooks/preprocessed_data.csv'

# Read the whole file into a DataFrame with pandas' default parsing options.
data = pd.read_csv(data_path)

# Quick sanity check: the first rows, followed by the row count per subreddit.
preview = data.head()
print(preview)

subreddit_counts = data['subreddit'].value_counts()
print(subreddit_counts)


   Unnamed: 0     subreddit                                               text
0           0  suicidewatch  there s point continue lose job last friday I ...
1           1  suicidewatch  friend keep find reddit account hate friend mu...
2           2  suicidewatch  tired throwaway account I ve marry year think ...
3           3  suicidewatch  think mom might commit suicide I ’ve talk mom ...
4           4  suicidewatch  do hii know anyone read I m write this I m wal...
subreddit
depression      26241
suicidewatch    25944
casual          25874
anxiety         25870
mentalhealth    25325
lonely          23635
Name: count, dtype: int64


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import numpy as np

# Fetch the VADER lexicon before constructing the analyzer that uses it.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the VADER polarity scores for one text."""
    return sid.polarity_scores(text)['compound']


# Two extra columns per row: len() of the text and its compound sentiment score.
data['text_length'] = data['text'].map(len)
data['sentiment'] = data['text'].map(_compound_score)

# TF-IDF representation of the text, capped at 500 features.
tfidf = TfidfVectorizer(max_features=500)
tfidf_features = tfidf.fit_transform(data['text'])

# Densify the TF-IDF matrix and append the two extra columns on the right.
extra_columns = data[['text_length', 'sentiment']].to_numpy()
features = np.concatenate((tfidf_features.toarray(), extra_columns), axis=1)

# Integer code per row for its subreddit; factorize() also returns the unique
# values, which are not kept here.
labels = data['subreddit'].factorize()[0]

print(features.shape)
print(labels.shape)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


(152889, 502)
(152889,)


In [None]:
from scipy import sparse
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, PolynomialFeatures
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Degree-2 expansion restricted to pairwise products (no squared terms).
# With the 502 input columns built above this yields
# 1 + 502 + 502*501/2 = 126,254 output columns.
poly = PolynomialFeatures(degree=2, interaction_only=True)

mlp = MLPClassifier(hidden_layer_sizes=(35,), max_iter=200, random_state=42)

# Pipeline steps, in order:
#   to_sparse - convert the incoming array to CSR. A dense float64 expansion of
#               ~122k training rows x 126,254 columns would need well over
#               100 GB; PolynomialFeatures returns a sparse result for CSR
#               input, and most TF-IDF entries (and hence most products) are 0.
#   scale     - divide each column by its maximum absolute value. MaxAbsScaler
#               does not centre the data, so sparsity is preserved. This keeps
#               the raw `text_length` column from dominating the TF-IDF columns
#               (and every interaction it takes part in); MLPs are sensitive to
#               feature scale.
#   poly      - pairwise interaction features.
#   mlp       - single-hidden-layer classifier.
# The conversion lives inside the pipeline so that `features`, `X_train` and
# `X_test` keep their original dense-array type for any other cell using them.
model = Pipeline([
    ('to_sparse', FunctionTransformer(sparse.csr_matrix)),
    ('scale', MaxAbsScaler()),
    ('poly', poly),
    ('mlp', mlp)
])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# `labels` was built with data['subreddit'].factorize()[0]; the second value
# returned by factorize() lists the subreddit names so that position i is the
# name behind integer label i. Taking the names from the same call makes the
# code -> name mapping explicit instead of relying on a separate unique() call.
class_names = list(data['subreddit'].factorize()[1])
class_ids = list(range(len(class_names)))

y_pred = model.predict(X_test)

# Passing `labels` and `target_names` prints subreddit names instead of bare
# integer codes and keeps every class in the report.
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

# Explicit `labels` fixes the row/column order (and the matrix size) so it
# always lines up with the tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()
