In [None]:
import pickle
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the modelling corpus. The file name suggests it is the output of a
# separate preprocessing step -- NOTE(review): confirm where it is produced.
DATA_PATH = 'preprocessed_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# 'Unnamed: 0' is the name pandas gives a header-less first CSV column
# (presumably an index written out by an earlier to_csv -- confirm).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column has already been removed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
# Learn the subreddit-name -> integer-id mapping, then replace the column with
# the integer ids. The fitted encoder is kept so predictions can be decoded
# back to subreddit names later.
# NOTE(review): the column is overwritten in place, so re-running this cell on
# the already-encoded frame would refit the encoder on integers.
label_encoder = LabelEncoder().fit(df['subreddit'])
df['subreddit'] = label_encoder.transform(df['subreddit'])

# One output unit per distinct subreddit.
num_classes = len(label_encoder.classes_)

In [None]:
# Hold out 20% of the rows for final evaluation; the fixed seed makes the
# partition itself repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['subreddit'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Vocabulary cap and fixed sequence length fed to the network.
max_words, max_len = 50000, 200

# The vocabulary is learned from the training texts only, so the test split
# does not influence which words are kept.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of word ids.
X_train_seq, X_test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

# Lists of word ids -> fixed-width (max_len) integer matrices.
X_train_pad, X_test_pad = [
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
]

In [None]:
# One-hot encode the integer labels to match the softmax output and the
# categorical cross-entropy loss used at compile time.
y_train_cat, y_test_cat = [
    to_categorical(labels, num_classes=num_classes)
    for labels in (y_train, y_test)
]

In [None]:
# Architecture: word-id embedding -> single LSTM layer -> softmax over classes.
model = Sequential([
    # 128-dimensional vector per vocabulary entry, for sequences of max_len ids.
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    # Dropout is applied to both the inputs and the recurrent state.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # One probability per subreddit class.
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Categorical cross-entropy pairs with the one-hot targets built earlier.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train for 10 epochs, holding 20% of the training data out for validation.
# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(
    X_train_pad,
    y_train_cat,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the held-out test split: accuracy plus macro-averaged precision,
# recall and F1 (macro = unweighted mean over the classes).
# All names used here (np, accuracy_score, precision_recall_fscore_support)
# come from the notebook's import cell; nothing is imported mid-notebook.
y_pred = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)  # most probable class per row

accuracy = accuracy_score(y_test, y_pred_classes)

# The fourth return value (support) is not needed; slicing it off avoids
# assigning to `_`, which would clobber IPython's last-output variable.
precision, recall, f1_score = precision_recall_fscore_support(
    y_test, y_pred_classes, average='macro'
)[:3]

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Accuracy: 0.6564196481130224
Precision: 0.6507132627012306
Recall: 0.658034591321496
F1 Score: 0.6535821210921325


In [None]:
# Per-class breakdown on the test split. Inference is re-run here so the cell
# can be executed without the previous metrics cell.
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)  # most probable class per row

print("Accuracy:", accuracy_score(y_test, y_pred_classes))

# Rows/labels are the integer ids assigned by label_encoder.
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred_classes),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_classes),
)

Accuracy: 0.6564196481130224
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.73      0.73      5220
           1       0.94      0.97      0.95      5216
           2       0.44      0.37      0.40      5287
           3       0.69      0.73      0.71      4645
           4       0.53      0.51      0.52      5163
           5       0.57      0.63      0.60      5047

    accuracy                           0.66     30578
   macro avg       0.65      0.66      0.65     30578
weighted avg       0.65      0.66      0.65     30578

Confusion Matrix:
 [[3807   35  324  175  714  165]
 [  19 5041   29   58   16   53]
 [ 321   62 1974  635  992 1303]
 [ 158  115  492 3390  199  291]
 [ 789   32  829  300 2656  557]
 [ 152   67  859  321  444 3204]]


In [None]:
# Persist everything needed to serve predictions later: the trained network,
# plus the fitted tokenizer and label encoder it depends on.
model.save('reddit_model1.h5')

for path, artifact in (
    ('tokenizer1.pkl', tokenizer),
    ('label_encoder1.pkl', label_encoder),
):
    with open(path, 'wb') as file:
        pickle.dump(artifact, file)

  saving_api.save_model(


In [None]:
def preprocess_text(text):
    """Normalize a raw string before it is tokenized for prediction.

    Steps, in order:
      1. remove ``<...>`` tags (non-greedy; ``.`` does not cross newlines)
      2. remove ASCII punctuation (``string.punctuation``)
      3. delete newline characters
      4. remove digit runs
      5. lowercase
      6. replace every remaining non-word character with a space
      7. collapse whitespace runs to a single space

    Whitespace is collapsed last because step 6 introduces spaces; collapsing
    before it (as this function previously did) could leave runs of spaces in
    the result, e.g. around non-ASCII punctuation such as an em dash.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string. Leading/trailing spaces are not
        stripped.
    """
    text = re.sub('<.*?>', '', text)  # Remove tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words on adjacent lines are merged ("a\nb" -> "ab"). Left unchanged here
    # on the assumption that the training corpus was cleaned the same way --
    # confirm against the preprocessing that produced the training CSV.
    text = re.sub('\n', '', text)  # Remove newlines
    text = re.sub('[0-9]+', '', text)  # Remove numbers
    text = text.lower()  # Lowercase the text
    text = re.sub(r'\W', ' ', text)  # Replace remaining special characters
    text = re.sub(r'\s+', ' ', text)  # Collapse extra spaces (must run last)

    return text


In [None]:
def predict_text_label(text):
    """Classify one raw text and return its decoded label.

    Relies on the notebook-level ``tokenizer``, ``max_len``, ``model`` and
    ``label_encoder`` objects, and on ``preprocess_text`` for cleaning.

    Args:
        text: The raw input string to classify.

    Returns:
        The label for the highest-scoring class, decoded through
        ``label_encoder.inverse_transform``.
    """
    cleaned = preprocess_text(text)

    # Wrap in a list: the tokenizer and the model both work on batches.
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=max_len)

    probabilities = model.predict(padded)
    class_index = np.argmax(probabilities, axis=1)[0]
    return label_encoder.inverse_transform([class_index])[0]

In [None]:
# Smoke-test the end-to-end prediction path on a hand-written example.
sample_text = "I feel like to jump off the terrace."
predicted_label = predict_text_label(sample_text)
print("Predicted label:", predicted_label)

Predicted label: suicidewatch


In [None]:
# Second hand-written example for the end-to-end prediction path.
sample_text = "Where does life start and where it ends, who knows!"
predicted_label = predict_text_label(sample_text)
print("Predicted label:", predicted_label)

Predicted label: depression
