In [1]:
import numpy as np
import pandas as pd

In [3]:
df_final = pd.read_csv('./cleaned_data_txt.csv')
df_final.columns

Index(['text', 'target'], dtype='object')

In [4]:
texts = df_final['text'].astype(str).tolist()
labels = df_final['target'].tolist()
texts = np.array(texts)
labels = np.array(labels)

In [5]:
from sklearn.model_selection import train_test_split
# assume X is your data features and y is your target variable

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# define the text data and their corresponding labels
# texts = ['This movie is great', 'The movie was terrible', 'The acting was amazing', 'I did not like the film']
# labels = [1, 0, 1, 0]  # 1 for positive sentiment, 0 for negative sentiment

# create a tokenizer and fit it to the text data
tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)

# convert the text data to sequences and pad them to have the same length
sequences = tokenizer.texts_to_sequences(X_train)
maxlen = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post')

# define the model architecture
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32, input_length=maxlen),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid')
])

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train the model
model.fit(padded_sequences, y_train, epochs=10)

2023-03-09 20:47:10.770769: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-09 20:47:11.090722: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-09 20:47:11.158148: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-09 20:47:11.158171: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

Epoch 1/10
Epoch 2/10
Epoch 3/10

In [7]:
# evaluate the model on new data
new_texts = ['thing puts smile face brother hes best', 'anxiety literally ga kill someone anyone help pls mens sleep drinking water help', 'I am so depressed kill ', 'life is beautiful', 'lets play in the sun','i am very much scared of medha']
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=maxlen, padding='post')
predictions = model.predict(new_padded_sequences)
for i, prediction in enumerate(predictions):
    sentiment = 'depressed' if prediction >= 0.5 else 'not depressed'
    print(f"{new_texts[i]} -> {sentiment} ({prediction[0]})")
#bracket represent probability of class 1(depressed). if prob > 0.5, its classified as depressed, else not.

thing puts smile face brother hes best -> not depressed (0.002152541186660528)
anxiety literally ga kill someone anyone help pls mens sleep drinking water help -> depressed (0.9907008409500122)
I am so depressed kill  -> depressed (0.9696369767189026)
life is beautiful -> not depressed (0.01041035819798708)
lets play in the sun -> not depressed (0.033363226801157)
i am very much scared of medha -> depressed (0.7482262849807739)


In [9]:
# evaluate the model on test data
y_pred = []
new_sequences = tokenizer.texts_to_sequences(X_test)
new_padded_sequences = pad_sequences(new_sequences, maxlen=maxlen, padding='post')
predictions = model.predict(new_padded_sequences)
for i, prediction in enumerate(predictions):
    output = 1 if prediction >= 0.5 else 0
    y_pred.append(output)


from sklearn.metrics import accuracy_score

# assume y_true and y_pred are your true and predicted labels, respectively

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9291784702549575


In [14]:
print(len(y_test))
print(np.count_nonzero(y_test == 1))
print(np.count_nonzero(y_test == 0))

353
194
159
