# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path and the model save path
# used in this notebook both live under that mount point.
drive.mount('/content/drive')
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D, Flatten, Dropout, BatchNormalization, Input

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

#Transfer Learning
from transformers import AutoTokenizer, TFBertModel

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Loading and Preprocessing

In [None]:
# Location of the dataset CSV on the mounted Drive; adjust it to match your own setup
file_path = '/content/drive/MyDrive/Dataset for Capstone/dataset.csv'

# Load the semicolon-separated, Latin-1 encoded CSV and, in the same chain,
# rename Column1/Column2 to Label/Text
df_first = (
    pd.read_csv(file_path, sep=';', encoding='latin1')
    .rename(columns={'Column1': 'Label', 'Column2': 'Text'})
)

# Preview the first rows
df_first.head()

Unnamed: 0,Label,Text
0,adhd,Berapa jauh di depan saya sekarang? Ventilasi ...
1,adhd,ADHD Bipolar ada orang lain yang memiliki gang...
2,adhd,Hubungan saya berantakan dan saya tidak tahu h...
3,adhd,Bagi mereka yang berjuang untuk menulis kertas...
4,adhd,Meds tidak menyembuhkan kemalasan kronis tetap...


In [None]:
# Report the DataFrame dimensions: rows first, then columns
row_count, column_count = df_first.shape
print('Number of row : ', row_count)
print('Number of column : ', column_count)

Number of row :  59996
Number of column :  2


In [None]:
# Count missing values per column
df_first.isna().sum()

Label    0
Text     0
dtype: int64

In [None]:
# Print a summary of the DataFrame: index range, column dtypes, non-null counts and memory usage
df_first.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59996 entries, 0 to 59995
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   59996 non-null  object
 1   Text    59996 non-null  object
dtypes: object(2)
memory usage: 937.6+ KB


In [None]:
# Count rows that are exact duplicates (all columns equal) of an earlier row
df_first.duplicated().sum()

5

In [None]:
# Remove rows that exactly duplicate an earlier row (the first occurrence is kept)
# and renumber the index so it is contiguous again. Rebinding instead of mutating
# in place keeps this cell a single, re-runnable expression.
df_first = df_first.drop_duplicates().reset_index(drop=True)

In [None]:
# Show rows whose text repeats an earlier row's text; after exact duplicates
# are removed, any hit here is the same text carrying a different label
df_first.loc[df_first['Text'].duplicated()]

Unnamed: 0,Label,Text


In [None]:
# Number of samples per label
df_first['Label'].value_counts()

adhd          19999
anxiety       19997
depression    19995
Name: Label, dtype: int64

# Text Preprocessing and Data Preparation

In [None]:
# Text-preprocessing hyperparameters
vocab_size = 5000        # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 32       # size of each embedding vector
max_length = 250         # every sequence is padded/truncated to this many tokens
trunc_type = 'post'      # cut tokens from the end of over-long sequences
padding_type = 'post'    # append padding at the end of short sequences
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words

# Fixed seed so the splits (and therefore the reported metrics) are the same on every run
split_seed = 42

# Divide the data into training (80%), validation (10%), and test (10%).
# Stratifying on the label keeps the class proportions equal in every split.
training_sentences, temp_sentences, training_labels, temp_labels = train_test_split(
    df_first['Text'],
    df_first['Label'],
    train_size=0.8,
    shuffle=True,
    random_state=split_seed,
    stratify=df_first['Label'],
)
# Split the held-out 20% in half: one part for validation, one for testing
validation_sentences, test_sentences, validation_labels, test_labels = train_test_split(
    temp_sentences,
    temp_labels,
    train_size=0.5,
    shuffle=True,
    random_state=split_seed,
    stratify=temp_labels,
)

In [None]:
# Build the vocabulary from the training split only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Map every split to integer sequences using that vocabulary
training_sequences, validation_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (training_sentences, validation_sentences, test_sentences)
)

# Pad/truncate every split to the same fixed length
training_padded, validation_padded, test_padded = (
    pad_sequences(split_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    for split_sequences in (training_sequences, validation_sequences, test_sequences)
)

In [None]:
# Fit a label encoder on the full label column so every class gets an integer id
label_tokenizer = LabelEncoder().fit(df_first['Label'])

# Turn each split's string labels into integer arrays
training_label_seq, validation_label_seq, testing_label_seq = (
    np.array(label_tokenizer.transform(split_labels))
    for split_labels in (training_labels, validation_labels, test_labels)
)

# Model Building and Training

In [None]:
def learning_rate_scheduler(epoch):
    """Return the learning rate for the given epoch.

    The schedule is constant: every epoch gets 0.001, so ``epoch`` is accepted
    but not used.
    """
    constant_rate = 0.001
    return constant_rate

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
# Callback that applies learning_rate_scheduler during training
lr_scheduler = LearningRateScheduler(learning_rate_scheduler)

In [None]:
# Assemble the classifier layer by layer
model = Sequential()
# Token ids -> dense vectors
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
# Bidirectional LSTM that emits one output per timestep
model.add(Bidirectional(LSTM(64, return_sequences=True)))
# Average over the time axis to get one vector per sample
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
# NOTE: the tensor is already 2-D after pooling, so Flatten leaves its shape unchanged
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
# One softmax output per class (3 classes)
model.add(Dense(3, activation='softmax'))

# Labels are integer-encoded (not one-hot), hence "sparse_categorical_crossentropy" as the loss function
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for at most 10 epochs, validating on the validation split after each one;
# early stopping and the learning-rate schedule are both attached as callbacks.
# (An earlier variant of this cell used epochs=7 with callbacks=[lr_scheduler] only.)
history = model.fit(
    x=training_padded,
    y=training_label_seq,
    epochs=10,
    validation_data=(validation_padded, validation_label_seq),
    callbacks=[early_stopping, lr_scheduler],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Model Evaluation and Prediction

In [None]:
# Model evaluation using the held-out test data. The validation split was already
# used for early stopping during training, so scoring it again would give an
# optimistic estimate; the test split has not been seen by the training loop.
evaluation_results = model.evaluate(test_padded, testing_label_seq)

# Display evaluation results: index 0 is the loss, index 1 the accuracy metric
# (the model was compiled with metrics=['accuracy'])
print("Loss:", evaluation_results[0])
print("Accuracy:", evaluation_results[1])

Loss: 0.3403307795524597
Accuracy: 0.8824804425239563


In [None]:
# New text sample(s) to classify
new_data = ["Merasa seperti ada beban berat di dadaku akhir-akhir ini. Kecemasan selalu mengintai, membuat jantung berdegup kencang dan pikiran jadi gelisah. Sulit untuk tenang dan merasa aman. Semoga suatu hari saya bisa menemukan cara untuk menghadapi dan mengatasi kecemasan ini, serta mendapatkan dukungan yang diperlukan."]

# Pre-process the new data with the same tokenizer and padding settings as the training data
new_sequences = tokenizer.texts_to_sequences(new_data)
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Class probabilities: one row per input text, one column per encoded label
predictions = model.predict(new_padded)

# Wording used in the summary message for each known label
condition_names = {'anxiety': 'anxiety', 'depression': 'depression', 'adhd': 'ADHD'}
# Minimum winning probability required to report a "high probability" verdict
confidence_threshold = 0.5

# Display the prediction result
for i, text in enumerate(new_data):
    print(f"Text: {text}")

    # Show the probability of every category, in the encoder's class order
    for category, score in zip(label_tokenizer.classes_, predictions[i]):
        probability = score * 100
        print(f"{category} Probability: {probability:.2f}%")

    # The category with the highest probability is the prediction
    best_index = int(np.argmax(predictions[i]))
    predicted_class = label_tokenizer.classes_[best_index]
    print(f"Predicted Class: {predicted_class}")

    # Read the winning probability through the argmax index rather than a
    # hard-coded column number, so the check stays correct even if the label
    # encoder's class order changes.
    confidence = predictions[i][best_index]
    if predicted_class in condition_names and confidence > confidence_threshold:
        print(f"Model predicts high probability of {condition_names[predicted_class]}.")
    else:
        # Reached when no known class clears the confidence threshold
        print("Model predicts other mental health condition.")

Text: Merasa seperti ada beban berat di dadaku akhir-akhir ini. Kecemasan selalu mengintai, membuat jantung berdegup kencang dan pikiran jadi gelisah. Sulit untuk tenang dan merasa aman. Semoga suatu hari saya bisa menemukan cara untuk menghadapi dan mengatasi kecemasan ini, serta mendapatkan dukungan yang diperlukan.
adhd Probability: 1.38%
anxiety Probability: 96.43%
depression Probability: 2.19%
Predicted Class: anxiety
Model predicts high probability of anxiety.


# Model Saving

In [None]:
# Persist the trained model to Drive using the TensorFlow SavedModel format
model.save(
    filepath="/content/drive/MyDrive/Model/path_to_saved_model",
    save_format="tf",
)