# Importing Libraries

In [None]:
# Data manipulation libraries
import sys, os
import pandas as pd
import numpy as np
import json
import re

# Scikit-learn packages
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Packages to define a BERT model
from transformers import TFBertModel, BertTokenizerFast, BertConfig

# Keras and TensorFlow packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dropout, Dense, LSTM, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Mount Google Drive at /content/drive (google.colab is only available inside Colab)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Loading and Preprocessing

In [None]:
# Location of the dataset on the mounted Drive; adjust to wherever your copy lives
file_path = '/content/drive/MyDrive/Dataset for Capstone/data_full_new_30k.csv'

# Load the semicolon-separated, latin-1 encoded CSV and give the two generic
# columns meaningful names in a single chained expression
df_first = (
    pd.read_csv(file_path, sep=';', encoding='latin1')
    .rename(columns={'Column1': 'Label', 'Column2': 'Text'})
)

# Preview the first few rows
df_first.head()

Unnamed: 0,Label,Text
0,anxiety,Apakah saya diadili? Jadi saya sering melakuka...
1,anxiety,Kekurangan nasional di Lorazepam telah mencoba...
2,anxiety,Minggu ini adalah omong kosong saya mengalami ...
3,anxiety,Adakah yang punya hobi/kegiatan yang membantu ...
4,anxiety,"Saya memesan sendiri hari ini, saya selalu per..."


In [None]:
# Report the DataFrame dimensions: rows first, then columns
row_count, column_count = df_first.shape
print('Number of row : ', row_count)
print('Number of column : ', column_count)

Number of row :  30002
Number of column :  2


In [None]:
# Missing-value count per column
df_first.isna().sum()

Label    0
Text     4
dtype: int64

In [None]:
# Discard every row that has a missing value in any column
df_first = df_first.dropna(axis=0, how='any')

In [None]:
# Summarise the index range, column dtypes and non-null counts
df_first.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29998 entries, 0 to 30001
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   29998 non-null  object
 1   Text    29998 non-null  object
dtypes: object(2)
memory usage: 703.1+ KB


In [None]:
# Count rows that exactly repeat an earlier row (all columns equal)
df_first.duplicated().sum()

4

In [None]:
# Remove exact duplicate rows (the first occurrence is kept) and renumber the
# index from 0. Done as one non-mutating expression instead of looking up the
# duplicate index and dropping/resetting with inplace=True.
df_first = df_first.drop_duplicates().reset_index(drop=True)

In [None]:
# Show rows whose text repeats an earlier row (same text, possibly a different label)
df_first.loc[df_first['Text'].duplicated()]

Unnamed: 0,Label,Text


In [None]:
# Class balance: number of rows per label
df_first['Label'].value_counts()

anxiety       10000
depression     9998
stress         9996
Name: Label, dtype: int64

# Model Building with BERT

In [None]:
# Longest text measured in whitespace-separated words.
# The measurement has to run on the 'Text' column: DataFrame.apply(func) passes
# func one whole column at a time, so applying it to the DataFrame measured the
# printed representation of each column rather than the individual texts.
# The result is capped at 512, the longest sequence BERT-base checkpoints accept;
# longer texts are truncated by the tokenizer.
max_length = int(min(df_first['Text'].astype(str).str.split().str.len().max(), 512))
max_length

88

In [None]:
# Pre-trained checkpoint shared by the config, the tokenizer and the encoder
model_name = 'bert-base-uncased'

# Per-layer hidden states are switched off in the config
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# function for creating BERT based model
def create_model(nb_labels, dropout_rate=0.2):
  """Build a Keras classifier on top of the pre-trained BERT encoder.

  Relies on three notebook globals: ``transformer_model`` (the loaded
  TFBertModel), ``config`` (its BertConfig) and ``max_length`` (the fixed
  sequence length of every input).

  Args:
    nb_labels: Number of output classes, i.e. units of the softmax layer.
    dropout_rate: Dropout applied to the BERT output before the classification
      layer. Defaults to 0.2, the value that used to be hard-coded.

  Returns:
    An uncompiled Keras ``Model`` named 'BERT_MultiLabel'. It takes a dict with
    the keys 'input_ids', 'attention_mask' and 'token_type_ids' (int32, each
    ``max_length`` long per example) and returns ``nb_labels`` softmax
    probabilities per example.
  """
  # Load the MainLayer
  bert = transformer_model.layers[0]

  # Build the model inputs; the names must match the dict keys fed to the model
  input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
  attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
  token_type_ids = Input(shape=(max_length,), name='token_type_ids', dtype='int32')

  inputs = {'input_ids': input_ids, 'attention_mask': attention_mask, 'token_type_ids': token_type_ids}

  # BERT layer: element 1 of the encoder output is used as the sentence
  # representation (the pooled output for TFBertModel)
  pooled_output = bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]

  # Add dropout for regularization
  regularized = Dropout(dropout_rate)(pooled_output)

  # Output layer, initialised with the same std-dev BERT uses for its own weights
  emotion = Dense(units=nb_labels, activation="softmax", kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='emotion')(regularized)

  # Build the model
  return Model(inputs=inputs, outputs=emotion, name='BERT_MultiLabel')

In [None]:
# Size of the softmax output layer, one unit per target label
NUM_CLASSES = 3

# Instantiate the classifier and print its layer-by-layer summary
model = create_model(NUM_CLASSES)
model.summary()

Model: "BERT_MultiLabel"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 88)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 88)]                 0         []                            
 )                                                                                                
                                                                                                  
 token_type_ids (InputLayer  [(None, 88)]                 0         []                            
 )                                                                                                
                                                                                    

# Text Preprocessing and Data Preparation

In [None]:
# 80/10/10 split: hold out 20% first, then halve the hold-out into validation and test
train_data, temp_data = train_test_split(df_first, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Separate the text (feature) from the label (target) for each subset
X_train, y_train = train_data['Text'], train_data['Label']
X_val, y_val = val_data['Text'], val_data['Label']
X_test, y_test = test_data['Text'], test_data['Label']

In [None]:
def tokenize_texts(texts):
    """Tokenize a collection of texts into fixed-length BERT inputs.

    Uses the notebook-level ``tokenizer`` and ``max_length``. Every text is
    truncated or padded to exactly ``max_length`` tokens (special tokens
    included), so all splits share the shape the model's input layers expect.

    Args:
        texts: Iterable of strings, e.g. a pandas Series or a list.

    Returns:
        The tokenizer output holding TensorFlow tensors for 'input_ids',
        'token_type_ids' and 'attention_mask'.
    """
    return tokenizer(
        text=list(texts),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=True,
        return_attention_mask=True,
        verbose=True)


# Tokenizing train, validation and test data with identical settings
train_token = tokenize_texts(X_train)
val_token = tokenize_texts(X_val)
test_token = tokenize_texts(X_test)

In [None]:
# Keep only the three tensors the model consumes, keyed by the model's input names
MODEL_INPUT_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

train = {key: train_token[key] for key in MODEL_INPUT_KEYS}
val = {key: val_token[key] for key in MODEL_INPUT_KEYS}
test = {key: test_token[key] for key in MODEL_INPUT_KEYS}

In [None]:
# Map the string labels to integer ids; the mapping is fitted on the training labels only
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_val, y_test)
)

# Expand the integer ids into one-hot vectors for categorical cross-entropy
y_train_one_hot, y_val_one_hot, y_test_one_hot = (
    to_categorical(encoded, num_classes=3)
    for encoded in (y_train_encoded, y_val_encoded, y_test_encoded)
)

In [None]:
# Creating batched tf.data pipelines
BATCH_SIZE = 16

# The shuffle buffer must cover the whole training set for a uniform shuffle.
# `train` is a dict with three keys, so len(train) is 3; using that as the
# buffer size left the data practically unshuffled. The number of examples is
# taken from the label array instead.
train_tensor = (
    tf.data.Dataset.from_tensor_slices((train, y_train_one_hot))
    .shuffle(len(y_train_one_hot))
    .batch(BATCH_SIZE)
)

# Validation and test data are only scored, never trained on, so they keep
# their original order; this also keeps predictions aligned with y_val / y_test.
val_tensor = tf.data.Dataset.from_tensor_slices((val, y_val_one_hot)).batch(BATCH_SIZE)
test_tensor = tf.data.Dataset.from_tensor_slices((test, y_test_one_hot)).batch(BATCH_SIZE)

# Training Model with BERT

In [None]:
# Set up EarlyStopping and ReduceLROnPlateau callbacks.
# Early stopping ends training after 3 epochs without val_loss improvement and
# restores the best weights; the LR scheduler multiplies the learning rate by
# 0.2 after 2 stagnant epochs, never going below 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

# Set an optimizer
optimizer = Adam(learning_rate=3.e-05)

# Compile the model
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'])

# Train the model. early_stopping was previously created but never handed to
# fit(), so training always ran all 15 epochs and the best weights were never
# restored; both callbacks are now registered.
history = model.fit(train_tensor,
                    epochs=15,
                    validation_data=val_tensor,
                    callbacks=[early_stopping, reduce_lr])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
 143/1500 [=>............................] - ETA: 7:00 - loss: 0.1092 - accuracy: 0.9615

# Model Evaluation and Prediction

In [None]:
# Model evaluation using the held-out test data. The validation set already
# steered the callbacks during training, so scoring it again would give an
# optimistic estimate; the test set has not been seen by the model.
evaluation_results = model.evaluate(test_tensor)

# Display evaluation results (order follows compile(): loss first, then accuracy)
print("Loss:", evaluation_results[0])
print("Accuracy:", evaluation_results[1])

In [None]:
# New text(s) to classify
new_data = ["hari ini gw stress banget fuckkk"]

# Tokenize the same way as the training data. The model's input layers have a
# fixed length of max_length, so the text must be padded to exactly that length
# (padding=True would only pad to the longest text in this batch).
new_tokenized = tokenizer(
    new_data,
    return_tensors='tf',
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_token_type_ids=True,
    return_attention_mask=True)

# Make predictions using the model
predictions = model.predict({'input_ids': new_tokenized['input_ids'],
                             'attention_mask': new_tokenized['attention_mask'],
                             'token_type_ids': new_tokenized['token_type_ids']})

# Minimum probability for a prediction to be reported as confident
CONFIDENCE_THRESHOLD = 0.5

# Display the prediction result
for i, text in enumerate(new_data):
    print(f"Text: {text}")

    # Displays the probability for each category; column j of the prediction
    # corresponds to label_encoder.classes_[j]
    for j, category in enumerate(label_encoder.classes_):
        probability = predictions[i][j] * 100
        print(f"{category} Probability: {probability:.2f}%")

    # Determine the category with the highest probability as the prediction
    predicted_index = int(np.argmax(predictions[i]))
    predicted_class = label_encoder.classes_[predicted_index]
    print(f"Predicted Class: {predicted_class}")

    # Check the probability of the predicted class itself. The earlier version
    # compared against hard-coded columns that did not match the encoder's
    # label order and tested for an 'adhd' label that is not in the dataset.
    if predictions[i][predicted_index] > CONFIDENCE_THRESHOLD:
        print(f"Model predicts high probability of {predicted_class}.")
    else:
        print("Model predicts other mental health condition.")

# Model Saving

In [None]:
# Save the model into SavedModel format
#model.save("/content/drive/MyDrive/Model/path_to_saved_model_BERT", save_format="tf")

In [None]:
# Save the model in SavedModel format
#model.save("/content/drive/MyDrive/mental_health_detection_model_using_BERT")

# Download as ZIP
#!zip -r mental_health_detection_model_using_BERT.zip /content/drive/MyDrive/mental_health_detection_model_using_BERT