In [1]:
import pandas as pd

In [2]:
# Show full post text in DataFrame output instead of pandas' default truncation.
pd.set_option('display.max_colwidth', None)

# The first CSV column is used as the row index.
data = pd.read_csv('Suicide_Detection.csv', index_col=[0])

# Work on a reproducible 2,000-row random subsample of the full dataset.
N_SAMPLES = 2000
df = data.sample(n=N_SAMPLES, random_state=42)

In [3]:
# Encode the string class labels as integers.
value_mapping = {'suicide': 0, 'non-suicide': 1}

# Only encode when the column does not already hold the integer codes. Without
# this guard, re-running the cell would push 0/1 through the mapping again and
# replace every label with NaN.
if not df['class'].isin(list(value_mapping.values())).all():
    encoded = df['class'].map(value_mapping)
    # .map() produces NaN for any value that is missing from the mapping;
    # fail loudly instead of letting NaN labels reach training.
    unmapped = df.loc[encoded.isna(), 'class'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in 'class' column: {unmapped.tolist()}")
    df['class'] = encoded.astype('int64')

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sampled rows for validation; the fixed seed keeps the
# split reproducible across runs.
features, targets = df['text'], df['class']
X_train, X_val, y_train, y_val = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

In [5]:
# Convert the text Series to plain Python lists before they are passed to the
# tokenizer. list(...) is used instead of .tolist() so the cell can be re-run:
# on a second run X_train/X_val are already lists, which have no .tolist().
X_train = list(X_train)
X_val = list(X_val)

In [6]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Encode the training texts as padded/truncated NumPy arrays. The tokenizer
# returns a BatchEncoding, which is converted to a plain dict for Keras.
tokenized_data = dict(
    tokenizer(X_train, padding=True, truncation=True, return_tensors="np")
)

# Training labels as a NumPy array (already encoded as 0/1).
labels = np.array(y_train)

In [7]:
# Encode the validation texts with the same tokenizer settings used for the
# training texts, then convert the BatchEncoding to a plain dict for Keras.
tokenized_data_val = dict(
    tokenizer(X_val, padding=True, truncation=True, return_tensors="np")
)

# Validation labels as a NumPy array.
labels_val = np.array(y_val)

In [8]:
# Inspect the encoded training set: a dict of NumPy arrays keyed by
# 'input_ids', 'token_type_ids' and 'attention_mask'.
tokenized_data

{'input_ids': array([[  101,  1790,   112, ...,     0,     0,     0],
        [  101,  1247,   112, ...,     0,     0,     0],
        [  101,  1139,  4153, ...,     0,     0,     0],
        ...,
        [  101,   146,  1276, ...,     0,     0,     0],
        [  101,   146,  1198, ...,  7299,  1143,   102],
        [  101, 10259, 23926, ...,     0,     0,     0]]),
 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0]])}

In [9]:
# Confirm the tokenizer output was converted to a plain dict.
type(tokenized_data)

dict

In [10]:
# Inspect the training label array (0 = 'suicide', 1 = 'non-suicide').
labels

array([0, 0, 1, ..., 1, 0, 1])

In [11]:
# Note: len(tokenized_data) is the number of keys in the encoding dict
# (input_ids, token_type_ids, attention_mask), not the number of samples.
print(len(tokenized_data),len(labels))

# Actual sanity check: every encoded row must have exactly one label.
assert len(tokenized_data['input_ids']) == len(labels), (
    "number of encoded texts does not match number of labels"
)

3 1600


In [13]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Load the pretrained BERT checkpoint with a sequence-classification head.
# Compilation happens in a later cell.
MODEL_CHECKPOINT = "bert-base-cased"
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Fine-tuning a transformer usually works better with a small learning rate.
optimizer = Adam(3e-5)

# The loss argument is deliberately omitted here.
model.compile(optimizer=optimizer, metrics='accuracy')

In [None]:
# `tf` is referenced below but was never imported in this notebook (only
# `Adam` was imported from tensorflow.keras), so import it here.
import tensorflow as tf

# Write model_checkpoint.h5 only when the monitored quantity improves.
# 'val_loss' is the Keras default for `monitor`; it is spelled out for clarity.
# NOTE(review): whole-model HDF5 saving may not be supported for this model
# class; if saving fails, consider save_weights_only=True. Confirm against the
# installed Keras version.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='model_checkpoint.h5',
    monitor='val_loss',
    save_best_only=True,
)

In [None]:
# Fine-tune for 5 epochs, evaluating on the validation split after each epoch
# and checkpointing through the callback.
# Note: training makes the kernel restart on the home PC; it runs on the iMac
# but is very costly.
hist = model.fit(
    tokenized_data,
    labels,
    validation_data=(tokenized_data_val, labels_val),
    epochs=5,
    callbacks=[checkpoint_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
10/50 [=====>........................] - ETA: 1:55:09 - loss: 0.0225 - accuracy: 0.9937

In [None]:
# `plt` is used below but was never imported in this notebook.
import matplotlib.pyplot as plt

# Plot training vs. validation curves per epoch. The model was compiled with
# metrics='accuracy', so the history keys are 'accuracy' / 'val_accuracy';
# the former 'categorical_accuracy' keys do not exist and raised KeyError.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))

ax[0].plot(hist.history['loss'], color='teal', label='loss')
ax[0].plot(hist.history['val_loss'], color='orange', label='val_loss')
ax[0].set_title('Loss', fontsize=20)
ax[0].set_xlabel('Epoch')
ax[0].legend(loc='lower left')

ax[1].plot(hist.history['accuracy'], color='teal', label='accuracy')
ax[1].plot(hist.history['val_accuracy'], color='orange', label='val_accuracy')
ax[1].set_title('Accuracy', fontsize=20)
ax[1].set_xlabel('Epoch')
ax[1].legend(loc='lower right')

plt.show()