In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import callbacks
from keras.models import Sequential

import matplotlib.pyplot as plt
from sklearn import metrics
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

from tensorflow.keras.layers import BatchNormalization

import keras
from keras import *
from keras.layers import *
from pythainlp import word_tokenize
from pythainlp.word_vector import *
from tensorflow.keras import regularizers
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder


from imblearn.over_sampling import SMOTE

from pythainlp import word_vector

import warnings
warnings.filterwarnings('ignore')

from tensorflow.keras.callbacks import EarlyStopping
# Stop training once validation accuracy has failed to improve by at least
# 0.001 for 8 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.001,
    patience=8,
)

In [None]:
# Load the labelled tweet dataset and drop the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call — TODO confirm).
df = pd.read_csv('main_suicidal_data.csv').drop(columns='Unnamed: 0')

In [None]:
# Load the pretrained thai2fit word vectors and expose them two ways:
# as a DataFrame with one row per vocabulary word, and as a plain list of
# the vocabulary words in the same row order.
wModel = word_vector.WordVector(model_name="thai2fit_wv").get_model()
thai2dict = {vocab_word: wModel[vocab_word] for vocab_word in wModel.index_to_key}
thai2vec = pd.DataFrame.from_dict(thai2dict, orient='index')
thVocab = thai2vec.index.to_list()

In [None]:
# Materialise the embedding matrix (one row per vocabulary word) in a single
# vectorised call. Stacking row by row with np.vstack re-copies the growing
# matrix on every iteration (quadratic in vocabulary size) and leaves `vect`
# as a plain list, or undefined, when thai2vec has fewer than two rows.
ll = len(thai2vec)
# float64 matches the dtype produced by the earlier list-based construction.
vect = thai2vec.to_numpy(dtype=np.float64)

print("\n", vect.shape)

In [None]:
def tokenWord(wordTarget):
    """Tokenize ``wordTarget`` with ``word_tokenize`` using the 'attacut' engine.

    Returns whatever ``word_tokenize`` produces for the given text.
    """
    return word_tokenize(wordTarget, engine='attacut')

def convWord(cw):
    """Blank out tokens that are a single space or are missing from ``thVocab``.

    The list is edited in place and the very same list object is returned;
    rejected tokens are replaced by '' so every position is kept.
    """
    for pos, token in enumerate(cw):
        if token == ' ' or token not in thVocab:
            cw[pos] = ''
    return cw

def token2index(t2idx):
    """Map every token found in ``thVocab`` to its position in that list.

    Tokens that are not in ``thVocab`` are skipped, so the result may be
    shorter than the input. Returns the collected positions as a NumPy array.
    """
    known_positions = [thVocab.index(tok) for tok in t2idx if tok in thVocab]
    return np.array(known_positions)

def findMaxArray(fma):
    """Return the length of the longest element of ``fma`` (0 if ``fma`` is empty)."""
    return max((len(fma[pos]) for pos in range(len(fma))), default=0)

def fill0in(f0i):
    """Right-pad every sequence in ``f0i`` with zeros to the longest length.

    Parameters
    ----------
    f0i : list of 1-D array-likes
        Sequences to pad. Short entries are replaced in place by their
        padded version.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the padded list; an empty array when ``f0i``
        is empty (this used to raise ``UnboundLocalError`` because the
        conversion only happened inside the loop body).
    """
    fMax = findMaxArray(f0i)
    for fi, ax in enumerate(f0i):
        shortfall = fMax - len(ax)
        if shortfall > 0:
            # np.zeros yields floats, so a padded row becomes a float array.
            f0i[fi] = np.hstack((ax, np.zeros(shortfall)))
    # Convert once, after all rows are padded, instead of on every iteration.
    return np.array(f0i)

def prepare2train(ipt):
    """Run each text in ``ipt`` through tokenWord -> convWord -> token2index.

    Returns a list with one converted entry per input position, in order.
    """
    return [
        token2index(convWord(tokenWord(ipt[pos])))
        for pos in range(len(ipt))
    ]

In [None]:
# Collapse the five specialist levels into three severity bands, then encode
# the bands (plus 'Other') as the string codes '0'..'3'.
level_to_band = {
    'Level 1': 'Low Level Depress',
    'Level 2': 'Mid Level Depress',
    'Level 3': 'Mid Level Depress',
    'Level 4': 'High Level Depress',
    'Level 5': 'High Level Depress',
}

label_text = df['Label (Specialist)']
for level_name, band_name in level_to_band.items():
    # regex=False: the level names are literal substrings, not patterns.
    label_text = label_text.str.replace(level_name, band_name, regex=False)
df['Label_Enc'] = label_text

conditions = [
    (df['Label_Enc'] == 'Other'),
    (df['Label_Enc'] == 'Low Level Depress'),
    (df['Label_Enc'] == 'Mid Level Depress'),
    (df['Label_Enc'] == 'High Level Depress'),
    ]

values = ['0', '1', '2', '3']

# Rows matching none of the conditions fall back to '0', exactly as before,
# but the default is now a string like the choices. The implicit integer
# default 0 cannot be promoted to a common dtype with string choices on
# newer NumPy releases and makes np.select raise.
df['Label_Enc'] = np.select(conditions, values, default='0')


In [None]:
# Tweet text (as a Series and as a plain list) and the encoded label column.
X = df['Tweet']
X_arr = X.to_list()
y = df['Label_Enc']

In [None]:
# Fit a LabelEncoder on the label column; it is applied to y in a later cell.
le = LabelEncoder()
le.fit(y)

In [None]:
# Convert every tweet to an array of vocabulary indices and encode the labels
# with the fitted LabelEncoder. Note that X and y are rebound here.
X = prepare2train(X_arr)
y = le.transform(y)

In [None]:
# Make sure the labels are an ndarray and zero-pad every index sequence to the
# length of the longest one so X becomes a single array.
y = np.array(y)
X = fill0in(X)

In [None]:
# Hold out the test set BEFORE oversampling. Resampling first lets synthetic
# rows derived from test tweets end up in the training data, which inflates
# the reported test metrics. The split is seeded and stratified so reruns are
# reproducible and every class is represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Oversample the minority classes in the training portion only.
# NOTE(review): SMOTE creates new rows by interpolating feature values; the
# features here are vocabulary indices, so synthetic rows may not correspond
# to real token sequences — consider class weights instead. TODO confirm.
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled


# LSTM

In [None]:
# Bidirectional-LSTM classifier on top of an embedding layer.
model = keras.Sequential()

# Layer 1: embedding. Its dimensions are taken from the pretrained matrix
# `vect` (rows = vocabulary size, columns = vector width) so that the later
# set_weights([vect]) call always matches, instead of relying on hard-coded
# 51358 x 300.
# NOTE(review): sequences are padded with 0, which is also the index of the
# first vocabulary word, and no masking is configured — TODO confirm intent.
model.add(keras.layers.Embedding(input_dim=vect.shape[0],
                                 output_dim=vect.shape[1],
                                 name='embed'))

# Layers 2-3: first bidirectional LSTM (returns the full sequence) + dropout
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(rate=0.6))
# Layers 4-5: second bidirectional LSTM (returns the final state) + dropout
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(rate=0.4))
# Layer 6: output layer, one softmax unit per label code ('0'..'3')
model.add(Dense(4, activation = 'softmax'))
model.summary()

In [None]:
# `tf` is never imported in this notebook, so tf.keras.optimizers.Adam raised
# NameError; import Adam directly next to SGD instead.
from tensorflow.keras.optimizers import SGD, Adam

# Alternative optimizer kept for experimentation; not passed to compile below.
opt = SGD(learning_rate=0.001,momentum=0.9,nesterov=True)
adam_opt = Adam(learning_rate=0.001)
# Labels are integer class ids, hence the sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',optimizer=adam_opt,metrics=['accuracy'])

In [None]:
# Initialise the embedding layer with the pretrained vectors in `vect` and
# leave it trainable so the vectors are updated during training.
model.get_layer('embed').set_weights([vect])
model.get_layer('embed').trainable = True


In [25]:
# Train for at most 20 epochs with 10% of the training data held out for
# validation; the early_stopping callback may end training sooner.
history = model.fit(X_train,y_train,epochs=20,batch_size=32,callbacks=[early_stopping],validation_split=0.1)



KeyboardInterrupt: 

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

def plot_metric(history, metric):
    """Plot the training and validation curves of ``metric`` per epoch.

    ``history`` must expose a ``history`` mapping that contains both
    ``metric`` and ``'val_' + metric``. Epochs are numbered from 1. The
    figure is drawn through pyplot and shown; nothing is returned.
    """
    recorded = history.history
    train_curve = recorded[metric]
    val_name = 'val_' + metric
    val_curve = recorded[val_name]
    epoch_axis = range(1, len(train_curve) + 1)

    for curve in (train_curve, val_curve):
        plt.plot(epoch_axis, curve)

    plt.title(f'Training and validation {metric}')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([f"train_{metric}", val_name])
    plt.show()


In [None]:
# Training vs. validation loss per epoch.
plot_metric(history, 'loss')

In [None]:
# Evaluate on the held-out test split; returns the loss and the compiled metrics.
model_acc = model.evaluate(X_test,y_test)

In [None]:
# Predicted class id per test row = index of the largest softmax output.
y_pred = np.argmax(model.predict(X_test),axis=1)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))