# **Algoritma deep learning dengan non-contextual word embedding**

### Ruhiyah Faradishi Widiaputri
### 13519034

#### **1) Preprocessing**

In [1]:
# import all needed libraries
import re
import math
import os

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import FastText
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import *
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

!pip install -q -U keras-tuner
import keras_tuner as kt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[K     |████████████████████████████████| 135 kB 5.9 MB/s 
[K     |████████████████████████████████| 1.6 MB 49.8 MB/s 
[?25h

In [2]:
# read train file
df_train = pd.read_csv("train.csv")
df_train.head()

Unnamed: 0.1,Unnamed: 0,text_a,label
0,0,betewe buka twitter cuman ngetweet liat home b...,no
1,1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,no
2,2,e100ss gini buka informasi sejelas nya identit...,yes
3,3,neng solo wes ono terduga corona cobo neng ati...,no
4,4,midiahn nii akun gak takut takut nya isu coron...,no


In [3]:
# get all labels
labels = df_train.label.unique()
labels_dict = {}
for i in range (len(labels)):
  labels_dict[labels[i]] = i

print(labels_dict)

{'no': 0, 'yes': 1}


In [4]:
# cleaning data

ina_stopwords = [
    'yg', 'yang', 'dalam', 'dlm', 'dgn', 'dengan', 'dan', 'atau' , 'aja', 'aku', 'gw', 'ku', 'kalo'
]

def clear_str(x):
  # lower text
  x = x.lower()

  # delete punctuation
  x = re.sub(r'[^\w\s]', '', x)

  # delete any number
  x = re.sub(r'[0-9]', '', x)

  # remove stopwords
  for sw in ina_stopwords:
        x = x.replace(sw, '') 

  # mask some entities
  x = re.sub(r'(a)*(wk|kw)+[wk]*', 'laugh', x)
  x = re.sub(r"(https?|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", x)

  # return
  return(x)

df_train['text_a'] = df_train['text_a'].apply(clear_str)
df_train.head()

Unnamed: 0.1,Unnamed: 0,text_a,label
0,0,betewe buka twitter cuman ngetweet liat home b...,no
1,1,mas piyuuu mugo corona tuh mulut tersumpal ma ...,no
2,2,ess gini buka informasi sejelas nya identitas ...,yes
3,3,neng solo wes ono terduga corona cobo neng ati...,no
4,4,midiahn nii n gak tt tt nya isu corona laugh,no


In [5]:
# tokenize data
max_features = 30000

tokenizer = Tokenizer(num_words = max_features)

#create the vocabulary index based on word frequency
tokenizer.fit_on_texts(df_train['text_a'].values)


word_index = tokenizer.word_index

print(word_index)
print(df_train['text_a'].values[:5])

['betewe buka twitter cuman ngetweet liat home berita corona panik kepikiran ndamau buka home  aware  i ll stay at home nda rumah  nda penting banget'
 'mas piyuuu mugo corona tuh mulut tersumpal ma corona'
 'ess gini buka informasi sejelas nya identitas daerah penderita terjangkit infokan masyarakat mengisolasi nya kontak langsung penderita positif corona ditutup tutupi'
 'neng solo wes ono terduga corona cobo neng ati mu neng conora'
 'midiahn nii n gak tt tt nya isu corona laugh']


In [6]:
# get sentences length
df_train['l'] = df_train["text_a"].apply(lambda x: len(str(x).split(' ')))

print("mean length of sentence: " + str(df_train['l'].mean()))
print("max length of sentence: " + str(df_train['l'].max()))
print("std dev length of sentence: " + str(df_train['l'].std()))

mean length of sentence: 15.61811953150317
max length of sentence: 1239
std dev length of sentence: 16.603255576586548


In [7]:
# there must be an outlier in training data since data.max() >>> data.mean()
# so we use high_outlier_limit for sequence length in our model
q1 = df_train['l'].quantile(.25)
q3 = df_train['l'].quantile(.75)
iqr = q3 - q1
high_outlier_limit = math.floor(q3 + 1.5*iqr)

sequence_length = high_outlier_limit
print(sequence_length)

40


In [8]:
def prepare_x_y(df):
  """ returns list of (x,y) from the given dataframe
  x is sequences of integers, which is generated by mapping every words in df['text_a'] to its correspondent vocabulary index
  y is the labels
  """
  # clean data
  df['text_a'] = df['text_a'].apply(clear_str)

  # tokenize sentence
  x = tokenizer.texts_to_sequences(df['text_a'].values)

  # padding sequence so that every sentence has same length
  x = pad_sequences(x, sequence_length)
  
  # get label
  # y = np.array([labels_dict[l] for l in df['label'].values])
  y = pd.get_dummies(df['label']).values

  # return
  return (x,y)

In [9]:
# get X_train and y_train with function prepare_x_y above
X_train, y_train = prepare_x_y(df_train)

print(X_train[:5])
print(y_train[:5])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0 13661   123   510   600  6652   170   725    57     1
    115  1648 13662   123   725  1541   160  6653   245   234   725  2704
     31  2704  5085    62]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0   147   795  5086     1   120
   1025 13663   484     1]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0  1096   108   123   272  9707
      2  1937   102  1590   415  6654    33  4583     2   625   124  1590
     26     1   566  2563]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0   843   780  1097  1435  3544     1  9708   843
    967   111   843  3088]
 [  

## Training word embedding
the word embedding we use is FastText

In [10]:
# train FastText with train data

# tokenize each sentence in df_train['text_a']
tokenized_text = [word_tokenize(text.lower()) for text in df_train['text_a']]

# embedding dimension
embedding_dim = 100

# train FastText
word_embedding_model = FastText(sentences=tokenized_text, size=embedding_dim, window=5, min_count=3, iter=100)

In [11]:
# save FastText model
os.makedirs('models/', exist_ok = True)
word_embedding_model.save("models/fasttext_we.fasttext")

In [12]:
# get word vector
word_embedding = word_embedding_model.wv

In [13]:
# Sanity check
# one way that we can for checking the sanity of our model is by using familiar word, 
# for example we will se which words are closest to the word "jakarta" and "presiden"
print("jakarta : ")
print(word_embedding.similar_by_word("jakarta",topn=15))
print("presiden : ")
print(word_embedding.similar_by_word("presiden",topn=15))

jakarta : 
[('tfjakarta', 0.937827467918396), ('bojakarta', 0.8926382064819336), ('psbbjakarta', 0.8774242997169495), ('dijakarta', 0.83994460105896), ('purwakarta', 0.7875755429267883), ('jogjakarta', 0.7647812962532043), ('dkijakarta', 0.7425299882888794), ('karta', 0.726804792881012), ('transjakarta', 0.7053824663162231), ('layananjakarta', 0.682169497013092), ('warikarta', 0.6674696207046509), ('yogyakarta', 0.6437135338783264), ('jakartatanggapcorona', 0.6432783603668213), ('jkrta', 0.6356988549232483), ('tundabalikkejakarta', 0.6317126750946045)]
presiden : 
[('president', 0.9087907671928406), ('presidenkopi', 0.8334130644798279), ('presidennya', 0.8302711248397827), ('presidenjoko', 0.827377438545227), ('presidentuhurukenyatta', 0.7347064018249512), ('kepresidenan', 0.6712084412574768), ('ekspresi', 0.6099710464477539), ('jlaughpresidengakberguna', 0.5981332063674927), ('pres', 0.5723455548286438), ('menterinya', 0.5119149684906006), ('press', 0.5040663480758667), ('pretiwn', 0.

In [None]:
# from the result above, we can see that our model is pretty good

## Create embedding matrix
embedding matrix is a matrix that maps words in vocabulary (word_index) into their word embedding. Tis matrix will be used later in encoding layer of our text classification layer

In [14]:
# get num_words
num_words = min(max_features, len(word_index)) + 1
print(num_words)

# hapus nanti tlg
embedding_dim = 100

# create embedding matrix
embedding_matrix = np.zeros((num_words, embedding_dim))

# for each word in out tokenizer lets try to find that work in our w2v model
for word, i in word_index.items():
    if i > max_features:
        continue
    try:
      embedding_vector = word_embedding[word]
      embedding_matrix[i] = embedding_vector
    except:
      embedding_matrix[i] = np.random.randn(embedding_dim)

30001


### Create Text Classification Model

In [15]:
# read validation/ dev data
df_dev = pd.read_csv("dev.csv")
df_dev.head()

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes


In [16]:
# get X_dev and y_dev
X_dev, y_dev = prepare_x_y(df_dev)

Model 1

In [18]:
# create model builder
def model_builder(hp):
  model = Sequential()
  # embedding layer
  model.add(Embedding(num_words,
                      embedding_dim,
                      embeddings_initializer=Constant(embedding_matrix),
                      input_length=sequence_length,
                      trainable=True))
  # LTSM and dropout layers
  do_val_1 = hp.Float('do_val_1', min_value = 0.2, max_value=0.5, step = 0.1)
  model.add(Dropout(do_val_1))

  ltsm_u_1 = hp.Int('ltsm_u_1', min_value=32, max_value=256, step=32)
  model.add(LSTM(ltsm_u_1, return_sequences=True))

  ltsm_u_2 = hp.Int('ltsm_u_2', min_value=32, max_value=256, step=32)
  model.add(LSTM(ltsm_u_2))

  do_val_2 = hp.Float('do_val_2', min_value = 0.2, max_value=0.5, step = 0.1)
  model.add(Dropout(do_val_2))

  # output layer
  model.add(Dense(units=2, activation='sigmoid'))

  # model compilation
  learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  model.compile(optimizer=Adam(learning_rate=learning_rate),
                loss = 'binary_crossentropy', 
                metrics = ['accuracy'])

  return model


In [19]:
# instantiate the tuner and perform hypertuning
tuner = kt.RandomSearch(model_builder,
                     objective='val_accuracy',
                     max_trials=5,
                     executions_per_trial = 3,
                     directory='my_dir',
                     project_name='hyperparam_tuning')

# show search space summary
tuner.search_space_summary()

Search space summary
Default search space size: 5
do_val_1 (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': None}
ltsm_u_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
ltsm_u_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
do_val_2 (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': None}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [20]:
# do hyperparameter tuning
tuner.search(X_train, y_train, epochs=5, validation_data = (X_dev, y_dev))

Trial 5 Complete [00h 35m 08s]
val_accuracy: 0.8694047729174296

Best val_accuracy So Far: 0.8694047729174296
Total elapsed time: 02h 09m 34s


In [23]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"Best do_val_1 = {best_hps.get('do_val_1')}")
print(f"Best do_val_2 = {best_hps.get('do_val_2')}")
print(f"Best ltsm_u_1  = {best_hps.get('ltsm_u_1')}")
print(f"Best ltsm_u_2  = {best_hps.get('ltsm_u_2')}")
print(f"Best learning_rate  = {best_hps.get('learning_rate')}")

Best do_val_1 = 0.5000000000000001
Best do_val_2 = 0.30000000000000004
Best ltsm_u_1  = 160
Best ltsm_u_2  = 160
Best learning_rate  = 0.01


In [47]:
# create model 1 based on its best parameter tuning
BATCH_SIZE = 128
FIT_EPOCHS = 20

# model 1
model1 = tuner.hypermodel.build(best_hps)
model1.fit(X_train, y_train, epochs=FIT_EPOCHS, batch_size=BATCH_SIZE, verbose=1, validation_data=(X_dev, y_dev))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe494154290>

Model 2

In [32]:
# create model builder 2
def model_builder_2(hp):
  model = Sequential()
  # embedding layer
  model.add(Embedding(num_words,
                      embedding_dim,
                      embeddings_initializer=Constant(embedding_matrix),
                      input_length=sequence_length,
                      trainable=True))
  
  # LTSM layer
  ltsm_u_1 = hp.Int('ltsm_u_1', min_value=32, max_value=256, step=32)
  model.add(LSTM(ltsm_u_1))

  # output layer
  model.add(Dense(units=2, activation='sigmoid'))

  # model compilation
  learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  model.compile(optimizer=Adam(learning_rate=learning_rate),
                loss = 'binary_crossentropy', 
                metrics = ['accuracy'])

  return model

In [37]:
# instantiate the tuner and perform hypertuning
tuner2 = kt.RandomSearch(model_builder_2,
                     objective='val_accuracy',
                     max_trials=5,
                     executions_per_trial = 3,
                     directory='my_dir2',
                     project_name='hyperparam_tuning2')

# show search space summary
tuner2.search_space_summary()

Search space summary
Default search space size: 2
ltsm_u_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [38]:
# do hyperparameter tuning
tuner2.search(X_train, y_train, epochs=3, validation_data = (X_dev, y_dev))

Trial 5 Complete [00h 10m 14s]
val_accuracy: 0.8495238224665324

Best val_accuracy So Far: 0.8659523725509644
Total elapsed time: 00h 42m 57s


In [39]:
# Get the optimal hyperparameters
best_hps2=tuner2.get_best_hyperparameters(num_trials=1)[0]

print(f"Best ltsm_u_1  = {best_hps2.get('ltsm_u_1')}")
print(f"Best learning_rate  = {best_hps2.get('learning_rate')}")

Best ltsm_u_1  = 128
Best learning_rate  = 0.001


In [49]:
# create model 2 based on its best parameter tuning

# model 2
model2 = tuner2.hypermodel.build(best_hps2)
history2 = model2.fit(X_train, y_train, epochs=FIT_EPOCHS, batch_size=BATCH_SIZE, verbose=1, validation_data=(X_dev, y_dev))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Testing

In [50]:
# read test data
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes


In [56]:
# predict
X_test, y_test = prepare_x_y(df_test)

# model1 
y_hat1 = model1.predict(X_test)
y_hat1 = np.rint(y_hat1)

# model2 
y_hat2 = np.rint(model2.predict(X_test))

In [57]:
# show some of their results
print(y_hat1[:10])
print(y_hat2[:10])

[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [58]:
# show accuracy, precision, recall, and confusion matrix
def show_metrics(y_hat):
  # accuracy
  print(f"Accuracy : {accuracy_score(y_hat, y_test)}")

  y_t =np.argmax(y_test, axis=1)
  y_h =np.argmax(y_hat, axis=1)

  # precision
  print(f"Precision : {precision_score(y_h, y_t)}")

  # recall
  print(f"Recall : {recall_score(y_h, y_t)}")

  # confusion matrix
  print("Confusion matrix : ")
  print(confusion_matrix(y_h, y_t))

In [59]:
# show model 1
print("MODEL 1")
show_metrics(y_hat1)

MODEL 1
Accuracy : 0.8478571428571429
Precision : 0.7482319660537482
Recall : 0.6808236808236808
Confusion matrix : 
[[1845  178]
 [ 248  529]]


In [60]:
# model 2
print("MODEL 2")
show_metrics(y_hat2)

MODEL 2
Accuracy : 0.8582142857142857
Precision : 0.6803394625176803
Recall : 0.7480559875583204
Confusion matrix : 
[[1931  226]
 [ 162  481]]


### Perbandingan model dengan dan tanpa menggunakan word embedding
Karena dari hasil testing bisa dikatakan model 2 lebih baik dibanding model 1, maka sekarang kita akan mencoba membandingkan bagaimana pengaruh word embedding dalam pengembangan model klasifikasi teks

In [62]:
# baseline model 2
model_bs2 = Sequential()

# embedding layernya jadi tidak ada
  
# LTSM layer
ltsm_u_1 = 128
model_bs2.add(LSTM(ltsm_u_1, input_shape=(sequence_length, 1)))

# output layer
model_bs2.add(Dense(units=2, activation='sigmoid'))

# model compilation
learning_rate = 0.001
model_bs2.compile(optimizer=Adam(learning_rate=learning_rate),
                loss = 'binary_crossentropy', 
                metrics = ['accuracy'])

model_bs2.summary()


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_11 (LSTM)              (None, 128)               66560     
                                                                 
 dense_7 (Dense)             (None, 2)                 258       
                                                                 
Total params: 66,818
Trainable params: 66,818
Non-trainable params: 0
_________________________________________________________________


In [63]:
history_bs2 = model_bs2.fit(X_train, y_train, epochs=FIT_EPOCHS, batch_size=BATCH_SIZE, verbose=1, validation_data=(X_dev, y_dev))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [64]:
# test model_bs2 with test data
y_hat_bs2 = np.rint(model_bs2.predict(X_test))

# show the result
print("MODEL BASELINE 2")
show_metrics(y_hat_bs2)

MODEL BASELINE 2
Accuracy : 0.7489285714285714
Precision : 0.4314002828854314
Recall : 0.5341506129597198
Confusion matrix : 
[[1827  402]
 [ 266  305]]
