In [2]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import IPython

In [3]:
# Load the tweet dataset straight from the GitHub repository.
URL_DATA = 'https://raw.githubusercontent.com/novra/dts_sosmed8/main/tweets.csv'
data = pd.read_csv(URL_DATA)
# Last expression of the cell: show the first rows as a quick sanity check.
data.head()

In [4]:
# Count how many rows fall into each target class in a single pass.
# sort_index() gives a deterministic class order (a set has no guaranteed order).
jumlah_per_kelas = data['target'].value_counts().sort_index()
target = jumlah_per_kelas.index.tolist()
jumlah_target = jumlah_per_kelas.tolist()

# Bar chart of the class distribution
warna = np.array(['hotpink', 'cornflowerblue'])
plt.bar(target, jumlah_target, color=warna)
plt.title("Distribusi kelas target")
# Write each count above its own bar. The x position is the class value
# itself (the same x passed to plt.bar), not the loop index, so labels stay
# aligned even when the class values are not exactly 0..n-1.
for kelas, jumlah in zip(target, jumlah_target):
    plt.text(kelas, jumlah, str(jumlah), ha = 'center')

plt.show()

In [5]:
# Patterns used by clean_data, compiled once (raw strings avoid invalid-escape warnings).
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_HASHTAG_RE = re.compile(r'#([^\s]+)')
_TANDA_BACA_RE = re.compile(r'[^\w\s]')
_ANGKA_RE = re.compile(r'[\d-]')
_HURUF_BERULANG_RE = re.compile(r"(.)\1{1,}", re.DOTALL)
_KATA_VALID_RE = re.compile(r"[a-zA-Z]{2,}")


def clean_data(teks):
    """Normalise one raw tweet into a space-separated string of clean words.

    Steps, in order:
      1. lowercase the text;
      2. remove ``www.*`` and ``http(s)://*`` URLs;
      3. drop the ``#`` in front of hashtags but keep the word;
      4. replace every remaining punctuation character with a space;
      5. delete digits (and hyphens);
      6. split on whitespace and, for every word, squeeze any run of a
         repeated character down to exactly two (``sooo`` -> ``soo``);
      7. keep only words made of at least two ASCII letters, and drop ``rt``.

    Args:
        teks: the raw text. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing value) is treated as empty.

    Returns:
        The cleaned text; an empty string when no word survives.
    """
    if not isinstance(teks, str):
        return ""

    teks = teks.lower()
    teks = _URL_RE.sub('', teks)
    teks = _HASHTAG_RE.sub(r'\1', teks)
    teks = _TANDA_BACA_RE.sub(' ', teks)
    teks = _ANGKA_RE.sub('', teks)

    # After the passes above a word can only contain word characters, so no
    # further splitting on '-' or '/' or stripping of quotes is needed.
    tokens = []
    for kata in teks.split():
        kata = _HURUF_BERULANG_RE.sub(r"\1\1", kata)
        if kata != "rt" and _KATA_VALID_RE.fullmatch(kata):
            tokens.append(kata)

    return " ".join(tokens)

# Clean every tweet text
data['text'] = data['text'].map(clean_data)

# Keep only the rows that still contain at least one word after cleaning.
# Texts and labels are both taken from this filtered frame so that they stay
# aligned row by row.
data_bersih = data[data['text'].apply(lambda x: len(x.split()) >= 1)]
teks = np.array(data_bersih['text'])

# One-hot encode the target column. The dtype is given explicitly so the
# label array does not depend on the pandas version's default dummy dtype.
target = np.array(pd.get_dummies(data_bersih['target'], dtype=np.float32))

In [6]:
# Hold out 20% of the samples for testing; the split is stratified on the
# one-hot labels and uses a fixed seed.
(data_train, data_test,
 label_train, label_test) = train_test_split(
    teks,
    target,
    test_size=0.2,
    stratify=target,
    random_state=7,
)

In [7]:
from transformers import BertTokenizer, TFBertModel
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'cross-encoder/ms-marco-TinyBERT-L-2-v2'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# trainable=False and from_pt=True are forwarded to from_pretrained as in the
# original call (from_pt requests loading from PyTorch weights).
bert_model = TFBertModel.from_pretrained(
    BERT_CHECKPOINT,
    trainable=False,
    from_pt=True,
)

In [8]:
# Tokenise text with the module-level BERT tokenizer
def tokenisasi(teks, max_length=80):
    """Encode text into BERT token ids and an attention mask.

    The argument is forwarded unchanged to ``bert_tokenizer``; special tokens
    are added and the encoding is padded or truncated to ``max_length`` tokens.

    Args:
        teks: the text to encode.
        max_length: padded/truncated sequence length. Defaults to 80, the
            value this notebook used before it became a parameter.

    Returns:
        A tuple ``(tokenID, attention_mask)`` taken from the tokenizer output
        under the keys ``'input_ids'`` and ``'attention_mask'`` (TensorFlow
        tensors, because ``return_tensors='tf'`` is requested).
    """
    encode_dict = bert_tokenizer(
        teks,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    return encode_dict['input_ids'], encode_dict['attention_mask']

# Build the model inputs (token ids and attention masks) for a whole dataset
def create_input(data):
    """Tokenise every text in ``data`` and return the arrays BERT expects.

    All texts are encoded in one batched tokenizer call instead of one call
    per text; ``tokenisasi`` passes its argument straight to the tokenizer,
    which also accepts a list of strings.

    Args:
        data: an iterable of strings.

    Returns:
        A dict with int32 NumPy arrays ``'input_ids'`` and
        ``'attention_mask'``, one row per text (80 columns each, the padded
        length used by ``tokenisasi``). Empty input yields arrays of shape
        (0, 80).
    """
    daftar_teks = list(data)
    if not daftar_teks:
        # Nothing to tokenise: keep the (0, 80) shape the per-text version returned.
        return {'input_ids': np.zeros((0, 80), dtype=np.int32),
                'attention_mask': np.zeros((0, 80), dtype=np.int32)}

    tokenID, input_mask = tokenisasi(daftar_teks)

    return {'input_ids': np.asarray(tokenID, dtype=np.int32).reshape(-1, 80),
            'attention_mask': np.asarray(input_mask, dtype=np.int32).reshape(-1, 80)}

In [9]:
# Tokenise the training and test texts. The token dicts get their own names
# so that X_train / X_test are never rebound from a dict to a tensor, which
# keeps this cell safe to re-run.
token_train = create_input(data_train)
token_test = create_input(data_test)

# Text representation from the frozen BERT encoder: element [0] of the model
# output (the final hidden states, per the transformers documentation
# — TODO confirm for the installed version).
X_train = bert_model(**token_train)[0]
X_test = bert_model(**token_test)[0]

In [17]:
from keras_tuner.tuners import BayesianOptimization
# Hybrid CNN-GRU classifier whose hyperparameters are drawn from the tuner
def cnn_gru(hp):
    """Build and compile a CNN-GRU model for one hyperparameter trial.

    Args:
        hp: the hyperparameter object supplied by the tuner; ``hp.Int`` and
            ``hp.Choice`` are used to pick the values below.

    Searched values:
        filters (200-300, step 50), kernel_size (3-5), units (100-200,
        step 50) and four L2 factors chosen from [0.01, 0.001]
        (kernel_cnn, kernel_regularizer, rec_regularizer, kernel_dense).

    Returns:
        A compiled Keras model taking inputs of shape (80, 128) and producing
        a 2-way softmax.
    """
    def l2_choice(name):
        # L2 regulariser whose factor is one of the two candidate values.
        return keras.regularizers.l2(hp.Choice(name, values = [0.01, 0.001]))

    # Input layer
    inputs = keras.layers.Input(shape=(80, 128))

    # Convolution layer (hyperparameters are requested in the original order)
    n_filters = hp.Int('filters', min_value = 200, max_value = 300, step = 50)
    kernel_width = hp.Int('kernel_size', min_value = 3, max_value = 5, step = 1)
    conv_reg = l2_choice('kernel_cnn')
    features = keras.layers.Conv1D(filters = n_filters,
                                   kernel_size = kernel_width,
                                   activation='relu',
                                   kernel_regularizer = conv_reg)(inputs)

    # Max pooling layer
    features = keras.layers.MaxPooling1D(pool_size=2)(features)

    # GRU layer
    gru_units = hp.Int('units', min_value = 100, max_value = 200, step = 50)
    gru_kernel_reg = l2_choice('kernel_regularizer')
    gru_recurrent_reg = l2_choice('rec_regularizer')
    features = keras.layers.GRU(units = gru_units,
                                kernel_regularizer = gru_kernel_reg,
                                recurrent_regularizer = gru_recurrent_reg)(features)

    # Output layer
    dense_reg = l2_choice('kernel_dense')
    outputs = keras.layers.Dense(2, activation='softmax',
                                 kernel_regularizer = dense_reg)(features)

    model = keras.models.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer = keras.optimizers.Adam(1e-3),
                  loss ='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Callback definition
class ClearTrainingOutput(tf.keras.callbacks.Callback):
    """Keras callback that clears the notebook cell output when training ends."""

    def on_train_end(self, logs=None):
        """Clear the cell output; ``logs`` is accepted but not used."""
        # wait=True delays clearing until new output is available.
        IPython.display.clear_output(wait = True)

# Stop a run once val_loss has not improved for 5 consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Hyperparameter tuning with Bayesian optimisation, trying 10 hyperparameter
# combinations and ranking them by validation accuracy.
tuner = BayesianOptimization(cnn_gru,
                             objective = 'val_accuracy',
                             max_trials = 10,
                             directory = '/content/Hasil',
                             project_name = 'Sentimen-CNN-GRU',
                             overwrite = True)

# Validation data is carved out of the training set (the last 10% of the
# samples) rather than taken from the test set: picking hyperparameters and
# the early-stopping epoch on test data would leak it into model selection
# and make any later test score optimistic.
tuner.search(X_train, label_train,
             batch_size=64, epochs=50,
             validation_split=0.1,
             callbacks=[early_stop, ClearTrainingOutput()])

# Best model out of the 10 Bayesian-optimisation trials
model = tuner.get_best_models()[0]

In [18]:
# Continue training the best model on the training data. Early stopping
# watches a validation slice taken from the training set (its last 10%), so
# the test set stays untouched until the final evaluation.
history = model.fit(X_train, label_train,
                    batch_size=32, epochs=50,
                    validation_split=0.1,
                    callbacks=[early_stop])

In [19]:
# Evaluate the model on the test set.
# Both the predicted probabilities and the one-hot labels are turned back
# into class indices with argmax before scoring.
y_pred = np.argmax(model.predict(X_test), axis=1)
y = np.argmax(label_test, axis=1)
print('accuracy: ', accuracy_score(y, y_pred),
      '\nprecision: ', precision_score(y, y_pred),
      '\nrecall: ', recall_score(y, y_pred),
      '\nf1: ', f1_score(y, y_pred))

In [20]:
# Plot the training and validation curves, one figure per metric:
# accuracy first, then loss.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()