In [1]:
!pip install sastrawi
!pip install --upgrade gensim
# !pip install --upgrade keras
# !pip install --upgrade tensorflow
!pip install keras_metrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import

In [2]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:

from typing import List

In [4]:
import re
import numpy as np
import pandas as pd

In [5]:
from nltk import pos_tag
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from gensim.models import Word2Vec, FastText
from keras.models import Sequential
from keras_metrics import precision, recall
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow import stack

# Load Training Data

In [6]:
df_train = pd.read_csv('train.csv')[['text_a','label']] # drop row numbers column
df_train["label"] = df_train["label"].map({"yes":1, "no":0})
df_test = pd.read_csv('test.csv')
df_test["label"] = df_test["label"].map({"yes":1, "no":0})
df_validate = pd.read_csv('dev.csv')

# Stopword Definition

In [7]:
# add stopwords
add_stopwords = set(StopWordRemoverFactory().get_stop_words())
print('sastrawi stopwords:', len(add_stopwords))

stopwords_set = set(stopwords.words())
print('nltk stopwords:', len(stopwords_set))
stopwords_set = stopwords_set.union(add_stopwords)
print('nltk added stopwords:', len(stopwords_set))

sastrawi stopwords: 123
nltk stopwords: 9380
nltk added stopwords: 9395


# Function to preprocess/normalize text

In [8]:
stemmer = StemmerFactory().create_stemmer()

In [9]:
def normalize_text(string: str, stem: bool=True, sw_elim: bool=True) -> List[str]:
  # filtering, only characters allowed
  filtered = re.sub('[^a-zA-Z]', ' ', string)
  # lower-cased and stemmed using Sastrawi
  stemmed = stemmer.stem(filtered) if stem else filtered.lower()
  # tokenize stemmed string
  tokenized = word_tokenize(stemmed)
  # eliminate stopwords
  res = [word for word in tokenized if word not in stopwords_set] if sw_elim else tokenized
  return res

# Proprocess DataFrame

In [10]:
df_train.text_a = df_train.text_a.apply(normalize_text, args=(False, True))
df_test.text_a = df_test.text_a.apply(normalize_text, args=(False, True))
df_validate.text_a = df_validate.text_a.apply(normalize_text, args=(False, True))

df_train = df_train[df_train['text_a'].map(len) > 0]

X_train = df_train.text_a
y_train = df_train.label
X_test = df_test.text_a
y_test = df_test.label
X_validate = df_validate.text_a
y_validate = df_validate.label

# Utility Function

In [11]:
def vectorize(tokenized_data, word_vector):
    vectorized = []
    for sentence in tokenized_data:
        if len(sentence) == 0:
            continue
        sentvec = []
        for w in sentence:
            if w in word_vector.index_to_key:
                sentvec.append(word_vector[w])
            else:
                sentvec.append(np.zeros((60)))
        vectorized.append(sentvec)
    return vectorized

In [12]:
max_length = max([len(i) for i in X_train])

def padding(vec):
    counter = 0
    padded = []
    for i, v in enumerate(vec):
        counter += 1
        vector = []
        if len(v) < max_length:
            pad_count = max_length - len(v)
            pad = np.zeros((pad_count, 60))
            vector = np.append(v, pad, axis=0)
        else:
            vector = v[:max_length]
        padded.append(vector)
    return padded

# Word2Vec

In [13]:
model = Word2Vec(X_train, vector_size=60, window=6, epochs=30, seed=1)
word2vec = model.wv

In [14]:
X_train_word2vec = vectorize(X_train, word2vec)
X_test_word2vec = vectorize(X_test, word2vec)
X_validate_word2vec = vectorize(X_validate, word2vec)

In [15]:
X_train_word2vec = padding(X_train_word2vec)
X_test_word2vec = padding(X_test_word2vec)
X_validate_word2vec = padding(X_validate_word2vec)

In [16]:
X_train_word2vec = np.array(X_train_word2vec)
X_test_word2vec = np.array(X_test_word2vec)
X_validate_word2vec = np.array(X_validate_word2vec)

# RNN Word2Vec

In [17]:
model_word2vec = Sequential([
    LSTM(200, input_shape=(X_train_word2vec.shape[1],X_train_word2vec.shape[2]),return_sequences=True),
    LSTM(100, activation="sigmoid",return_sequences=True),
    Dense(1, activation="sigmoid")
])

In [18]:
model_word2vec.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy", precision(), recall()])

In [19]:
model_word2vec.fit(X_train_word2vec, y_train)

  return object.__getattribute__(self, name)




<keras.callbacks.History at 0x7f6d7116cd10>

In [20]:
model_word2vec.evaluate(X_test_word2vec,y_test)





[0.4602004885673523, 0.7475066184997559, 0.6634290218353271, 0.250779390335083]