In [1]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from sklearn.utils import shuffle
from utils import *
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from tqdm import tqdm
import pickle



In [2]:
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', ' ', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1]).lower()

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,1] = textcleaning(df.iloc[i,1])

In [5]:
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 18957
Most common words [('yang', 14891), ('dan', 8177), ('tidak', 4578), ('untuk', 4023), ('dengan', 3349), ('filem', 3279)]
Sample data [1657, 206, 5, 161, 217, 106, 304, 4, 79, 202] ['ringkas', 'bodoh', 'dan', 'membosankan', 'kanak-kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [7]:
def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            try:
                X[i,-1 - no]=dic[k]
            except Exception as e:
                X[i,-1 - no]=UNK
    return X

In [8]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer

Using TensorFlow backend.


In [9]:
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)
file_path = "sentiment_best_model.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")

In [10]:
vectors = str_idx(texts, dictionary, 100)
onehot = np.zeros((len(vectors),2))
onehot[np.arange(len(vectors)),labels] = 1

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, 
                                                    onehot,
                                                    test_size = 0.2)

In [12]:
inp = Input(shape = (None,))
x = Embedding(len(dictionary), 256, trainable=True)(inp)
x1 = SpatialDropout1D(0.2)(x)

x = Bidirectional(GRU(128, return_sequences = True))(x1)
x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    
y = Bidirectional(LSTM(128, return_sequences = True))(x1)
y = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(y)
    
avg_pool1 = GlobalAveragePooling1D()(x)
max_pool1 = GlobalMaxPooling1D()(x)
    
avg_pool2 = GlobalAveragePooling1D()(y)
max_pool2 = GlobalMaxPooling1D()(y)
    
x = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])

x = Dense(2, activation = "softmax")(x)
model = Model(inputs = inp, outputs = x)
model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = 1e-3), metrics = ["accuracy"])
history = model.fit(train_X, train_Y, batch_size = 128, epochs = 10, validation_data = (test_X, test_Y), 
                    verbose = 1, callbacks = [check_point, early_stop])

Train on 11423 samples, validate on 2856 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.54990, saving model to sentiment_best_model.hdf5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.54990
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.54990
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.54990


In [14]:
model = load_model(file_path)
predict_Y = np.argmax(model.predict(test_X,batch_size=128,verbose=1),1)



In [15]:
from sklearn import metrics
print(metrics.classification_report(np.argmax(test_Y,1), predict_Y, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.68      0.69      0.69      1302
   positive       0.74      0.73      0.74      1554

avg / total       0.71      0.71      0.71      2856



In [16]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = str_idx([text],dictionary,len(text.split()))
model.predict(new_vector)

array([[0.7262973 , 0.27370268]], dtype=float32)

In [17]:
text = 'saya sangat sayangkan kerajaan saya'
new_vector = str_idx([text],dictionary,len(text.split()))
model.predict(new_vector)

array([[0.3166659, 0.6833341]], dtype=float32)

In [18]:
text = 'bodoh lah awak ni'
new_vector = str_idx([text],dictionary,len(text.split()))
model.predict(new_vector)

array([[0.6569448 , 0.34305516]], dtype=float32)

In [19]:
text = 'kerajaan sebenarnya sangat baik'
new_vector = str_idx([text],dictionary,len(text.split()))
model.predict(new_vector)

array([[0.4587405 , 0.54125947]], dtype=float32)

In [20]:
import json
with open('fast-text-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))