In [1]:
import pymongo
import numpy as np
import pandas as pd
import pickle

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Reshape, Merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, activity_l2, l1, activity_l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils.layer_utils import layer_from_config
from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer

Using Theano backend.


In [2]:
# Connect to the MongoDB server on this machine and select the `twitter` database.
client = pymongo.MongoClient(host='localhost', port=27017)
twitter = client.twitter

In [3]:
# Collections holding the tweets of each sentiment class.
positive = twitter['positive']
negative = twitter['negative']

In [4]:
# Pull every document from both collections, keeping only the `text` field
# (the Mongo `_id` is explicitly excluded by the projection).
text_only_projection = {"text": 1, "_id": 0}
positive_texts = [doc for doc in positive.find({}, text_only_projection)]
negative_texts = [doc for doc in negative.find({}, text_only_projection)]

In [5]:
# Attach the class label to every record: 1.0 for positive, 0.0 for negative.
for label, records in ((1.0, positive_texts), (0.0, negative_texts)):
    for text in records:
        text['sentiment'] = label

In [6]:
# NOTE(review): this import rebinds the name `Tokenizer`, shadowing the
# `keras.preprocessing.text.Tokenizer` imported in the first cell.
from thai_sentiment import Tokenizer

# Word tokenizer used by `tokenize` below; it is configured with a dictionary directory.
tokenizer = Tokenizer(dictionary_path='../data/tokenizer/dict/')

In [7]:
import re

# Matches every character that must be dropped: anything outside the Thai
# Unicode block (U+0E00-U+0E7F), and apostrophes wherever they occur.
regexp_thai = re.compile(u"[^\u0E00-\u0E7F']|^'|'$|''|'")


def strip(text):
    """Return `text` with every non-Thai character deleted.

    Whitespace, digits, Latin letters, punctuation and apostrophes are
    removed outright (not replaced by a space), so the Thai text on either
    side of a removed character ends up adjacent.
    """
    return regexp_thai.sub("", text)

In [8]:
def remove_single_character(tokens):
    """Remove every one-character token from `tokens` and return the list.

    The list is filtered in place (the same list object is returned), so
    callers holding a reference to `tokens` see the change.

    The previous implementation called `tokens.remove()` while iterating over
    `tokens`, which makes the iterator skip the element following each
    removal; consecutive one-character tokens were therefore left behind.
    """
    # Slice assignment keeps the caller's list object while replacing its contents.
    tokens[:] = [token for token in tokens if len(token) != 1]
    return tokens

In [9]:
# Sanity check: a Thai-only string should pass through `strip` unchanged.
stripped = strip('ทำไมฟอลครับ')
stripped

'ทำไมฟอลครับ'

In [10]:
# Sanity check: segment the stripped sample into words.
tokenizer.tokenize(stripped)

['ทำไม', 'ฟอล', 'ครับ']

In [11]:
def tokenize_texts(texts):
    """Add a 'tokens' entry to every record in `texts` and return `texts`.

    Each record is a dict with a 'text' key; the record is updated in place
    with the result of `tokenize(record['text'])`.
    """
    for record in texts:
        record.update(tokens=tokenize(record['text']))
    return texts

def tokenize(text):
    """Strip non-Thai characters from `text`, then split it into word tokens."""
    return tokenizer.tokenize(strip(text))

In [12]:
import time

# Tokenize both classes and report the elapsed wall-clock time in seconds.
# `tokenize_texts` updates the records in place and returns the same list.
start = time.time()
positive_texts = tokenize_texts(positive_texts)
negative_texts = tokenize_texts(negative_texts)
end = time.time()
print(end - start)

169.71998929977417


In [13]:
def normalize(token):
    """Return '' for noise tokens, otherwise return `token` unchanged.

    A token counts as noise when it is at most two characters long and reads
    the same in both directions (empty string, a single character, or a
    doubled character). The word 'งง' is the one exception and is kept.
    """
    is_short_palindrome = len(token) <= 2 and token == token[::-1]
    if is_short_palindrome and token != 'งง':
        return ''
    return token

In [14]:
def _drop_noise_tokens(tokens):
    """Apply `normalize` to each token and keep only the non-empty results."""
    return [kept for kept in map(normalize, tokens) if len(kept) > 0]


# Keep the raw tokenizer output under 'tokenized' and store the cleaned token
# list under 'tokens'. Both sentiment classes go through the same code path
# (previously this loop was copy-pasted once per class).
# Note: re-running this cell overwrites 'tokenized' with the already-cleaned tokens.
for texts in (positive_texts, negative_texts):
    for text in texts:
        text['tokenized'] = text['tokens']
        text['tokens'] = _drop_noise_tokens(text['tokenized'])

In [15]:
# Cache the tokenized records on disk so the slow tokenization step can be
# skipped later. `with` guarantees each file is flushed and closed.
with open('../data/positive_texts.pickle', 'wb') as handle:
    pickle.dump(positive_texts, handle)
with open('../data/negative_texts.pickle', 'wb') as handle:
    pickle.dump(negative_texts, handle)

In [13]:
# Reload the cached tokenized records. `with` closes each file after reading.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook, never files from an untrusted source.
with open('../data/positive_texts.pickle', 'rb') as handle:
    positive_texts = pickle.load(handle)
with open('../data/negative_texts.pickle', 'rb') as handle:
    negative_texts = pickle.load(handle)

In [16]:
# One token list per message, in the same order as the source records.
positive_corpus = [record['tokens'] for record in positive_texts]
negative_corpus = [record['tokens'] for record in negative_texts]

In [17]:
# Number of messages per class. Both counts are taken from the corpora
# (each corpus has exactly one entry per record, so the values are unchanged).
(len(positive_corpus), len(negative_corpus))

(65218, 45175)

In [23]:
# Combined token lists of both classes, positives first; used to fit the TF-IDF vocabulary.
corpus = positive_corpus + negative_corpus

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _identity(tokens):
    """Return the input unchanged; the corpus is already tokenized."""
    return tokens


# The documents are lists of tokens, so both the preprocessing and the
# tokenizing step are replaced by a pass-through. min_df=0.001 is passed as a
# float document-frequency threshold.
tfidf = TfidfVectorizer(min_df=0.001, preprocessor=_identity, tokenizer=_identity)
tfidf_matrix = tfidf.fit_transform(corpus)

In [25]:
# Total TF-IDF weight of each vocabulary term, summed over all documents.
term_totals = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
scores = zip(tfidf.get_feature_names(), term_totals)
# (term, total) pairs, highest total first. `scores` is consumed by this call.
sorted_scores = sorted(scores, key=lambda pair: pair[1], reverse=True)

In [26]:
# Number of terms kept by the vectorizer.
len(tfidf.get_feature_names())

1048

In [27]:
# Show the ten terms with the highest total TF-IDF weight.
for term, total in sorted_scores[:10]:
    print("{0} Score: {1}".format(term, total))

นะ Score: 4053.159158349886
ครับ Score: 3753.375160830107
พี่ Score: 3304.1604271736537
แล้ว Score: 3234.517986155034
ไม่ Score: 3120.435279155623
ก็ Score: 3049.299750173059
เลย Score: 2997.6326993027747
ไป Score: 2892.2679914066857
จะ Score: 2876.397709485032
เรา Score: 2623.261659993489


In [28]:
# Vocabulary ordered by descending total TF-IDF weight.
vocabs = [term for term, _ in sorted_scores]

In [29]:
# Reserve the last vocabulary slot for out-of-vocabulary tokens. The guard
# keeps the cell idempotent: re-running it no longer appends a second 'oov'
# entry (which would shift the OOV index and grow the vocabulary size).
if not vocabs or vocabs[-1] != 'oov':
    vocabs.append('oov')

In [30]:
# Lookup tables between words and integer ids. `idx2word` is the vocabulary
# list itself (same object), indexed by id.
word2idx = {word: idx for idx, word in enumerate(vocabs)}
idx2word = vocabs

In [31]:
x_train = []
labels_train = []
x_test = []
labels_test = []


def _encode_message(message, oov_index):
    """Map each token to its vocabulary id, using `oov_index` for unknown tokens."""
    # dict.get replaces the former bare `except:`, which would also have
    # hidden unrelated errors.
    return [word2idx.get(token, oov_index) for token in message]


def _add_messages(messages, label, train_cap):
    """Encode `messages` and append each one to the train or test lists.

    A message goes to the training lists while they hold fewer than
    `train_cap` examples in total; every later message goes to the test lists.
    """
    oov_index = len(vocabs) - 1  # 'oov' occupies the last vocabulary slot
    for message in messages:
        tokens = _encode_message(message, oov_index)
        # Skip messages with fewer than two tokens, and messages whose ids
        # sum to zero (i.e. every token is vocabulary id 0).
        if len(tokens) > 1 and np.sum(tokens) > 0:
            if len(labels_train) < train_cap:
                x_train.append(tokens)
                labels_train.append(label)
            else:
                x_test.append(tokens)
                labels_test.append(label)


# Positives fill the training set up to 50,000 examples; negatives then fill
# it up to 90,000 in total. Everything beyond those caps becomes test data.
_add_messages(positive_corpus, label=1, train_cap=50000)
_add_messages(negative_corpus, label=0, train_cap=90000)

In [32]:
# Inspect the first encoded training message (a list of vocabulary ids).
x_train[0]

[29, 54, 317, 327, 0, 134, 1048]

In [33]:
# Number of examples in the training and test splits.
(len(x_train), len(x_test))

(90000, 18723)

In [34]:
from sklearn.utils import shuffle
# The training lists were filled with all positives before any negatives, so
# shuffle examples and labels together (fixed seed). The test split is not shuffled.
# Note: this cell rebinds its own inputs, so re-running it shuffles again.
x_train, labels_train = shuffle(x_train, labels_train, random_state=0)

In [35]:
# Convert every encoded message into its own NumPy array of vocabulary ids.
trn = [np.array(seq) for seq in x_train]
test = [np.array(seq) for seq in x_test]

In [36]:
# Inspect one training example after shuffling.
trn[100]

array([  30,  490,   30, 1048])

In [37]:
# Message lengths (in tokens) across the training set: longest, shortest, mean.
lens = np.array([len(seq) for seq in trn])
(lens.max(), lens.min(), lens.mean())

(43, 2, 9.8900222222222229)

In [38]:
# Fixed input length fed to the models.
seq_len = 15

# Bring every message to exactly `seq_len` ids.
# NOTE(review): the pad value 0 is also the id of a real word (`word2idx`
# assigns 0 to the first entry of `vocabs`), so padding is indistinguishable
# from that word. Fixing it means reserving id 0 when the vocabulary is built.
# NOTE(review): padding/truncating sides are left at the pad_sequences
# defaults — confirm which end of a message longer than seq_len is cut.
trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [39]:
# Size of the embedding input: all TF-IDF terms plus the trailing 'oov' entry.
vocab_size = len(vocabs)
vocab_size

1049

In [40]:
# Baseline model: embedding -> one 1-D convolution -> max pooling -> dense
# classifier with a single sigmoid output, with dropout between the stages.
conv1 = Sequential()
conv1.add(Embedding(vocab_size, 32, input_length=seq_len, dropout=0.5))
conv1.add(Dropout(0.5))
conv1.add(Convolution1D(64, 5, border_mode='same', activation='relu'))
conv1.add(Dropout(0.5))
conv1.add(MaxPooling1D())
conv1.add(Flatten())
conv1.add(Dense(100, activation='relu'))
conv1.add(Dropout(0.5))
conv1.add(Dense(1, activation='sigmoid'))

conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [41]:
# Train the baseline for two epochs, validating on the test split after each epoch.
conv1.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=32, verbose=2)

Train on 90000 samples, validate on 18723 samples
Epoch 1/2
34s - loss: 0.5890 - acc: 0.6780 - val_loss: 0.5161 - val_acc: 0.7805
Epoch 2/2
39s - loss: 0.5666 - acc: 0.7000 - val_loss: 0.5127 - val_acc: 0.7886


<keras.callbacks.History at 0x121220c88>

In [186]:
# Save the baseline weights to disk.
conv1.save_weights('../data/conv1.h5')

In [115]:
# Restore previously saved baseline weights (replaces the in-memory weights).
conv1.load_weights('../data/conv1.h5')

In [222]:
# Score the baseline on the test split; presumably returns [loss, accuracy]
# given the loss and metric passed to compile() — TODO confirm.
conv1.evaluate(test, labels_test)



[0.55838080683611713, 0.71212691032172037]

In [52]:
# Hyper-parameters of the multi-filter-width CNN.
sequence_length = seq_len
embedding_dim = 32
filter_sizes = (3, 4)
num_filters = 3
dropout_prob = (0.2, 0.2)
hidden_dims = 100

# Convolutional sub-graph: one Convolution1D + max-pool + flatten branch per
# filter width, all reading the same embedded sequence.
graph_in = Input(shape=(sequence_length, embedding_dim))
convs = []
for fsz in filter_sizes:
    conv = Convolution1D(nb_filter=num_filters,
                         filter_length=fsz,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(graph_in)
    pool = MaxPooling1D(pool_length=2)(conv)
    flatten = Flatten()(pool)
    convs.append(flatten)

# Concatenate the branches; with a single filter width there is nothing to merge.
if len(filter_sizes) > 1:
    out = Merge(mode='concat')(convs)
else:
    out = convs[0]

graph = Model(input=graph_in, output=out)

# main sequential model
model = Sequential()
model.add(Embedding(len(vocabs), embedding_dim, input_length=sequence_length))
model.add(Dropout(dropout_prob[0], input_shape=(sequence_length, embedding_dim)))
model.add(graph)
# The stray trailing comma that used to follow this call (turning the
# statement into a one-element tuple expression) has been removed.
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dropout(dropout_prob[1]))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [59]:
# Train the multi-filter model for two epochs, validating on the test split.
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=32, verbose=2)

Train on 90000 samples, validate on 18723 samples
Epoch 1/2
15s - loss: 0.4512 - acc: 0.7885 - val_loss: 0.4657 - val_acc: 0.7785
Epoch 2/2
15s - loss: 0.4509 - acc: 0.7889 - val_loss: 0.4751 - val_acc: 0.7727


<keras.callbacks.History at 0x12725f048>

In [90]:
# Save the multi-filter model weights to disk.
model.save_weights('../data/conv2.h5')

In [240]:
# Restore previously saved multi-filter model weights (replaces the in-memory weights).
model.load_weights('../data/conv2.h5')

In [48]:
def text2test(text, vocabs, seq_len):
    """Turn a raw string into a padded id sequence ready for `model.predict`.

    Parameters
    ----------
    text : str
        Raw input message.
    vocabs : list
        Vocabulary list whose last entry is the out-of-vocabulary slot. It is
        expected to be the list `word2idx` was built from.
    seq_len : int
        Length the id sequence is padded or truncated to.

    Returns
    -------
    numpy array of shape (1, seq_len).

    Ids are looked up directly in the `word2idx` dict instead of testing
    membership in the `vocabs` list for every token (a linear scan each time).

    NOTE(review): the training corpus was additionally filtered through
    `normalize` (noise tokens dropped); that step is not applied here, so such
    tokens are mapped to the OOV id instead of being removed.
    """
    oov_index = len(vocabs) - 1
    test_ids = [word2idx.get(token, oov_index) for token in tokenize(text)]
    test_seq = sequence.pad_sequences([test_ids], maxlen=seq_len, value=0)
    return test_seq[0].reshape(1, seq_len)

In [64]:
# Stack the padded train and test matrices row-wise and join their labels,
# giving a single data set that contains every example.
x_combo = np.concatenate((trn, test), axis=0)
y_combo = labels_train + labels_test

In [75]:
# One more epoch on the combined train + test data.
# NOTE(review): the validation data (`test`) is part of `x_combo`, so the
# val_loss / val_acc reported by this call are not held-out metrics.
model.fit(x_combo, y_combo, validation_data=(test, labels_test), nb_epoch=1, batch_size=32, verbose=2)

Train on 108723 samples, validate on 18723 samples
Epoch 1/1
19s - loss: 0.4494 - acc: 0.7895 - val_loss: 0.4397 - val_acc: 0.7912


<keras.callbacks.History at 0x1269b5588>

In [88]:
# Hand-written probe sentences for an informal look at the model's scores
# (training labels were 1 for positive and 0 for negative).
test_texts = [
    "ทำไมถึงมีปัญหา",
    "เบื่อที่ต้องมาตามงาน",
    "ตอบกลับได้เร็ว แต่มั่วมาก",
    "น่าเบื่อ",
    "ขอด่าหน่อยเฮอะ",
    "ยากมาก",
    "รอเป็นชั่วโมง ยังไม่ได้ซักที",
    "รอนานแล้ว ก็ยังไม่มีคนตอบกลับ",
    "พ่อแม่ไม่สั่งสอน",
    "ที่นี้บริการช้า",
    "ไม่อยากจะยุ่งแล้ว",
    "ไม่ควรน่ารักขนาดนี้มั้ย",
]

# Swap `model` for `conv1` below to probe the baseline network instead.
for text in test_texts:
    test_seq = text2test(text, vocabs, seq_len)
    prediction = model.predict(test_seq)
    print((text, prediction))

('ทำไมถึงมีปัญหา', array([[ 0.17876832]], dtype=float32))
('เบื่อที่ต้องมาตามงาน', array([[ 0.30594844]], dtype=float32))
('ตอบกลับได้เร็ว แต่มั่วมาก', array([[ 0.55690104]], dtype=float32))
('น่าเบื่อ', array([[ 0.10341965]], dtype=float32))
('ขอด่าหน่อยเฮอะ', array([[ 0.28342849]], dtype=float32))
('ยากมาก', array([[ 0.15824707]], dtype=float32))
('รอเป็นชั่วโมง ยังไม่ได้ซักที', array([[ 0.24052288]], dtype=float32))
('รอนานแล้ว ก็ยังไม่มีคนตอบกลับ', array([[ 0.57998943]], dtype=float32))
('พ่อแม่ไม่สั่งสอน', array([[ 0.29850295]], dtype=float32))
('ที่นี้บริการช้า', array([[ 0.24150842]], dtype=float32))
('ไม่อยากจะยุ่งแล้ว', array([[ 0.25826687]], dtype=float32))
('ไม่ควรน่ารักขนาดนี้มั้ย', array([[ 0.54214513]], dtype=float32))


In [89]:
# Longer, headline-style probe sentences scored with the multi-filter model.
test_texts = [
    "เด็กวิศวะจุฬา นี้ระดับมันสมองอันดับ 1 ของประเทศ แต่ทำไมมีแนวคิดเพี้ยน ไปแบบนั้น ?",
    "แชร์เรื่องโกงๆ ในที่ทำงาน",
    "แชร์ประสบการณ์ถูกพนักงานแบงค์เขียวรูดบัตรเครดิตไปเกือบล้านบาท",
]

for text in test_texts:
    test_seq = text2test(text, vocabs, seq_len)
    prediction = model.predict(test_seq)
    print((text, prediction))

('เด็กวิศวะจุฬา นี้ระดับมันสมองอันดับ 1 ของประเทศ แต่ทำไมมีแนวคิดเพี้ยน ไปแบบนั้น ?', array([[ 0.39049369]], dtype=float32))
('แชร์เรื่องโกงๆ ในที่ทำงาน', array([[ 0.83812088]], dtype=float32))
('แชร์ประสบการณ์ถูกพนักงานแบงค์เขียวรูดบัตรเครดิตไปเกือบล้านบาท', array([[ 0.56037796]], dtype=float32))
