# Training Data For Sentiment Analysis

In [1]:
import pandas as pd
DATASET_ENCODING = "ISO-8859-1"
data = pd.read_csv('Clean_Tweets_Indonesia_fix.csv', encoding =DATASET_ENCODING)
data['text']=data['text'].apply(str)
X = data.iloc[:,[1]]

In [2]:
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
print("TRAIN size:", len(X_train))
print("TEST size:", len(X_train))

TRAIN size: 582686
TEST size: 582686


In [3]:
import gensim

# WORD2VEC 
W2V_SIZE = 250
W2V_WINDOW = 5
W2V_EPOCH = 32
W2V_MIN_COUNT = 8

documents = [_text.split() for _text in X_train.text] 
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)
w2v_model.build_vocab(documents)

In [4]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 31148


In [5]:
# Train Word Embeddings
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

(186167143, 211893536)

In [14]:
w2v_model.most_similar("jelek")

  """Entry point for launching an IPython kernel.


[('jelekin', 0.5206229090690613),
 ('bagus', 0.44332534074783325),
 ('benci', 0.4259550869464874),
 ('salah', 0.4066743850708008),
 ('fitnah', 0.40463048219680786),
 ('yg', 0.4039088487625122),
 ('bela', 0.40158525109291077),
 ('ngejelek', 0.40026962757110596),
 ('lu', 0.3976461887359619),
 ('hina', 0.39702296257019043)]

In [7]:
# Tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.utils.np_utils import to_categorical

# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 300
# This is fixed.
EMBEDDING_DIM = 300

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train.text)
word_index = tokenizer.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % len(word_index))

# Convert the data to padded sequences
X_train_padded = tokenizer.texts_to_sequences(X_train.text)
X_train_padded = pad_sequences(X_train_padded, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_train_padded.shape)

Using TensorFlow backend.


Found 142065 unique tokens.
Shape of data tensor: (582686, 300)


In [8]:
# saving
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
import numpy as np
embedding_matrix = np.zeros((vocab_size+1, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(142066, 250)


In [10]:
import keras 

model = Sequential()
model.add(Embedding(vocab_size+1, W2V_SIZE, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 250)          35516500  
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 250)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               140400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 35,657,001
Trainable params: 140,501
Non-trainable params: 35,516,500
_________________________________________________________________


In [11]:
from keras.models import load_model
model = load_model('Sentiment_LSTM_model.h5')

In [12]:
def predict(text, include_neutral=True):
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=MAX_SEQUENCE_LENGTH)
    # Predict
    score = model.predict([x_test])[0]
    if(score >=0.4 and score<=0.6):
        label = "Neutral"
    if(score <=0.4):
        label = "Negative"
    if(score >=0.6):
        label = "Positive"

    return {"label" : label,
        "score": float(score)}

In [19]:
predict("")

{'label': 'Positive', 'score': 0.7290142178535461}