# Sentiment Analysis
This notebook shows how to create a model that analyzes a user's mood from the text they type. It is based on an excellent tutorial about classifying tweets: https://vgpena.github.io/classifying-tweets-with-keras-and-tensorflow/

The dataset can be downloaded here: http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [1]:
import numpy as np
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Dropout, Activation

Using Theano backend.


Common constants for the notebook.

In [2]:
# directory holding the dataset CSV and every file this notebook writes
path = 'datasets/sentimental/'
# file name (under `path`) for the trained model weights
model_weights = 'model.h5'
# file name (under `path`) for the model architecture serialized as JSON
json_model_file = 'model.json'

# only work with the 3000 most popular words found in the dataset
max_words = 3000

# Build the model

Building the model can be skipped if it was done before and the model file and weights have been saved. However, the model must be retrained whenever a new dataset is used.

In [3]:
def text_to_array(dictionary, text):
    """
    Translate a text into the list of IDs of its words.

    The text is split with kpt.text_to_word_sequence and every resulting
    word is looked up in `dictionary` (word -> ID). A word missing from
    the dictionary raises KeyError.
    """
    tokens = kpt.text_to_word_sequence(text)
    return [dictionary[token] for token in tokens]

In [4]:
def build_model(max_words, train_x, train_y, batch_size=32, epochs=5, validation_split=0.1):
    """
    Build, compile and train the sentiment classifier.

    The network is a stack of two dense hidden layers (512 relu units and
    256 sigmoid units), each followed by 50% dropout, and a 2-unit softmax
    output. It is compiled with categorical cross-entropy and the adam
    optimizer, then fitted on the given data.

    Parameters:
        max_words: width of the input layer, i.e. the number of columns
            each row of `train_x` has.
        train_x: training inputs passed to model.fit.
        train_y: training targets passed to model.fit; the output layer
            has 2 units, so two categories are expected.
        batch_size: samples per gradient update (default 32).
        epochs: number of passes over the training data (default 5).
        validation_split: fraction of the data held out for validation
            (default 0.1).

    Returns:
        The fitted Sequential model.
    """
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, verbose=1,
              validation_split=validation_split, shuffle=True)
    return model

In [5]:
def training_data(csv_path=None):
    """
    Load the texts and their sentiment labels from the dataset CSV.

    The first line is skipped as a header. In every following row the
    second field (index 1) is read as the integer label and everything
    after the third comma is kept as the text. Rows are split by hand
    instead of with np.genfromtxt, because genfromtxt cuts the text at its
    first comma and discards everything after a '#', which truncates texts
    containing either character.

    Parameters:
        csv_path: file to read; defaults to path + 'sentiment-analysis.csv'.

    Returns:
        (train_x, train_y): a list of text strings and a numpy integer
        array with the matching labels.

    Raises:
        ValueError: if a non-blank row has fewer than four fields or its
        label is not an integer.
    """
    if csv_path is None:
        csv_path = path + 'sentiment-analysis.csv'

    train_x = []
    labels = []
    with open(csv_path, 'r') as csv_file:
        # the first line holds the column names
        next(csv_file, None)
        for line_number, line in enumerate(csv_file, start=2):
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            # split on the first three commas only, so that commas inside
            # the text stay part of the text
            fields = line.split(',', 3)
            if len(fields) < 4:
                raise ValueError('line %d: expected 4 fields, got %d'
                                 % (line_number, len(fields)))
            label = int(fields[1])
            labels.append(label)
            train_x.append(fields[3])
    # index all the labels
    train_y = np.asarray(labels, dtype=int)
    return train_x, train_y

In [6]:
def create_dictionary(tokenizer):
    """
    Return the tokenizer's word -> ID mapping and persist it as JSON.

    The mapping is written to path + 'dictionary.json' so that it can be
    loaded again later without the tokenizer.
    """
    word_ids = tokenizer.word_index
    target = path + 'dictionary.json'
    with open(target, 'w') as out:
        out.write(json.dumps(word_ids))
    return word_ids

In [7]:
# the tokenizer is limited to the max_words most frequent words
tokenizer = Tokenizer(num_words=max_words)
# load the texts and labels, then let the tokenizer learn the vocabulary
train_x, train_y = training_data()
tokenizer.fit_on_texts(train_x)
# keep (and save) the learned word -> ID mapping
dictionary = create_dictionary(tokenizer)

In [8]:
# for each tweet, change each token to its ID in the Tokenizer's word_index
# (a comprehension keeps the loop variables out of the notebook namespace)
allWordIndices = [text_to_array(dictionary, text) for text in train_x]

In [9]:
# one-hot encode the labels into two categories
train_Y = keras.utils.to_categorical(train_y, num_classes=2)
# turn the ID lists into a binary matrix, one row per tweet
train_X = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

In [10]:
# build, compile and train the network on the encoded tweets and their labels
model = build_model(max_words, train_X, train_Y)

Train on 1420764 samples, validate on 157863 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# persist the trained network: first the weights ...
model.save_weights(path + model_weights)

# ... then the architecture, serialized as JSON
json_model = model.to_json()
with open(path + json_model_file, 'w') as json_file:
    json_file.write(json_model)

# Use the model to classify an input

Here we are going to use the model to classify an input.

In [12]:
def words_to_index_array(text):
    """
    Convert a text into the list of IDs of its known words.

    Words are looked up in the global `dictionary`; a word that is not in
    it is reported on stdout and left out of the result.
    """
    indices = []
    for token in kpt.text_to_word_sequence(text):
        if token not in dictionary:
            print("'%s' not in training data; ignoring." %(token))
            continue
        indices.append(dictionary[token])
    return indices

In [13]:
def eval_sentence(sentence, tokenizer, model):
    """
    Classify a sentence and print the predicted sentiment with its confidence.

    Nothing is printed or returned for an empty sentence. `tokenizer` is
    only used to turn the word IDs into a binary input row, and `model`
    to predict on that row.
    """
    if len(sentence) == 0:
        return
    # encode the sentence the same way the training data was encoded
    word_ids = words_to_index_array(sentence)
    features = tokenizer.sequences_to_matrix([word_ids], mode='binary')
    pred = model.predict(features)
    # position of the highest score doubles as the index of its label
    best = np.argmax(pred)
    labels = ['negative', 'positive']
    print("%s sentiment; %f%% confidence" % (labels[best], pred[0][best] * 100))

The following step can be skipped if the model has just been trained and is still in memory.

In [14]:
# read in saved dictionary
with open(path + 'dictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

# read saved model structure; the with-block closes the file even if
# reading fails
with open(path + json_model_file, 'r') as json_file:
    loaded_model_json = json_file.read()

# create a model from that
model = model_from_json(loaded_model_json)
# weight nodes with saved values
model.load_weights(path + model_weights)

In [18]:
# new tokenizer
# NOTE(review): this tokenizer is never fitted; eval_sentence only calls
# sequences_to_matrix on it, and the word IDs come from `dictionary`
tokenizer = Tokenizer(num_words=max_words)

In [20]:
# classify a sample sentence; prints the predicted label and its confidence
eval_sentence('What a nice day', tokenizer, model)

positive sentiment; 79.433322% confidence


In [22]:
# classify a second sample sentence
eval_sentence('I like to kiss you', tokenizer, model)

positive sentiment; 66.858613% confidence


In [23]:
# classify a third sample sentence
eval_sentence('This product is crap.', tokenizer, model)

negative sentiment; 90.897590% confidence
