In [1]:
import keras
import pandas
import numpy
import sklearn.metrics
import matplotlib.pyplot
import seaborn

Using TensorFlow backend.


In [None]:
# --- Data / text-representation settings ---
# Upper bound on how many tweets are sliced out of the dataset.
NUMBER_TWEETS = 100000
# Passed to the tokenizer as num_words and to the embedding layer as input_dim.
VOCABULARY_SIZE = 50000
# Max number of words used to represent each tweet; tweets are padded or
# truncated so that this is their length.
MAX_NUM_WORDS = 20

# --- Model / training settings ---
# Width of each word vector (embedding layer output_dim).
EMBEDDING_SIZE = 100
# Arguments forwarded to model.fit.
BATCH_SIZE = 10000
NUM_EPOCHS = 10

In [None]:
def get_binary(x):
    """Map a sentiment label to a binary target.

    Returns 1 for 'positive', 0 for 'negative', and the string 'NULL'
    for any other value.
    """
    if x == 'positive':
        return 1
    if x == 'negative':
        return 0
    # Anything that is not one of the two polar labels.
    return 'NULL'

In [None]:
# Load the airline tweets CSV, keeping only the label column and the raw text.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
tweets = pandas.read_csv(
    '/Users/laurapallett/data/niclas_thomas/airline_tweets/Tweets.csv'
).loc[:, ['airline_sentiment', 'text']]

In [None]:
# Keep only the two polar classes. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the loaded data (which triggers pandas' SettingWithCopyWarning).
tweets = tweets[tweets['airline_sentiment'].isin(['positive','negative'])].copy()

# Strip '@' characters from the text. A plain '@' is used instead of '\@':
# the backslash form is an invalid string escape, and when the pattern is
# matched literally it would look for backslash+@ and remove nothing.
# '@' has no special regex meaning, so this behaves the same in either mode.
tweets['clean_text'] = tweets['text'].str.replace('@', '')

# Binary target: 1 = positive, 0 = negative (see get_binary).
tweets['sentiment'] = tweets['airline_sentiment'].apply(get_binary)
tweets.head()

## Process Raw Text

In [None]:
# Raw tweet strings fed to the tokenizer (at most NUMBER_TWEETS of them).
texts = tweets['text'].tolist()[0:NUMBER_TWEETS]
# Targets as a NumPy array rather than a Python list.
# NOTE(review): standalone Keras appears to interpret a list passed as the
# target as "one array per model output", which makes fit() reject a plain
# list of labels for this single-output model — confirm against the installed
# Keras version.
y_train = numpy.asarray(tweets['sentiment'].tolist()[0:NUMBER_TWEETS])

In [None]:
# Fit a Keras tokenizer on the tweet texts (num_words=VOCABULARY_SIZE), keep
# its word -> index mapping, and encode every tweet as a sequence of word ids.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Pad the encoded tweets with maxlen=MAX_NUM_WORDS.
X_train = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_NUM_WORDS)
# NOTE(review): X_test is derived from the same sequences as X_train, so it is
# an identical copy of the training data rather than a held-out set.
X_test = X_train.copy()

In [None]:
# Peek at the first ten padded id sequences.
X_train[:10]

In [None]:
# The raw tweets corresponding to the sequences shown above.
texts[:10]

In [None]:
# Embedding -> LSTM -> single sigmoid unit for binary sentiment.
# The embedding layer is named so its weights can be fetched by name later.
model = keras.models.Sequential([
    keras.layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=EMBEDDING_SIZE,
        input_length=MAX_NUM_WORDS,
        name='embedding',
    ),
    keras.layers.LSTM(100),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; report accuracy during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the padded sequences for NUM_EPOCHS passes in batches of BATCH_SIZE.
model.fit(x=X_train, y=y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

In [None]:
# Model outputs for the first ten rows of X_test.
model.predict(X_test[:10])

## Get Word Embedding and Find Most Similar Words

In [None]:
# Learned embedding matrix: one row per word id, EMBEDDING_SIZE columns.
weights = model.get_layer('embedding').get_weights()[0]

# Only ids 0..min(VOCABULARY_SIZE - 1, len(word_index)) can ever be produced by
# the tokenizer/padding, so the remaining rows are never looked up in training.
# Limiting the pairwise comparison to the used rows avoids building a
# VOCABULARY_SIZE x VOCABULARY_SIZE matrix (50000^2 entries) when the real
# vocabulary is smaller. Row/column i still corresponds to word id i.
num_used_rows = min(VOCABULARY_SIZE, len(word_index) + 1)
similarities = sklearn.metrics.pairwise.cosine_similarity(weights[:num_used_rows])
# Round in place so no second full-size copy of the matrix is allocated.
numpy.round(similarities, 2, out=similarities)

In [None]:
# Heatmap of the raw embedding weights (rows: word ids, columns: embedding dimensions).
fig, ax = matplotlib.pyplot.subplots(figsize=(10,10))
# Draw explicitly on the axes created above instead of relying on the current-axes state.
seaborn.heatmap(weights, ax=ax)
# set_xticks requires the tick positions; the original no-argument call raises
# TypeError. An empty list hides the x ticks.
ax.set_xticks([])
ax.set_xlabel('embedding dimension')
ax.set_ylabel('word index')
ax.set_title('Embedding layer weights')
matplotlib.pyplot.show()