In [236]:
import numpy as np
import pandas as pd

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM



# Packages for data preparation
# from sklearn.model_selection import train_test_split # Dont need because data folder has already spilt
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

from textblob import TextBlob
import re, string, unicodedata
import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer


#For Training Multinomial Naive Bayess
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV


Explains embeddings very well: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

GloVe embeddings: https://nlp.stanford.edu/projects/glove/

# General Notes and Presentation Notes

* After I have a working model, I can concatenate training and test sets because my testing will be with streamed tweets! 
* There is something wrong with the way that I'm preprocessing tweets, or maybe the word embedding files that I'm using does not contain what I expect
* Naive Bayes VS Glove Embeddings w/ LSTM VS Glove Embeddings w/ stacked LSTMs VS Embeddings + CNN VS ELMO
* When we incorporate backend use the following article for preprocessing:

https://towardsdatascience.com/extracting-twitter-data-pre-processing-and-sentiment-analysis-using-python-3-0-7192bd8b47cf?gi=a2d3e63ba57a

* Great for explaining Word EMbeddings


In [237]:
from nltk.tokenize import sent_tokenize, word_tokenize

data = pd.read_csv("sa-emotions/train_data.csv")


In [238]:
data.head()

data["sentiment"].unique()

data.describe()

data = data[0:50]

data["content"] = data.apply(lambda row: word_tokenize(row['content']), axis=1)

# data

In [242]:
len(data["sentiment"].unique())

10

## Text Preprocessing

In [207]:
''' Download for lemmantizing words'''
# nltk.download('wordnet')

' Download for lemmantizing words'

In [208]:

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def spell_correct(words):
    """ Fix spelling corrections """
    corrected_words = []
    
    for word in words:
        corrected_words.append(TextBlob(word).correct().raw)

    return corrected_words

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = lemmatize_verbs(words)
    words = remove_stopwords(words)
    words = spell_correct(words)
    return words

# words = normalize(X_train)
data["content"] = data["content"].apply(lambda x: " ".join(normalize(x)))

In [209]:
data.head()

Unnamed: 0,sentiment,content
0,empty,tiffanylue know listening bad habit earlier st...
1,sadness,laying n bed headache ughhhh waiting call
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,want hang friends soon
4,neutral,dannycastillo want trade someone houston ticke...


## Naive Bayes w/ TFIDF
* We make the assumption that the features are conditionally independent given an output class. This assumption is most likely not true — hence the name naive Bayes classifier, but the classifier nonetheless performs well in most situations.

In [214]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


tf = TfidfVectorizer()
text_tf = tf.fit_transform(data['content'])

X_train, X_test, y_train, y_test = train_test_split(text_tf, data['sentiment'], test_size=0.3, random_state=123)

# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)

print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.2


## Tokenize each tweet to be an array of words

In [217]:
X_train = data.apply(lambda row: word_tokenize(row['content']), axis=1)
# X_train

In [218]:
# Label encoding outcome classes
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(data['sentiment'].unique())
output_classes = label_encoder.transform(data['sentiment'].unique()) 
output_classes

array([0, 7, 1, 6, 9, 8, 5, 2, 4, 3])

### Keras Tokenizer will convert the tweets to sequences so that they can be passed through embedding matrices.

In [219]:
tk = Tokenizer(lower = True, filters='')
tk.fit_on_texts(X_train)

train_tokenized = tk.texts_to_sequences(X_train)
# test_tokenized = tk.texts_to_sequences(test['tweet'])

max_len = 100
X_train = pad_sequences(train_tokenized, maxlen = max_len)


# labels = to_categorical(np.asarray(data['sentiment'].unique()))

# X_test = pad_sequences(test_tokenized, maxlen = max_len)
# X_train

**fit_on_texts** Updates internal vocabulary based on a list of texts. This method creates the vocabulary index based on word frequency. So if you give it something like, "The cat sat on the mat." It will create a dictionary s.t. word_index["the"] = 1; word_index["cat"] = 2 it is word -> index dictionary so every word gets a unique integer value. 0 is reserved for padding. So lower integer means more frequent word (often the first few are stop words because they appear a lot).

**texts_to_sequences** Transforms each text in texts to a sequence of integers. So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary. Nothing more, nothing less, certainly no magic involved.

### Loading Glove embeddings into dictionary [MEMORY ISSUE]

In [220]:
embeddings_index = {}
file = open('../data/glove.twitter.27B/glove.twitter.27B.100d.txt')
for line in file:
    values = line.split(' ')
    word = values[0] # The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
file.close()

print('GloVe data loaded')

GloVe data loaded


In [243]:
from numpy import zeros

'''
Create a matrix of one embedding for each word in the training dataset. 
We can do that by enumerating all unique words in the Tokenizer.word_index 
and locating the embedding weight vector from the loaded GloVe embedding.
'''

vocab_size = len(tk.word_index) + 1

embedding_matrix = zeros((vocab_size, 100))
for word, i in tk.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [221]:
len(tk.word_index)

270

In [222]:
len(output_classes)

10

In [259]:
NB_WORDS = 270  # Parameter indicating the number of words we'll put in the dictionary
VAL_SIZE = len(X_train)  # Size of the validation set
NB_START_EPOCHS = 10  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 100  # Maximum number of words in a sequence
GLOVE_DIM = 100  # Number of dimensions of the GloVe word embeddings
# root = Path('../')
# input_path = root / 'input/'
# ouput_path = root / 'output/'
# source_path = root / 'source/'

array([0, 7, 1, 6, 9, 8, 5, 2, 4, 3])

In [266]:
emb_model = Sequential()
emb_model.add(Embedding(NB_WORDS, 8, input_length=MAX_LEN))
emb_model.add(Flatten())
emb_model.add(Dense(10, activation='softmax'))
# emb_history = deep_model(emb_model, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)
# compile the model
emb_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
print(emb_model.summary())
# fit the model
emb_model.fit(X_train, output_classes, epochs=5, verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 100, 8)            2160      
_________________________________________________________________
flatten_14 (Flatten)         (None, 800)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 10)                8010      
Total params: 10,170
Trainable params: 10,170
Non-trainable params: 0
_________________________________________________________________
None


ValueError: Error when checking target: expected dense_14 to have shape (10,) but got array with shape (1,)

### Building RNN Model

If you wish to connect a Dense layer directly to an Embedding layer, you must first flatten the 2D output matrix to a 1D vector using the Flatten layer.

We chose the 100-dimensional version, therefore the Embedding layer must be defined with output_dim set to 100.

In [247]:
# Want to add Stacked LSTM

# Added Recurrent single LSTM layer, 
# model.add(LSTM(64, return_sequences=False,dropout=0.1, recurrent_dropout=0.1))
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten

# Init model
model = Sequential()
# Embedding layer using Glove Pretrained weights
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=100, trainable=False))
# Flattening to conncet to Dense layer
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(X_train, output_classes, epochs=50, verbose=0)
# evaluate the model
# loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          27100     
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                100010    
Total params: 127,110
Trainable params: 100,010
Non-trainable params: 27,100
_________________________________________________________________
None


ValueError: Error when checking target: expected dense_1 to have shape (10,) but got array with shape (1,)

## Data Preprocessing Notes
* Need to remove mentions


 ### Medium Research Notes:

If we need to create a embedding mapping for words with training, how does this affect how we embed the test data (streamed tweets)? Maybe just use Glove.

Shall we use Embedding layer as our input layer?

Preprocessing
KD article
Padding
Remove stop words


CNN training is 1.5 times faster than training LSTMs

Need to determine the unique vocabulary of training in order to determine embedding vector length, then I think you pad off of that? Not sure about the padding preprocessing step.

Pathtoword embeddings is the path where we’ve  downloaded word embeddings via glove or number batch

Is it cheating if I understand all the processing of what this article is doing, and add my own preprocessing, and maybe additional steps? 
https://medium.com/@panghalarsh/sentiment-analysis-in-python-using-keras-glove-twitter-word-embeddings-and-deep-rnn-on-a-combined-580646cb900a

Word Embeddings is better than count vectorizers that one hot encodes sentences because the bagofwords model will not scale well to large datasets due to create sparse vectors in high dimensionality. Also we lose semantics through BagofWords approach. Word embeddings are dense vectors with much lower dimensionality. Secondly, the semantic relationships between words are reflected in the distance and direction of the vectors.

Should read into each Keras library being utilized and understanding 


Need to encode output classes (emotions) w/ LabelEncoders

Why do we need to Flatten after we do embedding? Like what does that do?

In the Embedding layer (which is layer 0 here) we set the weights for the words to those found in the GloVe word embeddings. By setting trainable to False we make sure that the GloVe word embeddings cannot be changed.

The best result is achieved with 100-dimensional word embeddings that are trained on the available data. By doing this, we do not take into account the relationships between the words in the tweet. This can be achieved with a recurrent neural network or a 1D convolutional network. But that’s something for a future post.
— Bert Carremans 


Do we need padding when utilizing word embeddings? I saw some Word2Vec tutorials using padding one hot encoded sequences, but my understanding was that we don’t need it for embeddings.
https://medium.com/datadriveninvestor/sentiment-analysis-using-embeddings-f3dd99aeaade


Why train my own word embeddings at all? Why not always use Glove? It seems to be a better trained for my model’s embedding layer anyways.


There are so many state of the art embedding techniques like SkipGrams, FastText, and ELMo. I should investigate these for their result to text summarizations.


In [14]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Flatten
import numpy as np

# check this out:
# https://stats.stackexchange.com/questions/270546/how-does-keras-embedding-layer-work
model = Sequential()
model.add(Embedding(7, 3, input_length=5))
# model.add(Flatten())
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be
# no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

input_array = np.array([[0, 1, 2, 3, 4], [5, 1, 2, 3, 6]])
print(input_array)
# model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
print(output_array)

[[0 1 2 3 4]
 [5 1 2 3 6]]
[[[ 0.01658997 -0.03503971 -0.03524492]
  [ 0.02105062 -0.02567688 -0.02039844]
  [-0.02736325 -0.00165455  0.00108389]
  [-0.01605218  0.04671435 -0.04131665]
  [ 0.02627129 -0.02832104  0.01373578]]

 [[-0.03747336 -0.02579566  0.03765706]
  [ 0.02105062 -0.02567688 -0.02039844]
  [-0.02736325 -0.00165455  0.00108389]
  [-0.01605218  0.04671435 -0.04131665]
  [-0.04258297  0.01286627  0.01142017]]]


## Using ELMo's pretrained model
* When we import this model, should keep it trainable?