# <center> Deep NLP

## <center> Word Embeddings

<center> Capturing <b>semantic meaning</b> </center>

<center><img src="embeddings.png" alt="Word embeddings"></center>

In [None]:
import pandas as pd
import json

# lines=True: the file is read as newline-delimited JSON, one record per line.
df = pd.read_json('sarcasm_data.json', orient='records', lines=True)
df.head()

In [None]:
# Pull the headline text and the sarcasm label column out of the frame as arrays.
corpus, sentiments = df['headline'].values, df['is_sarcastic'].values

In [None]:
# Peek at a single raw headline before any tokenization is applied.
corpus[12]

In [None]:
## create tokenizer, fit to corpus
from keras.preprocessing.text import Tokenizer
# num_words only limits which indices texts_to_sequences emits;
# fitting still records every word of the corpus in word_index.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(corpus)

In [None]:
# word_index holds one entry per distinct word in the corpus, so displaying it
# whole floods the notebook output; preview only its first 20 entries.
dict(list(tokenizer.word_index.items())[:20])

In [None]:
## keras tokenizer skips index 0 so we increase vocab length by 1
## derived from the tokenizer so the cap and the embedding size cannot drift apart
vocab_length = tokenizer.num_words + 1

In [None]:
## encode corpus
encoded_corpus = tokenizer.texts_to_sequences(corpus)
# Show only the first few encoded headlines; a bare list is printed in full.
encoded_corpus[:5]

In [None]:
## find the longest encoded headline (length counted in tokens)
review_length = max(map(len, encoded_corpus))
review_length

In [None]:
## pad every encoded headline to the longest length; padding='post' appends the padding at the end
from keras.preprocessing.sequence import pad_sequences
padded_corpus = pad_sequences(encoded_corpus, maxlen=review_length, padding='post')
padded_corpus

In [None]:
## train test split (20% held out, fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    padded_corpus,
    sentiments,
    test_size=0.2,
    random_state=4,
)

In [None]:
## build model with Embedding layer
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding

# Embedding (50-d, learned) -> Flatten -> Dense(500, relu) -> Dense(1, sigmoid)
model = Sequential([
    Embedding(vocab_length, 50, input_length=review_length),
    Flatten(),
    Dense(500, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer stack and parameter counts of the model built above.
model.summary()

In [None]:
## compile and fit
# Binary cross-entropy pairs with the single sigmoid output unit;
# the held-out split is passed as validation data for per-epoch metrics.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

## <center> Using Pretrained Word Embeddings 

### <center> GloVe </center>
<center> <a href="https://nlp.stanford.edu/projects/glove/">https://nlp.stanford.edu/projects/glove/</a> </center>

In [None]:
import numpy as np
def load_glove_embeddings(glove_file):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace; the first token is used as
    the word and the remaining tokens are parsed as floats into a NumPy array.

    Parameters
    ----------
    glove_file : str
        Path to the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each word to its ``numpy.ndarray`` embedding. If a word occurs
        more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    embeddings_dictionary = {}
    # The context manager closes the file even if parsing raises midway.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            embeddings_dictionary[word] = embedding
    print("Done.",len(embeddings_dictionary)," words loaded!")
    return embeddings_dictionary

In [None]:
# Path to the GloVe vectors file; the embedding matrix built later assumes
# 50 values per word — presumably what "50d" in the filename denotes.
glove_file = 'glove.6B.50d.txt'
embeddings_dictionary = load_glove_embeddings(glove_file)

In [None]:
import numpy as np
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows stay all-zero for index 0 and for words GloVe does not contain.
embedding_matrix = np.zeros((vocab_length, 50))
for word, index in tokenizer.word_index.items():
    if index < vocab_length and word in embeddings_dictionary:
        embedding_matrix[index] = embeddings_dictionary[word]

In [None]:
# Inspect the assembled matrix (rows left at zero had no GloVe vector copied in).
embedding_matrix

In [None]:
from keras.initializers import Constant
# Same architecture as the first model, but the embedding table is initialised
# from the GloVe matrix and frozen (trainable=False), so only the dense layers learn.
# The matrix goes in through embeddings_initializer, the documented constructor
# argument, rather than the legacy `weights=` keyword.
model = Sequential()
model.add(Embedding(vocab_length,
                    50,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=review_length,
                    trainable=False))
model.add(Flatten())
model.add(Dense(500, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer stack and parameter counts of the frozen-embedding model.
model.summary()

In [None]:
## compile and fit
# Same optimiser, loss and schedule as the first model, so the two runs are comparable.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

## <center> Long Short Term Memory Networks (LSTM)

https://www.youtube.com/watch?v=8HyCNIVRbSU

In [None]:
## import the dataset
import pandas as pd
# NOTE: this rebinds `df`, replacing the sarcasm frame loaded at the top of the notebook.
df = pd.read_csv('trump_tweets.csv')
df.head()

In [None]:
## remove URLs from tweets
import re
num_tweets = 5000
# Raw string: in a normal literal "\S" is an invalid escape sequence and triggers
# a warning on recent Python versions. The pattern is compiled once and reused.
url_pattern = re.compile(r'http[s]?://\S+')
# Lower-case each tweet and append a sentinel word so the model can learn where tweets end.
corpus = [url_pattern.sub('', tweet).lower() + ' endoftweet'
          for tweet in df['Tweet_Text'][0:num_tweets].values]

In [None]:
# Preview a handful of cleaned tweets; displaying the whole list would print every tweet.
corpus[:10]

In [None]:
## create tokenizer object and fit it to corpus, check the total number of unique words
# num_words caps the indices emitted when encoding; word_index still lists every word,
# so its length is the full vocabulary size.
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(corpus)
num_words = len(tokenizer.word_index)
num_words

In [None]:
## encode the corpus
encoded_corpus = tokenizer.texts_to_sequences(corpus)
# Show only the first few encoded tweets; a bare list is printed in full.
encoded_corpus[:5]

In [None]:
## create X where each data point is the previous 3 words
## create y where each data point is the following word
X, y = [], []
for tweet in encoded_corpus:
    # Starting at position 3 guarantees three preceding tokens exist;
    # tweets with three or fewer tokens contribute no samples.
    for index in range(3, len(tweet)):
        X.append(tweet[index-3:index])
        y.append(tweet[index])

In [None]:
## reshape X, convert y to categorical, do a train test split
from keras.utils import to_categorical
X = np.array(X)
X = X.reshape(X.shape[0],X.shape[1])
# Fix the one-hot width to the tokenizer's word cap. Without num_classes the width
# is inferred as max(y) + 1, which can be narrower than the network's output layer
# when the highest indices never occur as a target.
y = to_categorical(y, num_classes=tokenizer.num_words)
# Seeded like the earlier split so reruns train and validate on the same samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

In [None]:
## build model
from keras.layers import LSTM
# 3-token context -> 100-d embeddings -> LSTM(128) -> Dense(500) -> softmax over 2000 word ids
# NOTE(review): Dense(500) has no activation, unlike the earlier models — confirm this is intended.
model = Sequential([
    Embedding(2000, output_dim=100, input_length=3),
    LSTM(128),
    Dense(500),
    Dense(2000, activation='softmax'),
])
model.summary()

In [None]:
## compile and fit model
# Categorical cross-entropy matches the one-hot targets and the softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    batch_size=1024,
    epochs=40,
    validation_data=(X_test, y_test),
)

In [None]:
## generate new tweets
seed = "i am very"
tweet = seed
# Keep the rolling 3-word context as token ids instead of re-tokenizing text each step.
# Re-tokenizing breaks once a predicted id has no word (e.g. 0), because the rebuilt
# seed string then splits into fewer than three words.
window = tokenizer.texts_to_sequences([seed])[0]
if len(window) != 3:
    # A seed word the tokenizer drops would give the model a wrong-shaped input.
    raise ValueError("seed must consist of exactly 3 words known to the tokenizer: %r" % seed)
for i in range(1000):
    # verbose=0 stops predict from printing a progress bar on every generated word.
    pred = int(np.argmax(model.predict(np.array([window]), verbose=0)))
    pred_word = tokenizer.sequences_to_texts([[pred]])[0]
    # Stop at the end-of-tweet sentinel, or when the predicted id maps to no word.
    if pred_word == 'endoftweet' or not pred_word:
        break
    window = window[1:] + [pred]
    tweet = tweet + ' ' + pred_word
print(tweet)

## <center> Generative LSTM Example

https://lyric-writer.herokuapp.com/

## <center> Activity

Choose one of the following options: <br> <br>
1) <b>Toxic Comment Classification</b> - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview
    - Build a neural network model that can accurately classify online comments as toxic/non-toxic.
2) <b>Project Gutenberg Book Text Generation</b> - https://www.gutenberg.org/ebooks/search/?sort_order=downloads
    - Choose a book and develop a neural network that can generate realistic text in the same style.