# <center> Deep NLP

## <center> Word Embeddings

<center> Capturing <b>semantic meaning</b>

<center><img src="embeddings.png">

In [1]:
import pandas as pd
import json
df = pd.read_json('sarcasm_data.json', lines=True, orient='records')
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.theonion.com/thirtysomething-scien...,thirtysomething scientists unveil doomsday clo...,1
1,https://www.huffingtonpost.com/entry/donna-edw...,dem rep. totally nails why congress is falling...,0
2,https://www.huffingtonpost.com/entry/eat-your-...,eat your veggies: 9 deliciously different recipes,0
3,https://local.theonion.com/inclement-weather-p...,inclement weather prevents liar from getting t...,1
4,https://www.theonion.com/mother-comes-pretty-c...,mother comes pretty close to using word 'strea...,1


In [2]:
corpus = df['headline'].values
sentiments = df['is_sarcastic'].values

In [3]:
corpus[12]

'what to know regarding current treatments for ebola'

In [4]:
## create tokenizer, fit to corpus
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
tokenizer.num_words = 1000

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
tokenizer.word_index

{'camera': 1631,
 'unfit': 7783,
 'clearer': 11560,
 'severance': 15709,
 'nothing': 568,
 'affordable': 2957,
 'pumping': 8115,
 "2015's": 25442,
 "supermodel's": 27869,
 "toaster's": 30258,
 'tiles': 27609,
 "god'": 14512,
 "rubio's": 5090,
 'adams': 12663,
 "weekly'": 14984,
 'bay': 4690,
 "'ed": 21934,
 'overprescribe': 24713,
 'solutions': 4267,
 'inclusion': 6279,
 'stun': 9482,
 'nidetch': 20706,
 '99': 3310,
 'california': 482,
 'bane': 23224,
 "flynn's": 14590,
 'kennel': 27199,
 'furiously': 28077,
 'coloradans': 28172,
 'silences': 13622,
 'gifford': 13405,
 'feathers': 25197,
 "'chilling'": 26260,
 "biden's": 3822,
 'enthusiasm': 16466,
 'habits': 2585,
 'mangles': 30704,
 'ignored': 4820,
 'brine': 25252,
 'ambulances': 19885,
 'challenging': 7961,
 'scientologist': 21969,
 'clarinet': 11825,
 'serta': 28491,
 'jowl': 29865,
 'grubhub': 25304,
 'merge': 6098,
 'claritin': 14803,
 'mika': 20503,
 'morally': 10262,
 "lipped'": 24482,
 'semiotism': 16433,
 'believable': 9469,

In [6]:
## keras tokenizer skips index 0 so we increase vocab length by 1
vocab_length = 1000 + 1

In [9]:
## encode corpus
encoded_corpus = tokenizer.texts_to_sequences(corpus)
encoded_corpus

[[354, 2, 660],
 [757, 46, 238, 10, 7],
 [862, 32, 261, 646],
 [15, 142, 1, 151],
 [471, 494, 321, 921, 1, 571, 549],
 [82, 72],
 [83, 279, 1, 32, 9, 572],
 [776, 24, 226, 24, 2],
 [291, 142, 138, 1, 424, 4, 355],
 [2, 159, 114, 20],
 [20, 10, 6, 425, 4, 8, 68, 42, 14, 25],
 [647, 59, 535, 200, 10, 625],
 [34, 1, 114, 5],
 [573, 952, 148, 84, 75, 1, 5, 2, 2],
 [11, 25, 7],
 [378, 79, 15, 12],
 [49, 432, 8, 2],
 [49, 13, 322, 99, 2, 30, 17],
 [536, 58, 410, 9, 342],
 [97, 57, 1, 19, 1, 415, 32, 483, 258],
 [395, 4, 883, 24, 2, 816],
 [4, 127, 262, 230, 1, 309, 2],
 [27, 1, 239, 1, 23],
 [550, 107, 4, 252, 187, 178, 14, 19],
 [105, 84, 79, 369, 484, 506],
 [385, 472, 28, 107, 385, 16, 102, 661, 542],
 [20, 11, 3],
 [140, 758, 4, 3, 56, 455, 2, 328, 6, 199],
 [4, 6, 292, 157, 100, 262],
 [551, 26, 275, 104, 1, 329, 21, 2, 3, 8, 133, 26],
 [121, 64, 742, 15, 6],
 [9, 537],
 [115, 70, 507, 14, 485],
 [977, 17, 456, 512, 309, 7, 309, 47],
 [11, 280, 293, 863, 5],
 [4, 10],
 [71, 125, 2, 3, 6

In [10]:
## find longest review
review_length = max([len(review) for review in encoded_corpus])
review_length

62

In [11]:
## pad all reviews to longest length
from keras.preprocessing.sequence import pad_sequences
padded_corpus = pad_sequences(encoded_corpus, review_length, padding='post')
padded_corpus

array([[354,   2, 660, ...,   0,   0,   0],
       [757,  46, 238, ...,   0,   0,   0],
       [862,  32, 261, ...,   0,   0,   0],
       ...,
       [  3,  99, 628, ...,   0,   0,   0],
       [ 17, 837,   0, ...,   0,   0,   0],
       [216,  20,  28, ...,   0,   0,   0]])

In [12]:
## train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_corpus, sentiments, test_size=0.2, random_state=4)

In [13]:
## build model with Embedding layer
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding

model = Sequential()
model.add(Embedding(vocab_length, 
                    50,
                    input_length=review_length))
model.add(Flatten())
model.add(Dense(500,activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 62, 50)            50050     
_________________________________________________________________
flatten_1 (Flatten)          (None, 3100)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               1550500   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 501       
Total params: 1,601,051
Trainable params: 1,601,051
Non-trainable params: 0
_________________________________________________________________


In [15]:
## compile and fit
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, batch_size=512, epochs=10, verbose=1, validation_data=(X_test, y_test))

Train on 22895 samples, validate on 5724 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20ad0545fd0>

## <center> Using Pretrained Word Embeddings 

### <center> GloVe </center>
<center> <a href="https://nlp.stanford.edu/projects/glove/">https://nlp.stanford.edu/projects/glove/</a>

In [16]:
import numpy as np
def load_glove_embeddings(glove_file):
    print("Loading Glove Model")
    f = open(glove_file,'r',encoding='utf-8')
    embeddings_dictionary = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        embeddings_dictionary[word] = embedding
    print("Done.",len(embeddings_dictionary)," words loaded!")
    return embeddings_dictionary

In [17]:
glove_file = 'glove.6B.50d.txt'
embeddings_dictionary = load_glove_embeddings(glove_file)

Loading Glove Model
Done. 400000  words loaded!


In [18]:
import numpy as np
embedding_matrix = np.zeros((vocab_length, 50))
for word, index in tokenizer.word_index.items():
    if index>=vocab_length:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [19]:
embedding_matrix

array([[ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.68047  , -0.039263 ,  0.30186  , ..., -0.073297 , -0.064699 ,
        -0.26044  ],
       [ 0.70853  ,  0.57088  , -0.4716   , ..., -0.22562  , -0.093918 ,
        -0.80375  ],
       ...,
       [ 0.19854  ,  0.23367  , -0.13229  , ...,  1.2378   , -0.0029151,
         0.27317  ],
       [ 0.30902  ,  0.29311  , -0.27832  , ..., -0.49816  , -0.044452 ,
         1.2749   ],
       [ 0.37214  , -0.31333  , -0.25042  , ...,  0.29813  , -0.076025 ,
         0.80216  ]])

In [20]:
from keras.initializers import Constant
model = Sequential()
model.add(Embedding(vocab_length, 
                            50, 
                            weights=[embedding_matrix], 
                            input_length=review_length,
                            trainable=False))
model.add(Flatten())
model.add(Dense(500, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [21]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 62, 50)            50050     
_________________________________________________________________
flatten_2 (Flatten)          (None, 3100)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 500)               1550500   
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 501       
Total params: 1,601,051
Trainable params: 1,551,001
Non-trainable params: 50,050
_________________________________________________________________


In [22]:
## compile and fit
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, batch_size=512, epochs=10, verbose=1, validation_data=(X_test, y_test))

Train on 22895 samples, validate on 5724 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20d28c83ba8>

## <center> Long Short Term Memory Networks (LSTM)

https://www.youtube.com/watch?v=8HyCNIVRbSU

In [24]:
## import the dataset
import pandas as pd
df = pd.read_csv('trump_tweets.csv')
df.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [192]:
## remove URLs from tweets
import re
num_tweets = 5000
corpus = [re.sub('http[s]?://\S+', '', tweet).lower() + ' endoftweet' for tweet in df['Tweet_Text'][0:num_tweets].values]

In [193]:
corpus

['today we express our deepest gratitude to all those who have served in our armed forces. #thankavet  endoftweet',
 'busy day planned in new york. will soon be making some very important decisions on the people who will be running our government! endoftweet',
 'love the fact that the small groups of protesters last night have passion for our great country. we will all come together and be proud! endoftweet',
 'just had a very open and successful presidential election. now professional protesters, incited by the media, are protesting. very unfair! endoftweet',
 'a fantastic day in d.c. met with president obama for first time. really good meeting, great chemistry. melania liked mrs. o a lot! endoftweet',
 'happy 241st birthday to the u.s. marine corps! thank you for your service!!  endoftweet',
 'such a beautiful and important evening! the forgotten man and woman will never be forgotten again. we will all come together as never before endoftweet',
 'watching the returns at 9:45pm.\n#ele

In [236]:
## create tokenizer object and fit it to corpus, check the total number of unique words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
tokenizer.num_words = 2000
num_words = len(tokenizer.word_index)
num_words

8104

In [237]:
## encode the corpus
encoded_corpus = tokenizer.texts_to_sequences(corpus)
encoded_corpus

[[87, 20, 45, 3, 34, 450, 54, 29, 1624, 5, 45, 1864, 1],
 [153,
  1182,
  5,
  42,
  245,
  12,
  167,
  15,
  353,
  471,
  52,
  660,
  1445,
  11,
  2,
  39,
  54,
  12,
  15,
  451,
  45,
  688,
  1],
 [131,
  2,
  452,
  19,
  2,
  875,
  997,
  10,
  935,
  91,
  97,
  29,
  13,
  45,
  16,
  96,
  20,
  12,
  34,
  367,
  207,
  6,
  15,
  336,
  1],
 [36,
  132,
  4,
  52,
  584,
  6,
  1076,
  230,
  264,
  49,
  1077,
  935,
  41,
  2,
  98,
  28,
  52,
  721,
  1],
 [4,
  337,
  153,
  5,
  436,
  265,
  1078,
  26,
  71,
  129,
  13,
  208,
  76,
  149,
  120,
  391,
  16,
  499,
  1625,
  1865,
  768,
  4,
  522,
  1],
 [368, 998, 3, 2, 111, 114, 14, 7, 13, 55, 1079, 1],
 [324,
  4,
  585,
  6,
  660,
  299,
  2,
  1446,
  306,
  6,
  563,
  12,
  94,
  15,
  1446,
  53,
  20,
  12,
  34,
  367,
  207,
  61,
  94,
  406,
  1],
 [257, 2, 1183, 21, 348, 118, 1],
 [51,
  453,
  324,
  4,
  1626,
  3,
  65,
  13,
  33,
  936,
  13,
  71,
  10,
  2,
  454,
  330,
  48,
  55,
  

In [238]:
## create X where each data point is the previous 3 words
## create y where each data point is the following word
X = []
y = []
for tweet in encoded_corpus:
    for index in range(len(tweet)):
        if index>2:
            X.append(tweet[index-3:index])
            y.append(tweet[index])

In [239]:
## reshape X, convert y to categorical, do a train test split
from keras.utils import to_categorical
X = np.array(X)
X = X.reshape(X.shape[0],X.shape[1])
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1)

In [240]:
## build model 
from keras.layers import LSTM
model = Sequential()
model.add(Embedding(2000, output_dim=100, input_length=3))
model.add(LSTM(128))
model.add(Dense(500))
model.add(Dense(2000, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 3, 100)            200000    
_________________________________________________________________
lstm_15 (LSTM)               (None, 128)               117248    
_________________________________________________________________
dense_33 (Dense)             (None, 500)               64500     
_________________________________________________________________
dense_34 (Dense)             (None, 2000)              1002000   
Total params: 1,383,748
Trainable params: 1,383,748
Non-trainable params: 0
_________________________________________________________________


In [241]:
## compile and fit model 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=1024, epochs=100, validation_data=(X_test, y_test))

Train on 58577 samples, validate on 6509 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100

KeyboardInterrupt: 

In [256]:
## generate new tweets
tweet = ""
seed = "i am very"
tweet = tweet + seed
for i in range(1000):
    pred = np.argmax(model.predict(np.array(tokenizer.texts_to_sequences([seed]))))
    pred_word = tokenizer.sequences_to_texts([[pred]])[0]
    if pred_word == 'endoftweet':
        break
    seed_list = seed.split()
    seed_list[0] = seed_list[1]
    seed_list[1] = seed_list[2]
    seed_list[2] = pred_word
    seed = ' '.join(seed_list)
    tweet = tweet + ' ' + pred_word
print(tweet)

i am very proud of you


## <center> Activity

Choose one of the following options: <br> <br>
1) <b>Toxic Comment Classification</b> - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview
    - Build a neural network model that can accurately classify online comments as toxic/non-toxic.
2) <b>Project Gutenberg Book Text Generation</b> - https://www.gutenberg.org/ebooks/search/?sort_order=downloads
    - Choose a book and develop a neural network that can generate realistic text in the same style.