# Starting into NLP 

## Importing stuff

In [10]:
%matplotlib inline

from utils import *
model_path = '../data/aclImdb/models'
%mkdir -p $model_path

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [12]:
from keras.datasets import imdb
# Mapping from word to integer index for the IMDB dataset; the JSON file is
# downloaded the first time this is called.
idx=imdb.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json

## Setting up data

In [17]:
# Vocabulary words ordered by ascending index; preview the first ten.
idx_arr = [word for word, _ in sorted(idx.items(), key=lambda pair: pair[1])]
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [160]:
# Reverse lookup: integer index -> word.
idx2word = dict(zip(idx.values(), idx.keys()))

In [21]:
# Fetch the pickled IMDB reviews via get_file and unpack the train/test
# splits (token-id sequences plus their labels).
path = get_file('imdb_full.pkl','https://s3.amazonaws.com/text-datasets/imdb_full.pkl')

# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that come from a trusted source.
# The context manager closes the file handle once the data has been read.
with open(path,'rb') as f:
    (x_train,labels_train),(x_test,labels_test)=pickle.load(f)

In [22]:
len(x_train)

25000

In [41]:
' '.join([idx2word[o] for o in x_train[-1]])

"this is one of the dumbest films i've ever seen it rips off nearly ever type of thriller and manages to make a mess of them all br br there's not a single good line or character in the whole mess if there was a plot it was an afterthought and as far as acting goes there's nothing good to say so ill say nothing i honestly cant understand how this type of nonsense gets produced and actually released does somebody somewhere not at some stage think 'oh my god this really is a load of shite' and call it a day its crap like this that has people downloading illegally the trailer looks like a completely different film at least if you have download it you haven't wasted your time or money don't waste your time this is painful"

In [40]:
labels_train[-1]

0

In [111]:
# Reduce the vocabulary: every token id at or above vocab_size-1 is replaced
# by the single id vocab_size-1.
vocab_size = 5000
rare_id = vocab_size - 1

trn, test = (
    [np.array([tok if tok < rare_id else rare_id for tok in review]) for review in split]
    for split in (x_train, x_test)
)

In [112]:
' '.join([idx2word[o] for o in trn[-1]])

"this is one of the bergman films i've ever seen it bergman off nearly ever type of thriller and manages to make a mess of them all br br there's not a single good line or character in the whole mess if there was a plot it was an bergman and as far as acting goes there's nothing good to say so ill say nothing i honestly cant understand how this type of nonsense gets produced and actually released does somebody somewhere not at some stage think bergman my god this really is a load of bergman and call it a day its crap like this that has people bergman bergman the trailer looks like a completely different film at least if you have bergman it you haven't wasted your time or money don't waste your time this is painful"

In [72]:
len(trn)

25000

In [75]:
# Number of tokens in each training review.
lens = [len(review) for review in trn]

In [78]:
(max(lens),min(lens),np.mean(lens))

(2493, 10, 237.71364)

In [172]:
# Bring every review to exactly seq_len tokens (0 is the padding value) so the
# data becomes a rectangular array.
seq_len = 500

trn, test = (
    sequence.pad_sequences(split, maxlen=seq_len, value=0)
    for split in (trn, test)
)

In [173]:
trn.shape

(25000, 500)

## Creating simple models

In [81]:
trn.shape

(25000, 500)

In [84]:
# Training labels as a NumPy array (shape (25000,), one label per review).
labels_trn=np.asarray(labels_train)

In [85]:
labels_trn.shape

(25000,)

In [100]:
# Baseline network: a learned 32-dimensional embedding per token, flattened
# and passed through one hidden dense layer with heavy dropout.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=seq_len))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
# Single sigmoid unit: the labels are binary.
model.add(Dense(1, activation='sigmoid'))

In [101]:
model.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [102]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [103]:
model.fit(trn,labels_trn,validation_data=(test,labels_test),epochs=2,batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x13c2d2240>

## 1D-CNN model

In [107]:
# 1D-CNN: embedding -> dropout -> one width-5 convolution -> max-pool ->
# dense classifier.
conv_model = Sequential()
conv_model.add(Embedding(vocab_size, 32, input_length=seq_len))
conv_model.add(keras.layers.SpatialDropout1D(0.2))
conv_model.add(Dropout(0.2))
# 64 filters of width 5; 'same' padding keeps the sequence length unchanged.
conv_model.add(Conv1D(64, 5, padding='same', activation='relu'))
conv_model.add(Dropout(0.2))
conv_model.add(MaxPooling1D())
conv_model.add(Flatten())
conv_model.add(Dense(100, activation='relu'))
conv_model.add(Dropout(0.7))
conv_model.add(Dense(1, activation='sigmoid'))

In [108]:
conv_model.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [110]:
conv_model.fit(trn,labels_trn,validation_data=(test,labels_test),epochs=4,batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1296ad320>

## Pre-Trained Vectors

In [117]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    Args:
        dataset: Name of the GloVe variant, e.g. '6B.50d'. Names listed in
            ``md5sums`` below are checksum-verified; any other name is
            fetched without an md5 check.

    Returns:
        Whatever ``get_file`` returns for the downloaded, untarred archive.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('../data/glove/results')
    # Pure-Python replacement for the `%mkdir -p` line magic: works outside
    # IPython and on every platform, and is a no-op if the directory exists.
    os.makedirs(glove_path, exist_ok=True)
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)
    

In [146]:
def load_vectors(loc):
    """Load a GloVe vector set stored under the path prefix ``loc``.

    Args:
        loc: Path prefix; the files ``loc + '.dat'``, ``loc + '_words.pkl'``
            and ``loc + '_idx.pkl'`` are read.

    Returns:
        A 3-tuple ``(vectors, words, word_index)``: the result of
        ``load_array`` on the ``.dat`` file and the two unpickled objects.
    """
    # NOTE(security): pickle.load can execute arbitrary code -- only use on
    # files from a trusted source.
    # Context managers make sure both pickle files are closed after reading.
    with open(loc+'_words.pkl','rb') as words_file:
        words = pickle.load(words_file,encoding='utf-8')
    with open(loc+'_idx.pkl','rb') as idx_file:
        word_index = pickle.load(idx_file,encoding='utf-8')
    return (load_array(loc+'.dat'), words, word_index)

In [147]:
# Load the 50-dimensional GloVe set: the vector matrix, the word list, and the
# word -> row-index mapping.
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

In [149]:
vecs.shape

(400000, 50)

In [151]:
words[0]

'the'

In [152]:
vecs[0]

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01, -4.4457e-02, -4.9688e-01,
       -1.7862e-01, -6.6023e-04, -6.5660e-01,  2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01,
       -9.5095e-03,  1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01, -1.6801e-02,
       -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01, -1.8823e+00, -7.6746e-01,  9.9051e-02,
       -4.2125e-01, -1.9526e-01,  4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02, -2.9871e-01, -1.5749e-01,
       -3.4758e-01, -4.5637e-02, -4.4251e-01,  1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01,
       -7.8581e-01], dtype=float32)

In [159]:
# Reverse lookup for the GloVe vocabulary: row index -> word.
idx2word_glove = dict(zip(wordidx.values(), wordidx.keys()))

In [162]:
# IPython introspection (shows the source of `normal`); development leftover,
# safe to delete before sharing the notebook.
??normal

In [164]:
def create_emb():
    """Build the initial embedding matrix for the IMDB vocabulary from GloVe.

    Reads the notebook-level names ``vecs``, ``wordidx``, ``idx2word`` and
    ``vocab_size``.

    Returns:
        Array of shape ``(vocab_size, vecs.shape[1])``. Row 0 stays all-zero.
        Rows whose word matches the alphanumeric pattern and exists in
        ``wordidx`` get that word's GloVe vector; every other row, and the
        last row (the rare-word id), is drawn from ``normal(scale=0.6)``.
        The whole matrix is divided by 3 before it is returned.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = idx2word[i]
        src_idx = None
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            # .get() instead of [] so a word missing from GloVe falls through
            # to random initialisation rather than raising KeyError.
            src_idx = wordidx.get(word)
        # Compare against None explicitly: row index 0 is a valid GloVe row.
        if src_idx is not None:
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [165]:
emb=create_emb()

In [181]:
emb[1]

array([ 1.3933e-01,  8.3227e-02, -1.3747e-01,  4.0567e-02,  1.1509e-01, -1.4819e-02, -1.6563e-01,
       -5.9540e-02, -2.2008e-04, -2.1887e-01,  9.2810e-02, -4.9223e-02, -1.8559e-01,  4.8860e-02,
       -3.1698e-03,  3.8860e-03,  3.4013e-02, -4.2640e-02, -2.8143e-01, -4.0603e-02, -5.6003e-03,
       -1.1093e-01, -5.1733e-02, -7.7103e-02, -6.3937e-02, -6.2743e-01, -2.5582e-01,  3.3017e-02,
       -1.4042e-01, -6.5087e-02,  1.3357e+00, -6.1980e-02, -1.7429e-01, -1.0560e-01,  1.9738e-04,
        2.4816e-03,  5.9260e-02, -5.2990e-02,  4.0137e-03, -1.8074e-02, -9.9570e-02, -5.2497e-02,
       -1.1586e-01, -1.5212e-02, -1.4750e-01,  6.2617e-02,  9.2830e-04, -6.1370e-02, -3.8380e-02,
       -2.6194e-01])

In [166]:
# Same CNN as before, but the embedding layer starts from the GloVe-derived
# matrix `emb` (50 dimensions) and is frozen for the first training phase.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_len, weights=[emb], trainable=False))
model.add(keras.layers.SpatialDropout1D(0.2))
model.add(Dropout(0.25))
model.add(Conv1D(64, 5, padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [167]:
model.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [168]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 500, 50)           0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 500, 50)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 500, 64)           16064     
_________________________________________________________________
dropout_13 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 16000)             0         
__________

In [189]:
model.fit(trn,labels_train,validation_data=(test,labels_test),epochs=2,batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x15c4885f8>

In [190]:
model.layers[0].trainable=True

In [191]:
# Fine-tuning phase: the embedding layer was just unfrozen. Keras only picks
# up a change to `trainable` when the model is compiled again, so recompile
# here, with a lower learning rate (1e-4) for the fine-tuning run.
# (The original line referenced `model.optimizertimizer`, which does not
# exist and raises AttributeError.)
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-4),metrics=['accuracy'])

In [192]:
model.fit(trn,labels_train,validation_data=(test,labels_test),epochs=1,batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x15c453780>

## Multi-size *CNN*

In [194]:
from keras.layers import concatenate

In [197]:
# Multi-size convolution block: three parallel Conv1D branches (filter widths
# 3, 4 and 5) over the same embedded sequence. Each branch is max-pooled and
# flattened, then the three are concatenated into one feature vector.
# The block consumes the embedding output, i.e. (seq_len, 50) per sample; the
# sequence dimension is seq_len, not vocab_size.
graph_in = Input((seq_len, 50))
convs = []
for fsz in range(3, 6):
    x = Conv1D(64, fsz, padding='same', activation='relu')(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
out = concatenate(convs)
graph = Model(graph_in, out)

In [198]:
emb=create_emb()

In [199]:
# Multi-size CNN classifier: GloVe-initialised embedding -> the parallel
# convolution block `graph` -> dense classifier.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_len, weights=[emb]))
model.add(Dropout(0.2))
model.add(graph)
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [200]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [201]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)



Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x16a261978>

In [202]:
model.layers[0].trainable=False

In [203]:
# The embedding layer was just frozen. Keras only picks up a change to
# `trainable` when the model is compiled again, so recompile here, with the
# lower learning rate (1e-5) for the remaining epochs.
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-5), metrics=['accuracy'])

In [204]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x15b10d898>

## LSTM 

In [206]:
# LSTM classifier: a 32-d embedding (index 0 masked, light L2 penalty on the
# embedding weights) feeding a single 100-unit LSTM and a sigmoid output.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=seq_len, mask_zero=True,
                    embeddings_regularizer=l2(1e-6)))
model.add(LSTM(100, implementation=2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [207]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 500, 32)           160000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [208]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=5, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x14654bcf8>