<a href="https://colab.research.google.com/github/monilchheda/manning-live-project-building-domain-specific-language-models/blob/master/week4_DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# data = pd.read_csv('https://alexip-ml.s3.amazonaws.com/stackexchange_812k.csv.gz', compression='gzip')
df = pd.read_csv('https://alexip-ml.s3.amazonaws.com/stackexchange_812k.tokenized.csv.gz', compression='gzip').sample(frac = 1, random_state = 8).reset_index(drop = True)

In [0]:
corpus = df[df.category.isin(['title'])].copy()
corpus = corpus[(corpus.n_tokens > 10) & (corpus.n_tokens < 500)].reset_index(drop = True).copy()
corpus.head()
# Need to sample corpus or training takes forever
red_corpus = corpus.sample(10000)

In [3]:
# Fix Tensorflow
%tensorflow_version 2.x
import tensorflow
print(tensorflow.__version__)

2.2.0


In [4]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
texts = red_corpus.tokens

# Update internal vocabulary based on a list of texts
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1
print("vocabulary size: %d" %vocab_size)

Using TensorFlow backend.


vocabulary size: 6890


In [5]:
input_sequences = []
for line in texts:
    encoded_tokens = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(encoded_tokens)):
        n_gram_sequence = encoded_tokens[:i+1]
        input_sequences.append(n_gram_sequence)

print('Total Sequences: %d' % len(input_sequences))

Total Sequences: 122904


In [6]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
print('Max Sequence Length: %d' % max_sequence_len)

Max Sequence Length: 33


In [0]:
import keras.utils as ku 

# create predictors and label (X(input) and y(output))
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = ku.to_categorical(label, num_classes=vocab_size)

In [8]:
from tensorflow.python.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.python.keras.models import Sequential

model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x=predictors, y=label, batch_size=100, epochs=1, verbose=1, use_multiprocessing=True)
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 10)            68900     
_________________________________________________________________
lstm (LSTM)                  (None, 32, 150)           96600     
_________________________________________________________________
dropout (Dropout)            (None, 32, 150)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               100400    
_________________________________________________________________
dense (Dense)                (None, 6890)              695890    
Total params: 961,790
Trainable params: 961,790
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [10]:
texts.sample(10)

10148    how do i calculate one - way lift and two - wa...
16671    what is the best criterion for performance eva...
28328    questions with fitting a dose response curve u...
24884    how to predict a categorical variable with ano...
4199     what can you do when you have predictor variab...
17314    what is the correct test to use for a categori...
27861    please check my steps and reasoning for gettin...
1961     are there other ways to represent time series ...
19837    if random forests gives me a bad cross - valid...
3688     can a t - test p - value be . when the ci for ...
Name: tokens, dtype: object

In [11]:
generate_text("summarize multiple distributions", 10, max_sequence_len, model)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'summarize multiple distributions of the the the the the the the the the'

In [14]:
subset_text = corpus.sample(10000).text.to_list()
print (subset_text)

