<a href="https://colab.research.google.com/github/mhuckvale/pals0039/blob/master/Demo_LM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language modelling demos

## Use NLTK language modelling functions to build and test an n-gram language model

In [0]:
# install the pinned NLTK release (3.4.5) before nltk is imported below
!pip install nltk==3.4.5

In [0]:
import nltk
# fetch the Reuters corpus that supplies the text for every model below
nltk.download('reuters')
from nltk.corpus import reuters
# fetch the 'punkt' resource (presumably needed by reuters.sents() for sentence splitting -- TODO confirm)
nltk.download('punkt')
from nltk.lm import Vocabulary
from nltk.util import pad_sequence
from nltk.lm.preprocessing import flatten

# lower-case every Reuters sentence and wrap it in <s> ... </s> boundary markers
text=[]
for sent in reuters.sents():
  lowered=(tok.lower() for tok in sent)
  padded=pad_sequence(lowered,pad_left=True,left_pad_symbol="<s>",pad_right=True,right_pad_symbol="</s>",n=2)
  text.append(list(padded))

# show the first few padded sentences
for sent in text[:5]:
  print(sent)

# merge the sentences into one long token stream
text=list(flatten(text))
print('Text length',len(text))

# build the vocabulary from the token stream, with an unknown-word cutoff of 10
vocab=Vocabulary(text, unk_cutoff=10)
print('Vocab size',len(vocab))

# map every token through the vocabulary (out-of-vocabulary words become the unknown label)
text=vocab.lookup(text)

print(text[:25])



In [0]:
# collect and display ngram counts
from nltk.util import everygrams
from nltk.lm import NgramCounter

# every 1-, 2- and 3-gram of the token stream, counted as one long "sentence"
allgrams=list(everygrams(text,min_len=1,max_len=3))
print(len(text),len(allgrams))
ngram_counts = NgramCounter([allgrams])

# display some continuations: the ten most frequent words following each context
for context in (('imports',),('imports','of')):
  continuations=ngram_counts[context].items()
  print(sorted(continuations,key = lambda pair: pair[1],reverse=True)[:10])


In [0]:
from nltk.lm.models import Laplace
from nltk.util import ngrams

# Train add-one smoothed models of order 1, 2 and 3.
# The first 10000 tokens are held out for testing; the rest is training data.
for order in (1,2,3):
  train=list(ngrams(text[10000:],order))
  test=list(ngrams(text[:10000],order))
  print(train[:10])
  # Give the model the existing Vocabulary directly. fit()'s second argument is
  # meant to be raw text from which a *new* vocabulary is counted; passing a
  # Vocabulary there rebuilds one from its iteration, which already contains the
  # unknown label, so that label appears to be counted twice in the smoothing denominator.
  lm = Laplace(order,vocabulary=vocab)
  lm.fit([train])
  # perplexity on an equally sized slice of training data, for comparison with the test set
  print("Train perplexity",lm.perplexity(train[:10000]))
  print("Test perplexity",lm.perplexity(test))


## Neural language model

In [0]:
import numpy as np

%tensorflow_version 2.x
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, Flatten, LSTM, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [0]:
# build a vocabulary: each distinct word gets the next free integer id,
# in order of first appearance
lexicon={}
for word in text:
  lexicon.setdefault(word,len(lexicon))

# encode text as numbers
ntext=[lexicon[word] for word in text]
print(text[:25])
print(ntext[:25])

In [0]:
def prepare_sequences(text,seqlen):
  """Cut a token-id sequence into fixed-length input/target rows.

  text   -- sequence of numeric token ids
  seqlen -- number of tokens per row

  Returns (feats, labels), two float arrays of shape (nseq, seqlen) where
  nseq = (len(text)-1)//seqlen. Row r of feats holds tokens
  r*seqlen .. r*seqlen+seqlen-1, and row r of labels holds the same window
  shifted one token to the right, i.e. the next token for every input position.
  """
  n_rows=(len(text)-1)//seqlen
  shape=(n_rows,seqlen)
  inputs=np.zeros(shape)
  targets=np.zeros(shape)
  for row,start in enumerate(range(0,n_rows*seqlen,seqlen)):
    stop=start+seqlen
    inputs[row,:]=text[start:stop]        # the window itself
    targets[row,:]=text[start+1:stop+1]   # the window advanced by one token
  return inputs,targets

seqlen=100

# the first 10000 tokens form the test set, everything after them the training set
Xtest,ytest = prepare_sequences(ntext[:10000],seqlen)
Xtrain,ytrain = prepare_sequences(ntext[10000:],seqlen)

# report array shapes: training first, then test
for feats,labels in ((Xtrain,ytrain),(Xtest,ytest)):
  print(feats.shape,labels.shape)



In [0]:
import tensorflow as tf
def perplexity(y_true, y_pred):
    """Metric: exponential of the mean sparse categorical cross-entropy.

    y_true -- integer class labels
    y_pred -- predicted class probabilities
    """
    token_losses = tf.losses.sparse_categorical_crossentropy(y_true, y_pred)
    return tf.exp(tf.reduce_mean(token_losses))


# input and output both range over the whole lexicon
isize=len(lexicon)
osize=len(lexicon)

# embedding -> two stacked LSTMs -> per-time-step softmax over the lexicon
model = Sequential()
model.add(Embedding(input_dim=isize, output_dim=64,input_length=seqlen))
model.add(LSTM(32,return_sequences=True,activation='tanh'))
model.add(LSTM(32,return_sequences=True,activation='tanh'))
model.add(TimeDistributed(Dense(osize, activation='softmax')))
#
# compile the network
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=[perplexity])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

In [0]:
# train the model, evaluating on the held-out test sequences after every epoch
history=model.fit(
  x=Xtrain,
  y=ytrain,
  batch_size=64,
  epochs=40,
  validation_data=(Xtest,ytest),
)

In [0]:
# generate from trained model

# build a reverse dictionary: integer id -> word
reverse_lexicon={index:word for word,index in lexicon.items()}

# sample from a probability distribution
def sampledist(dist):
  """Draw a random index i with probability proportional to dist[i].

  dist -- 1-D sequence or array of non-negative weights

  Returns a Python int index into dist. The random threshold is scaled by the
  actual total of dist, so a distribution that sums to slightly less than 1
  (e.g. through float32 rounding) can no longer fall through to index 0.
  An empty or all-zero dist returns 0.
  """
  cumulative=np.cumsum(dist,dtype=np.float64)
  if cumulative.size==0 or cumulative[-1]<=0:
    return 0
  thresh=np.random.random()*cumulative[-1]
  # first index whose running total exceeds the threshold
  index=int(np.searchsorted(cumulative,thresh,side='right'))
  # guard against landing one past the end through rounding
  return min(index,cumulative.size-1)

# pick a starting seed
inptext="<s> it is a fact that"

# encode as numbers (words outside the vocabulary map to its unknown label first)
pattern=[lexicon[vocab.lookup(w)] for w in inptext.split(' ')]
pat_len=len(pattern)
pattern=pad_sequences([pattern],maxlen=seqlen,padding='post')

# generate words
# The training labels are the inputs advanced by one token, so the model's
# output at position t is its distribution for the word at position t+1.
# The word for slot pat_len therefore comes from output pat_len-1, which has
# seen only real words; output pat_len would be conditioned on the padding.
for i in range(seqlen-pat_len):
  prediction = model.predict(pattern, verbose=0)[0]
  pattern[0][pat_len]=sampledist(prediction[pat_len-1,:])
  pat_len += 1

# decode the generated ids back into words
sentence=[reverse_lexicon[idx] for idx in pattern[0][:pat_len]]
print(' '.join(sentence))
