In [1]:
import nltk
import re
import os
import array
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras 
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout,GRU
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from nltk.tokenize import word_tokenize
from pickle import dump
from keras.utils.np_utils import to_categorical

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def tokenize(data):
  tokens = re.sub(r'[,!?;-]+', '.', data)
  tokens = nltk.word_tokenize(tokens)  # tokenize string to words
  tokens = [ ch.lower() for ch in tokens if ch.isalpha() or ch == '.'] #Check if all the characters in the text are letters
  return tokens

def Create_tokens(filename): 
  file = open(filename, 'r')
  text = file.read()
  token = tokenize(text)
  file.close()
  return token

####Note: Train size is too small , for better result provide large data for training

In [3]:
train_token= Create_tokens('train.europarl')
val_token= Create_tokens('dev.europarl')
test_token= Create_tokens('test.europarl')
# train_token= Create_tokens('train.fr')
# val_token= Create_tokens('dev.fr')
# test_token= Create_tokens('test.fr')
total_tokens = []
total_tokens.extend(train_token)
total_tokens.extend(val_token)
total_tokens.extend(test_token)
total_tokens.extend(['unk'])
vocab = sorted(list(set(total_tokens)))

In [4]:
print(total_tokens[:20])
print('Total Tokens: %d' % len(total_tokens))
print('Unique Tokens: %d' % len(vocab))

['resumption', 'of', 'the', 'session', 'i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', 'december', 'and', 'i', 'would']
Total Tokens: 98353
Unique Tokens: 6061


In [5]:
def word2Index(total_tokens,vocab):
  idx = 0
  word2Ind = dict()
  for w in vocab:
    word2Ind[w]=idx
    idx+=1
  return word2Ind

word2Ind = word2Index(total_tokens,vocab)
len(word2Ind)

6061

In [6]:
import pickle
w2I = open('word2Index', 'wb')
pickle.dump(word2Ind,w2I)
w2I.close()

In [7]:
def Seq_of_Tokens(tokens,vocab,word2Ind):
 length = 11
 sequences,X,y=list(),list(),list()
 
 for i in range(length, len(tokens)):
	 seq = tokens[i-length:i] # select sequence of tokens
	 temp_x,temp_y=list(),list()
	 temp_y.append(word2Ind[seq[-1]])
	 
	 for i in range(0,length-1):
		 temp_x.append(word2Ind[seq[i]])
	 line = ' '.join(seq) # convert into a line
	 sequences.append(line) # store
	 X.append(temp_x)
	 y.append(temp_y)
 return sequences,np.array(X),np.array(y) 

In [8]:
train_seq,X_train,y_train = Seq_of_Tokens(train_token,vocab,word2Ind)
val_seq,X_val,y_val = Seq_of_Tokens(val_token,vocab,word2Ind)
test_seq,X_test,y_test = Seq_of_Tokens(test_token,vocab,word2Ind)
seq_length = X_train.shape[1]
vocab_size = len(vocab)

In [9]:
print("############# TRAIN ################\n")
print("train sequences : ",train_seq[:10])
print("train data features : ",X_train[:10])
print("train data labels : ",y_train[:10])
print(X_train.shape,y_train.shape,'\n')

print("############# VAL ################\n")
print("Val sequences : ",val_seq[:10])
print("Val data features : ",X_val[:10])
print("Val data labels : ",y_val[:10])
print(X_val.shape,y_val.shape,'\n')

print("############# TEST ################\n")
print("Test sequences : ",test_seq[:10])
print("Test data features : ",X_test[:10])
print("Test data labels : ",y_test[:10])
print(X_test.shape,y_test.shape,'\n')

############# TRAIN ################

train sequences :  ['resumption of the session i declare resumed the session of the', 'of the session i declare resumed the session of the european', 'the session i declare resumed the session of the european parliament', 'session i declare resumed the session of the european parliament adjourned', 'i declare resumed the session of the european parliament adjourned on', 'declare resumed the session of the european parliament adjourned on friday', 'resumed the session of the european parliament adjourned on friday december', 'the session of the european parliament adjourned on friday december and', 'session of the european parliament adjourned on friday december and i', 'of the european parliament adjourned on friday december and i would']
train data features :  [[4668 3696 5432 4879 2680 1363 4667 5432 4879 3696]
 [3696 5432 4879 2680 1363 4667 5432 4879 3696 5432]
 [5432 4879 2680 1363 4667 5432 4879 3696 5432 1960]
 [4879 2680 1363 4667 5432 4879

In [10]:
batch_size = 1000
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

In [11]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 50)            303050    
                                                                 
 lstm (LSTM)                 (None, 10, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 6061)              612161    
                                                                 
Total params: 1,066,111
Trainable params: 1,066,111
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
Y_val = to_categorical(y_val,num_classes=vocab_size)
def Train_model(model,name,train_dataset,vocab_size,X_val,Y_val):
  for step,(X_batch_train,y_batch_train) in enumerate(train_dataset):
    y_t=to_categorical(y_batch_train,num_classes=vocab_size)
    model.fit(X_batch_train,y_t,epochs=1,verbose=2,validation_data=(X_val, Y_val))
  return model

In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model = Train_model(model,'euro',train_dataset,vocab_size,X_val,Y_val)

32/32 - 11s - loss: 5.7844 - accuracy: 0.0890 - val_loss: 6.7419 - val_accuracy: 0.0878 - 11s/epoch - 334ms/step
32/32 - 6s - loss: 5.9003 - accuracy: 0.0890 - val_loss: 6.9253 - val_accuracy: 0.0878 - 6s/epoch - 172ms/step
32/32 - 5s - loss: 6.0486 - accuracy: 0.1010 - val_loss: 6.7808 - val_accuracy: 0.0878 - 5s/epoch - 171ms/step
32/32 - 5s - loss: 6.4532 - accuracy: 0.0940 - val_loss: 6.6251 - val_accuracy: 0.0878 - 5s/epoch - 170ms/step
32/32 - 5s - loss: 6.2742 - accuracy: 0.0970 - val_loss: 6.5056 - val_accuracy: 0.0878 - 5s/epoch - 172ms/step
32/32 - 5s - loss: 6.1181 - accuracy: 0.0980 - val_loss: 6.4529 - val_accuracy: 0.0878 - 5s/epoch - 169ms/step
32/32 - 6s - loss: 6.1443 - accuracy: 0.0830 - val_loss: 6.4296 - val_accuracy: 0.0878 - 6s/epoch - 175ms/step
32/32 - 6s - loss: 6.1259 - accuracy: 0.0920 - val_loss: 6.3778 - val_accuracy: 0.0878 - 6s/epoch - 175ms/step
32/32 - 6s - loss: 6.2174 - accuracy: 0.0900 - val_loss: 6.3367 - val_accuracy: 0.0878 - 6s/epoch - 176ms/step

In [16]:
model.save('model_en.h5')
# model.save('model_fr.h5')

Loading the model for calcualting probablity on text data

In [17]:
model = keras.models.load_model('model_en.h5')
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 50)            303050    
                                                                 
 lstm (LSTM)                 (None, 10, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 6061)              612161    
                                                                 
Total params: 1,066,111
Trainable params: 1,066,111
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
def calculate_perplexity_of_text(model,X):
  y_prob = []
  for x in X:
    x = x.reshape((1,10))
    y_pred = model.predict(x)
    y_pred = y_pred.flatten()
    y_prob.append(max(y_pred))
  return y_prob

def save_perplex_of_sen(filename,sequences,y_prob):
  total_perplex = 0
  for seq,y_ in zip(sequences,y_prob):
    temp_perplex = (1/y_)**(0.1)
    total_perplex += temp_perplex
    filename.write(seq+"\t"+str(temp_perplex)+"\n")
  filename.write("Average perplexity for text data is : "+str(total_perplex/len(y_prob))+"\n")
  return total_perplex/len(y_prob)

In [19]:
y_prob = calculate_perplexity_of_text(model,X_test)

In [20]:
testfile = open('test-perplexity_en.txt','w+')
avg_perplex = save_perplex_of_sen(testfile,test_seq,y_prob)
print("Average perplexity of test text : ",avg_perplex)

In [21]:
X_train = X_train[:30000]
train_seq = train_seq[:30000]

In [22]:
X_train.shape,X_test.shape,len(train_seq)

((30000, 10), (27428, 10), 30000)

In [23]:
y_prob = calculate_perplexity_of_text(model,X_train)

In [24]:
trainfile = open('train-perplexity_en.txt','w+')
avg_perplex = save_perplex_of_sen(trainfile,train_seq,y_prob)
print("Average perplexity of train text : ",avg_perplex)

Average perplexity of train text : 1.2643583276802925
