In [None]:
import pickle
import random
import re
import sys
import urllib
import urllib.request

import keras.utils as ku 
import nltk
import numpy as np
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('words')

# Pin the state of Python's global `random` generator so the sentence shuffle
# that precedes the train/validation/test split is repeatable on a fresh run.
SEED = 42
random.seed(SEED)

In [None]:
data_path = "https://raw.githubusercontent.com/nee2shaji/IIIT_Sem4/main/brown.txt"

MIN_WORD_FREQ = 20     # a word is kept only if it occurs MORE than this many times
TRAIN_FRACTION = 0.5   # sentences [0%, 50%)  -> train_set
VALIDATION_END = 0.8   # sentences [50%, 80%) -> validation_set, the rest -> test_set

# Ordered (pattern, replacement) rewrites applied to every lower-cased sentence.
# Order matters: contractions are expanded before the remaining punctuation is
# blanked out, and whitespace is collapsed last.
_CLEANUP_RULES = [
    (r'[0-9_]+', ''),
    (r'n\'t', 'nt'),
    (r'\'ll', ' will'),
    (r'\'ve', ' have'),
    (r'\'re', ' are'),
    (r'\bi\'m\b', 'i am'),
    (r'\'s', 's'),
    (r'[^\w\s]', ' '),
    (r'[\s]+', ' '),
]


def _clean_sentence(sentence):
    """Return `sentence` lower-cased with contractions expanded and punctuation removed.

    Lower-casing happens first so that case-sensitive rules such as the
    "i'm" expansion also match capitalised input ("I'm").
    """
    sentence = sentence.lower()
    for pattern, replacement in _CLEANUP_RULES:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence.strip()


# Download the corpus; the context manager closes the connection once the body is read.
with urllib.request.urlopen(data_path) as response:
    browntext = response.read().decode("utf-8")

# Line breaks become a space (not an empty string) so the last word of one line
# is not glued to the first word of the next; extra spaces are collapsed later.
browntext = re.sub(r'[\r\n]+', ' ', browntext)
browntext = [_clean_sentence(sentence) for sentence in browntext.split('.')]

# Check if in english dict
words = set(nltk.corpus.words.words())
browntext = [" ".join(w for w in sentence.split() if w in words) for sentence in browntext]

# Drop rare words; frequencies are counted over the dictionary-filtered corpus.
dummy = (' ').join(browntext)
req = nltk.FreqDist(dummy.split())
browntext = [
    " ".join(w for w in sentence.split() if req[w] > MIN_WORD_FREQ)
    for sentence in browntext
]

# Sentences that lost every word carry no training signal and would otherwise
# still be counted when the corpus is split below.
browntext = [sentence for sentence in browntext if sentence]

# Optional lemmatisation step (disabled):
# lemmatizer = WordNetLemmatizer()
# browntext = [" ".join(lemmatizer.lemmatize(w) for w in s.split()) for s in browntext]

# shuffle sentences and split into train test and validation
random.shuffle(browntext)
browntext_len = len(browntext)
i = round(browntext_len * TRAIN_FRACTION)
j = round(browntext_len * VALIDATION_END)
train_set = browntext[0:i]
validation_set = browntext[i:j]
test_set = browntext[j:]
print(len(browntext), len(train_set), len(test_set), len(validation_set), i, j)


In [None]:
# dummy = (' ').join(train_set)

# print(len(set(dummy.split())))

# words = set(nltk.corpus.words.words())

# print(len([ w for w in set(dummy.split()) if(w in words)] ))
# req = nltk.FreqDist(dummy.split())
# i=0
# for k,v in req.items():
#   if(v==2):
#     print(str(k) + ': ' + str(v))
#     i=i+1
# print (i)
# print(len([ w for w in set(dummy.split()) if(w in words)] ))

In [None]:
# Learn the word -> integer-id vocabulary from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set)
# One more than the number of distinct words in the fitted word index.
total_vocab = 1 + len(tokenizer.word_index)
print(total_vocab)

# For every encoded sentence emit all prefixes of length >= 2
# (w1 w2), (w1 w2 w3), ... — the final token of each prefix is used later as
# the prediction target.
input_sequences = []
for encoded in tokenizer.texts_to_sequences(train_set):
    input_sequences.extend(
        encoded[:end] for end in range(2, len(encoded) + 1)
    )


In [None]:
# The longest prefix sets the common width; shorter prefixes are padded at the
# front ('pre') so the last token of every row sits in the final column.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)


In [None]:
# Every column but the last is the model input; the last column is the
# next-word label, one-hot encoded over the whole vocabulary.
train_X = input_sequences[:, :-1]
train_y = ku.to_categorical(input_sequences[:, -1], num_classes=total_vocab)


In [None]:
def create_model(train_X, train_y, max_sequence_len, total_vocab, epochs=10):
    """Build, compile and fit the stacked-LSTM next-word model.

    Args:
        train_X: Input token-id sequences passed to `model.fit`. Assumed to be
            the padded sequences with their last (label) column removed, i.e.
            `max_sequence_len - 1` tokens wide.
        train_y: Targets passed to `model.fit`; assumed to be one-hot rows
            with `total_vocab` columns, matching the softmax output.
        max_sequence_len: Width of the padded sequences *including* the label
            token.
        total_vocab: Size of the embedding table and of the softmax output.
        epochs: Number of training epochs (defaults to the previous
            hard-coded value of 10).

    Returns:
        The fitted `Sequential` model.
    """
    model = Sequential()
    # The inputs are one token shorter than the padded sequences because the
    # final token is the label, so the declared input length is len - 1.
    model.add(Embedding(total_vocab, 10, input_length=max_sequence_len - 1))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    model.add(Dense(total_vocab, activation='softmax'))
    # `metrics` is given as a list, the form accepted by every Keras version.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line.
    model.summary()
    model.fit(train_X, train_y, epochs=epochs, verbose=1)
    return model
#     model = Sequential()
#   model.add(Embedding(total_vocab, 10, input_length=input_len))
#   model.add(LSTM(10))
#   model.add(Dropout(0.1))
#   model.add(Dense(1, activation='softmax'))
#   model.compile(loss='categorical_crossentropy', optimizer='adam')
#   model.fit(train_X, train_y, epochs=5, verbose=1)

In [None]:
# Train the network on the prepared n-gram prefixes; keyword arguments make the
# role of each value explicit.
model = create_model(
    train_X=train_X,
    train_y=train_y,
    max_sequence_len=max_sequence_len,
    total_vocab=total_vocab,
)