# Phase 5 - POS tagging and insertion of Singlish words

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the data/weights paths declared later are relative ('./drive/My Drive/...'),
# so this presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## IMPORT RELEVANT LIBRARIES

In [None]:
# import the nltk library to do POS tagging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download the NLTK resources this notebook uses: the perceptron tagger
# (for nltk.pos_tag), the punkt models (for word_tokenize) and the stop-word corpus.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): stop_words is built here but is not referenced by any cell
# visible in this notebook.
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# import other libraries
import numpy as np
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

## DECLARING CONSTANTS

In [None]:
# FIXED CONSTANTS
# Cap on the token index scanned while building word sequences; the
# sequence-building loop stops once its index passes this value.
WORD_LIMIT = 500000
LENGTH = 50 + 1  # input seed (50) + predicted output (1)
NO_OF_OUTPUT_WORDS = 30  # number of words generated after the seed text

# file paths of the dataset
DATA = ["SgCorpus"]  # corpus file name(s), without the '.txt' extension
COLAB_FILEPATH = './drive/My Drive/next-sentence-predictor/finalData/'
# directory that load_pretrain_model reads the saved .h5 files from
WEIGHTS_DIR = './drive/My Drive/next-sentence-predictor/saved_weights/'

## LOAD & PREPROCESS THE DATA

In [None]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its whole content in lower case.

    Args:
        filename: path of the text file to read.

    Returns:
        The file content as one lower-cased string.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read().lower()

# FUNCTION TO FURTHER CLEAN THE SCRAPED DATASET
def clean_txt(doc):
    """Turn raw text into a list of lower-case, purely alphabetic tokens.

    The text is split on whitespace, the characters in string.punctuation are
    removed from every token, and any token that is not entirely alphabetic
    afterwards (including tokens that became empty) is dropped.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in doc.split():
        bare = raw.translate(strip_punct)
        if bare.isalpha():
            cleaned.append(bare.lower())
    return cleaned

In [None]:
# Build the corpus path from the first entry of DATA
filename = COLAB_FILEPATH + DATA[0] + '.txt'
# Read the whole corpus (load_doc returns it lower-cased)
data = load_doc(filename)

# Clean the raw text into a flat list of alphabetic word tokens
tokens = clean_txt(data)

# GET WORD SEQUENCES
# Each line is a window of LENGTH consecutive tokens joined by spaces.
lines = list()
for i in range(LENGTH, len(tokens),2): # step of 2: neighbouring windows start two tokens apart
    seq = tokens[i-LENGTH:i]
    line = ' '.join(seq)
    lines.append(line)
    # resource constraint (colab RAM):
    # stop building windows once the window end index passes WORD_LIMIT
    if i > WORD_LIMIT:
        break

# TOKENIZE TEXT SEQUENCE
# Fit the vocabulary on the windows, then encode every window as integer ids.
# NOTE(review): the saved models loaded later presumably expect this exact
# vocabulary/preprocessing — confirm against the training notebook.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# convert to numpy array
# (the 2-D slicing below assumes every window encodes to the same number of ids)
sequences = np.array(sequences)

# assign X and y: all columns but the last are the input, the last column is the target
X, y = sequences[:,:-1], sequences[:,-1]

# SIZE OF THE VOCAB
# +1 because Keras Tokenizer word indices start at 1 (index 0 is reserved)
vocab_size = len(tokenizer.word_index) + 1

# one-hot the output y
y = to_categorical(y, num_classes=vocab_size, dtype='int8')

# GET THE SEQUENCE LENGTH (number of input columns in X)
seq_length = X.shape[1]

# OUTPUT DIMENSION OF THE EMBEDDING LAYER
# NOTE(review): not referenced by any other cell visible in this notebook.
EM_OUTPUT_LENGTH = 50

## HELPER FUNCTIONS TO LOAD & GENERATE TEXT
-> Load the pre-trained H5 files   
-> Generate text sequence   
-> Show generated text  

In [None]:
# LOAD PRE-TRAINED H5 FILES
# Write a function to load different variant of the pre-trained model
def load_pretrain_model(model_name,batch_size,epochs, optimizer_type):
    """Load one saved model variant from WEIGHTS_DIR.

    The file name is derived from the arguments as
    "<model_name>-<epochs>_epoch-<batch_size>_batch_size-<optimizer_type>.h5".

    Returns:
        Whatever load_model returns for that file.
    """
    weights_path = f"{WEIGHTS_DIR}{model_name}-{epochs}_epoch-{batch_size}_batch_size-{optimizer_type}.h5"
    return load_model(weights_path)

In [None]:
# GENERATE TEXT SEQUENCES
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
  """Greedily generate n_words words that continue seed_text.

  On every step the running text is encoded with the tokenizer, padded or
  pre-truncated to text_seq_length, passed to model.predict, and the word
  whose id has the highest score is appended to the running text.

  Args:
    model: object exposing predict(encoded) that returns per-id scores.
    tokenizer: object exposing texts_to_sequences and word_index.
    text_seq_length: length the encoded input is padded/truncated to.
    seed_text: text the generation starts from.
    n_words: number of words to generate.

  Returns:
    The generated words joined by single spaces (without the seed text).
  """
  # Reverse vocabulary built once, instead of scanning word_index per word.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  generated = list()

  for _ in range(n_words):
    # [0] - the single encoded sequence for the single input text
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen = text_seq_length, truncating='pre')

    # arg-max over the scores of the single input row, taken as a plain int
    predicted_index = int(np.argmax(model.predict(encoded), axis=-1)[0])

    # ids without a vocabulary entry (e.g. 0) yield an empty word
    predicted_word = index_to_word.get(predicted_index, '')
    seed_text = seed_text + ' ' + predicted_word
    generated.append(predicted_word)

  return ' '.join(generated)

In [None]:
# SHOW GENERATED TEXT
def show_generated_text(model, tokenizer, seq_length, no_of_output_words, own_text="eating and sleeping",seed=12345):
    """Generate a continuation of own_text, print it and return the full sentence.

    Args:
        model: forwarded to generate_text_seq.
        tokenizer: forwarded to generate_text_seq.
        seq_length: forwarded to generate_text_seq as the input length.
        no_of_output_words: number of words to generate after own_text.
        own_text: seed phrase the generation starts from.
        seed: unused; retained so existing calls that pass seed= keep working.

    Returns:
        own_text, a single space, then the generated words.
    """
    generated = generate_text_seq(model, tokenizer, seq_length, own_text, no_of_output_words)
    print("Text generated from own seed text:")
    print(f"{own_text} {generated}")

    return own_text + " " + generated

## HELPER FUNCTIONS FOR POS TAGGING
-> Tagging the generated sentence  
-> Insertion of Singlish words into the generated sentence

In [None]:
# Tagging the generated sentence
def sentenceTagging(sentence):
    """Tokenize a sentence and return its (word, POS tag) pairs as a list.

    Every token produced by word_tokenize is kept; no stop-word filtering
    is applied before tagging.
    """
    tokens = list(word_tokenize(sentence))
    # nltk.pos_tag pairs each token with its part-of-speech tag
    return list(nltk.pos_tag(tokens))

In [None]:
# Insertion of singlish words into the generated sentence
def singlify(tagged_sentence, singlish_adj_list):
    """Rebuild a sentence from (word, POS tag) pairs with Singlish touches.

    A fixed set of words is swapped for a Singlish spelling or phrase, and a
    randomly chosen entry of singlish_adj_list is inserted after every word
    tagged 'JJ'.

    Args:
        tagged_sentence: iterable of (word, tag) pairs, e.g. the result of
            sentenceTagging.
        singlish_adj_list: sequence of words to draw insertions from. When it
            is empty no word is inserted (instead of random.choice raising
            IndexError).

    Returns:
        The words joined by single spaces; '' for an empty input.
    """
    # Words written in a more Singlish way
    replacements = {
        'zero': 'jilo',
        'copy': 'kope',
        'already': 'orredy',
        'very': 'very the',
        'vomit': 'womit',
        'better': 'more better',
    }
    can_insert = len(singlish_adj_list) > 0

    parts = []
    for pair in tagged_sentence:
        word = pair[0]
        type_of_word = pair[1]
        parts.append(replacements.get(word, word))

        # The tag comes from the original word, so replaced words still count
        if type_of_word == 'JJ' and can_insert:
            parts.append(random.choice(singlish_adj_list))

    # Collecting parts and joining once avoids repeated string concatenation
    return " ".join(parts)

In [None]:
# a composite function that each of the generated text could call
def insert_singlish_word(raw_sentence, singlish_adj_list):
    """POS-tag raw_sentence with sentenceTagging and return what singlify builds from it."""
    return singlify(sentenceTagging(raw_sentence), singlish_adj_list)

## LIST OF SINGLISH WORDS TO INSERT AFTER ADJECTIVES

In [None]:
# Pool of Singlish words passed to insert_singlish_word / singlify, which
# inserts a randomly chosen one after each word tagged 'JJ'.
# NOTE(review): the random module is not seeded in the visible cells, so the
# inserted words will differ between runs.
SINGLISH_ADJ_LIST = ["lah","lorh","leh","sia","mah","meh","kenna","liao","siao"]

## LOAD PRE-TRAINED H5 FILE & SHOW GENERATED TEXT
**Selected models**  
1. Stacked LSTM + Adam + Batch Size 128 + 20 epochs  
2. Stacked LSTM + Adam + Batch Size 128 + 100 epochs  
3. Stacked LSTM + Adam + Batch Size 64 + 100 epochs   
4. Stacked GRU + Adam + Batch Size 64 + 100 epochs

**Selected text starter**  
1. here i am testing my nlg project...  
2. never did i have...  
3. someday i will...

### FIXED CONSTANTS USED

In [None]:
# Epoch count used to pick which saved model file to load (passed as
# `epochs` to load_pretrain_model); no training is done in the visible cells.
EPOCHS = 100
# Passed as the `seed` argument of show_generated_text.
SEED = 1234

## TEXT STARTER 1: here i am testing my nlg project...

In [None]:
# starting phrase used as our own seed text by the cells of this section
OWN_TEXT_STARTER = "here i am testing my nlg project"

#### Stacked LSTM + Adam + 20 Epochs + Batch Size 128

In [None]:
# Stacked LSTM + Adam + 20 Epochs + Batch Size 128
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=128, epochs=20, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
here i am testing my nlg project and i am a crush on my og and i am not really nice i am not a crush on my og and i am not really nice i am

Singlish sentence:
here i lorh am testing my nlg project and i am a crush on my og and i am not really nice siao i am not a crush on my og and i am not really nice meh i am


#### Stacked LSTM + Adam + 100 Epochs + Batch Size 128

In [None]:
# Stacked LSTM + Adam + 100 Epochs + Batch Size 128
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=128, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
here i am testing my nlg project customers subscribe for the latest food news reviewsfacebook daniels food diaryinstagram food videos singapore deli chef recipes off on this new outlet by yong noodle can check out tunglok new

Singlish sentence:
here i lah am testing my nlg project customers subscribe for the latest food news reviewsfacebook daniels food diaryinstagram food videos singapore deli liao chef recipes off on this new siao outlet by yong meh noodle can check out tunglok new sia


#### Stacked LSTM + Adam + 100 epochs + Batch Size 64 

In [None]:
# Stacked LSTM + Adam + 100 Epochs + Batch Size 64
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=64, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
here i am testing my nlg project back craze spend loo speakers blogger was being that delivering job on drama masks for heavy session at the past courses and hasi is supporting me because the rest is

Singlish sentence:
here i kenna am testing my nlg project back craze kenna spend loo speakers blogger was being that delivering job on drama masks for heavy leh session at the past liao courses and hasi is supporting me because the rest is


#### Stacked GRU + Adam + 100 epochs + Batch Size 64 

In [None]:
# Stacked GRU + Adam + 100 Epochs + Batch Size 64
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_GRU", batch_size=64, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
here i am testing my nlg project ceo kate forever awesome ceo air on kandie flats ceo closing forever playing foreign workers wearing todays projects lets actually enjoying playing roasted resort date ceo lee roasted dumplings man

Singlish sentence:
here i sia am testing my nlg project ceo kate forever awesome mah ceo air on kandie flats ceo closing forever playing foreign liao workers wearing todays projects lets actually enjoying playing roasted resort date ceo lee roasted dumplings man


## TEXT STARTER 2: never did i have...

In [None]:
# starting phrase used as our own seed text by the cells of this section
OWN_TEXT_STARTER = "never did i have"

#### Stacked LSTM + Adam + 20 Epochs + Batch Size 128

In [None]:
# Stacked LSTM + Adam + 20 Epochs + Batch Size 128
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=128, epochs=20, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
never did i have been a crush on my og is a lot of the best for jubilant you be a good to be a good time to be a good thing in the

Singlish sentence:
never did i have been a crush on my og is a lot of the best for jubilant you be a good liao to be a good sia time to be a good leh thing in the


#### Stacked LSTM + Adam + 100 Epochs + Batch Size 128

In [None]:
# Stacked LSTM + Adam + 100 Epochs + Batch Size 128
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=128, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
never did i have deserve favours was the schools is chosen thesmartlocalcom away the girls out some ppl hes super terrible bottle of the cable system from stage of aug this was an hat

Singlish sentence:
never did i have deserve favours was the schools is chosen thesmartlocalcom away the girls out some ppl hes super terrible liao bottle of the cable system from stage of aug this was an hat


#### Stacked LSTM + Adam + 100 epochs + Batch Size 64

In [None]:
# Stacked LSTM + Adam + 100 Epochs + Batch Size 64
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=64, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
never did i have crossed his results at tedx n was my initials community area there meet fighting pls want a wrong impression i forgot with stuff from music dollarsandsense family last list of

Singlish sentence:
never did i have crossed his results at tedx sia n was my initials community area there meet fighting pls want a wrong meh impression i forgot with stuff from music dollarsandsense family last mah list of


#### Stacked GRU + Adam + 100 epochs + Batch Size 64

In [None]:
# Stacked GRU + Adam + 100 Epochs + Batch Size 64
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_GRU", batch_size=64, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
never did i have hit forever on becoming awesome earlier ceo water a wealth ceo ceo date ceo rewards roasted diary new view liang coupons live using the high food highlights the worlds area

Singlish sentence:
never did i have hit forever on becoming awesome leh earlier ceo siao water a wealth ceo ceo date ceo rewards roasted diary liao new leh view liang mah coupons live using the high leh food highlights the worlds area


## TEXT STARTER 3: someday i will...

In [None]:
# starting phrase used as our own seed text by the cells of this section
OWN_TEXT_STARTER = "someday i will"

#### Stacked LSTM + Adam + 20 Epochs + Batch Size 128

In [None]:
# Stacked LSTM + Adam + 20 Epochs + Batch Size 128
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=128, epochs=20, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
someday i will be shocked on the past years and i am been a good in the end of the best for yur in the end of the end of the end of

Singlish sentence:
someday i will be shocked on the past liao years and i am been a good lah in the end of the best for yur in the end of the end of the end of


#### Stacked LSTM + Adam + 100 Epochs + Batch Size 128

In [None]:
# Stacked LSTM + Adam + 100 Epochs + Batch Size 128
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=128, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
someday i will be yesterday for frugal type channel with my family destination careerremember of bubble tea at citylink mall danielfooddiarycomwhere to say about the details to become said to get the boyfriend

Singlish sentence:
someday i will be yesterday for frugal lorh type channel with my family destination careerremember of bubble sia tea at citylink mall danielfooddiarycomwhere to say about the details to become said to get the boyfriend


#### Stacked LSTM + Adam + 100 epochs + Batch Size 64

In [None]:
# Stacked LSTM + Adam + 100 Epochs + Batch Size 64
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_LSTM", batch_size=64, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
someday i will get to pull down and honestly i am sure aj simply dance classics next other sp wins when it was such an amazing picture keep actually peace with of bento

Singlish sentence:
someday i will get to pull down and honestly i siao am sure sia aj simply dance classics next other lorh sp wins when it was such siao an amazing meh picture keep actually peace with of bento


#### Stacked GRU + Adam + 100 epochs + Batch Size 64

In [None]:
# Stacked GRU + Adam + 100 Epochs + Batch Size 64
# load the saved model variant matching this configuration
model = load_pretrain_model(model_name="2LAYER_GRU", batch_size=64, epochs=EPOCHS, optimizer_type="Adam")

# generate NO_OF_OUTPUT_WORDS words continuing the text starter (the call also prints the result)
full_sentence = show_generated_text(model, tokenizer, seq_length, NO_OF_OUTPUT_WORDS, own_text=OWN_TEXT_STARTER, seed=SEED)

# POS-tag the sentence and insert a random Singlish word after each adjective
singlish_sentence = insert_singlish_word(full_sentence, SINGLISH_ADJ_LIST)
print("\nSinglish sentence:")
print(singlish_sentence)

Text generated from own seed text:
someday i will wanted to work on kim gold ceo bright koh lee wine sheng sea coffeehouse ceo daily bellywellyjelly discount at todays date in kandie chen steamboat arrives code koh pink roasted

Singlish sentence:
someday i will wanted to work on kim gold meh ceo bright lah koh lee wine sheng sea coffeehouse ceo lorh daily leh bellywellyjelly discount at todays siao date in kandie chen steamboat arrives code liao koh pink roasted
