In [1]:
%%capture
import re # to handle regular expression
import demoji  # for emojis handle
import random # to generate random number
import inflect  # to handle number to words
import numpy as np # scitific calculation 
import pandas as pd # data manipulation 
from bs4 import BeautifulSoup # handle html tag
import matplotlib.pyplot as plt # plot any display

import torch # deep learning handle
import torchtext # nlp handle
import torch.nn as nn # NN handle
import torch.optim as optim # optimizer handle
import torch.nn.functional as F # all type to DL funciton
from torch.nn.functional import one_hot # encoder-decoder
from torchtext.data.utils import get_tokenizer # tokenize 
print('Successfully import all the libraries')

In [2]:
# Reading the text 
file_path = 'human_chat.txt'
with open(file_path,"r") as f:
    lines = f.readlines()

In [3]:
lines

['Human 1: Hi!\n',
 'Human 2: What is your favorite holiday?\n',
 'Human 1: one where I get to meet lots of different people.\n',
 'Human 2: What was the most number of people you have ever met during a holiday?\n',
 'Human 1: Hard to keep a count. Maybe 25.\n',
 'Human 2: Which holiday was that?\n',
 'Human 1: I think it was Australia\n',
 'Human 2: Do you still talk to the people you met?\n',
 "Human 1: Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them\n",
 'Human 2: Yea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?\n',
 'Human 1: what do you mean?\n',
 'Human 2: I think it\'s like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt

In [4]:
def preprocessing_text_method(text):
    # Remove HTML tag
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    p = inflect.engine() #101
    
    # remove emojis
    text = demoji.replace(text, '')

    # Remove mentions of "Humna 1 and Humen 2"
    text = re.sub(r'\b(?:Human 1 |Human 2)\b:?', " ", text)

    # Replace numbers with words
    text = re.sub(r'\b\d+\b', lambda x:p.number_to_words(x.group()), text)

    # Remove special characters , keeping only alphabetic and space
    text = re.sub('[^a-zA-Z\s]', ' ', text)

    # Replace specifica unicode spaces wiht standad space and trim
    text = text.replace(u'\xa0', u' ').replace('\u200a', ' ').strip()
    
    return text

In [5]:
# Execute the function code
import warnings
from bs4 import MarkupResemblesLocatorWarning
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning) # use to handle the error handle of markupresembl
processed_text = [preprocessing_text_method(line) for line in lines]
processed_text[:20]

['Human one  Hi',
 'What is your favorite holiday',
 'Human one  one where I get to meet lots of different people',
 'What was the most number of people you have ever met during a holiday',
 'Human one  Hard to keep a count  Maybe twenty five',
 'Which holiday was that',
 'Human one  I think it was Australia',
 'Do you still talk to the people you met',
 'Human one  Not really  The interactions are usually short lived but it s fascinating to learn where people are coming from and what matters to them',
 'Yea  me too  I feel like God often puts strangers in front of you  and gives you an opportunity to connect with them in that moment in deeply meaningful ways  Do you ever feel like you know things about strangers without them telling you',
 'Human one  what do you mean',
 'I think it s like a  th sense  often seen as  cold readings  to people  but can be remarkably accurate  I once sat next to a man in a coffee and I felt a pain in my back  I asked the stranger if he had a pain  It tur

In [6]:
# Tokenize words
tokenizer = get_tokenizer('basic_english')
tokenized_text = [tokenizer(line) for line in processed_text]

In [7]:
tokenized_text[1]

['what', 'is', 'your', 'favorite', 'holiday']

In [8]:
feature_vocab = torchtext.vocab.build_vocab_from_iterator(
    tokenized_text,
    min_freq=1,
    specials = [
        '<pad>',
        '<oov>'
    ],
    special_first = True
)
taget_vocab = torchtext.vocab.build_vocab_from_iterator(
    tokenized_text,
    min_freq=1
)

In [9]:
features_vocab_total_words = len(feature_vocab)
taget_vocab_total_words = len(taget_vocab)

print('Features Vocab Length:', features_vocab_total_words)
print("Target Vocab Length:", taget_vocab_total_words)

Features Vocab Length: 2749
Target Vocab Length: 2747


In [10]:
# N-gram from the tokenized text
def make_ngrams(tokenized_text):
    list_ngrams = []
    for i in range(1, len(tokenized_text)):
        ngram_sequence = tokenized_text[:i+1]
        list_ngrams.append(ngram_sequence)
    return list_ngrams

In [11]:
# Execute the function and make ngram
ngrams_list = []
for tokenize_con in tokenized_text:
    ngrams_list.extend(make_ngrams(tokenized_text=tokenize_con))

In [12]:
ngrams_list[:10]

[['human', 'one'],
 ['human', 'one', 'hi'],
 ['what', 'is'],
 ['what', 'is', 'your'],
 ['what', 'is', 'your', 'favorite'],
 ['what', 'is', 'your', 'favorite', 'holiday'],
 ['human', 'one'],
 ['human', 'one', 'one'],
 ['human', 'one', 'one', 'where'],
 ['human', 'one', 'one', 'where', 'i']]

In [16]:
import random
# add random oov takens to let the model handle the out of vocabulary takens
def add_random_oov_token(ngram):
    for idx, word in enumerate(ngram[:-1]):
        if random.uniform(0,1) < 0.1:
            ngram[idx] = '<oov>'
    return ngram

In [17]:
ngrams_list_oov = []

for ngram in ngrams_list:
    ngrams_list_oov.append(add_random_oov_token(ngram))
print(any('<oov>' in ngram for ngram in ngrams_list_oov))


True


In [18]:
ngrams_list_oov[:10]

[['human', 'one'],
 ['<oov>', 'one', 'hi'],
 ['what', 'is'],
 ['what', '<oov>', 'your'],
 ['what', 'is', 'your', 'favorite'],
 ['what', 'is', 'your', '<oov>', 'holiday'],
 ['human', 'one'],
 ['human', 'one', 'one'],
 ['human', 'one', 'one', 'where'],
 ['human', 'one', 'one', 'where', 'i']]