## Natural Language Processing

In [1]:
# NLP pretrained model -> actually a language model

# Language model: A special kind of model that has been
# trained to guess what the next word in a text is.
# We don't need to give labels to the model; it has a process
# to automatically get labels from the data.

#Self supervised learning: Training a model using labels 
#that are embedded in the independent variable, rather 
#than requiring external labels. For instance, 
#training a model to predict the next word in a text.

# Universal Language Model Fine-tuning(ULMFit): An extra
# stage of fine-tuning of the language model, prior to 
# transfer learning to a classification task.
# So, we fine-tune the pretrained language model, which was
# trained only on Wikipedia articles; this results in
# a model that is good at
# predicting the next word of a movie review.


In [3]:
# Text preprocessing
# Step 1: first we need to concatenate all of the documents into one long string
# and split it into words (tokens).
# Step 2: Our independent variable will be the sequence of words
# starting with the first word in our very long list and ending with the
# second to last.
# Step 3: our dependent variable will be the sequence of words starting 
# with the second word and ending with the last word

# * we use the corresponding row in the embedding matrix, for those
# words that are in the vocabulary list of our pretrained model.

# * for new words we will just initialize the corresponding row with a 
# random vector.


In [4]:
# Tokenization (Convert the text into a list of words)
# Token: One element of a list created by the tokenization process. It could be a word, part of a word (a subword), or a single character.
# Word-based: Split a sentence on spaces
# Subword based: Split words into smaller parts, based on the most commonly
# occurring substrings. For instance, "occasion" might be tokenized as "o c ca sion."

# Character-based: Split a sentence into its individual characters


In [5]:
from fastai.text.all import *

In [7]:
# Fetch the dataset identified by URLs.IMDB; the value stored in `path` is the
# root folder that the next cell searches for text files.
# NOTE(review): untar_data presumably downloads and extracts only when needed — confirm in the fastai docs.
path = untar_data(URLs.IMDB)

In [8]:
# Collect the text files under `path`, limited to the train, test and unsup folders.
files = get_text_files(
    path,
    folders=['train', 'test', 'unsup'],
)

In [21]:
# Load the first review into memory. Path.read_text() opens the file, reads it
# and closes it again, so no file handle is left open in the kernel
# (a bare `.open().read()` never closes the handle it creates).
txt = files[0].read_text()
# Preview the first 40 characters as the cell output.
txt[:40]

'Feeling Minnesota, directed by Steven Ba'

In [42]:
# fastai's WordTokenizer (currently backed by spaCy utilities).
spacy = WordTokenizer()
# The tokenizer is called with a list of documents, so wrap the single review
# in a list and take the first (index 0) result.
token_batch = spacy([txt])
toks = first(token_batch)
# coll_repr(collection, n) builds a display string showing the collection's
# size and its first n items.
print(coll_repr(toks, 10))

(#138) ['Feeling','Minnesota',',','directed','by','Steven','Baigelmann',',','and','starring'...]


In [33]:
# Sanity-check the word tokenizer on a hand-written sentence: punctuation such as
# '!' and ',' comes back as separate tokens.
first(spacy(['hi lalkrishna! how are you, what about your goals']))

(#11) ['hi','lalkrishna','!','how','are','you',',','what','about','your'...]

In [37]:
# fastai's Tokenizer class wraps the word tokenizer and adds extra functionality:
# its output carries special tokens such as xxbos and xxmaj alongside the
# lower-cased words.
tkn = Tokenizer(spacy)

In [41]:
# Tokenize the same review with the wrapped Tokenizer and show the first 10 tokens.
coll_repr(tkn(txt),10)

"(#162) ['xxbos','xxmaj','feeling','xxmaj','minnesota',',','directed','by','xxmaj','steven'...]"

In [43]:
# xxmaj -> the next word starts with a capital letter.
# xxbos (beginning of stream) -> start of the document; on seeing it the model
# needs to forget what was said previously and focus on the upcoming words.
# xxunk -> the word is unknown
# The reason for lowercasing plus xxmaj is that the capitalized version and the
# lower-case version would otherwise be two separate words in the embedding matrix.
# Capitalization sometimes matters, so the marker token preserves it.


In [48]:
# List the default text-processing rules; the cell output shows they are functions from fastai.text.core.
# NOTE(review): presumably these are the rules Tokenizer applies before word tokenization — confirm in the fastai docs.
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [52]:
# fix_html: replaces special HTML characters with a readable version
# replace_rep: replaces any character repeated 3 times or more with a special
# token for repetition (xxrep), the number of times it's repeated, then the character.
# replace_wrep: replaces any word repeated 3 times or more with a special token
# for word repetition (xxwrep), the number of times it's repeated, then the word.

# spec_add_spaces: adds spaces around / and #
# rm_useless_spaces: removes all repetitions of the space character
# replace_all_caps: lowercases a word written in all caps -> adds the token xxup in front of it
# replace_maj: lowercases a capitalized word -> adds the token xxmaj in front of it
# lowercase: lowercases all text -> adds xxbos at the beginning and/or xxeos at the end

In [53]:
# Exercise several rules at once on a crafted string: the output shows '&copy;'
# turned into '©', 'Fast.ai' marked with xxmaj, 'www' collapsed to xxrep 3 w,
# and 'INDEX' marked with xxup and lower-cased.
coll_repr(tkn('&copy;   Fast.ai www.fast.ai/INDEX'), 31)

"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup','index']"

In [None]:
# Same string without the leading '&', so 'copy;' is no longer an HTML entity.
# NOTE(review): this cell has not been executed (no execution count or output) —
# run it to compare against the result of the previous cell.
coll_repr(tkn('copy;   Fast.ai www.fast.ai/INDEX'), 31)