## Natural Language Processing

In [1]:
# Nlp pretrained model -> actually a language model

# Language model: Special kind of model that has been
# trained to guess what the next word in a text.
# we don't need to give labels to model, it has a process
# to automatically get labels from the data.

#Self supervised learning: Training a model using labels 
#that are embedded in the independent variable, rather 
#than requiring external labels. For instance, 
#training a model to predict the next word in a text.

# Universal Language Model Fine-tuning(ULMFit): An extra
# stage of fine-tuning of the language model, prior to 
# transfer learning to a classification task.
# so, fine-tune the pretrianed language model, which was
# trained only on wikipedia articles; this will result
# the model is good at
# predicting the next word of the movie review.


In [2]:
# Text preprocessing
# Step 1 : first we need to concatenate all of the docx to a long string;
# and split into words(tokens)
# Step 2: Our independent variable will be the sequence of words
# starting with the first work in ut very long list and ending with the
# second to last.
# Step 3: our dependent variable will be the sequence of words starting 
# with the second word and ending with the last word

# * we use the corresponding row in the embedding matrix, for those
# words that are in the vocabulary list of our pretrained model.

# * for new words we will just initialize the corresponding row with a 
# random vector.


In [3]:
# Tokenization (Convert the text into a list of words)
# Token  One element of a list created by the tokenization process. It could be a word, part of a word (a subword), or a single character.
# Word-based: Split a sentence on spaces
# Subword based: Split words into smaller parts, based on the most commonly
# occuring substrings. For instance, "Occasion" might be tokenized as "o c ca sion."

# Character-based: Split a sentence into its individual characters


In [2]:
from fastai.text.all import *
from IPython.display import display,HTML

In [3]:
path = untar_data(URLs.IMDB)

In [6]:
files = get_text_files(path, folders = ['train', 'test', 'unsup'])

In [7]:
txt = files[0].open().read()
txt[:40]

'Feeling Minnesota, directed by Steven Ba'

In [8]:
# fastai WordTokenizer uses Spacy until now.
spacy = WordTokenizer()
toks = first(spacy([txt])) # first index 0 element
print(coll_repr(toks, 10)) # coll_repr(collection, n) function to display the results. n items of collections

(#138) ['Feeling','Minnesota',',','directed','by','Steven','Baigelmann',',','and','starring'...]


In [9]:
first(spacy(['hi lalkrishna! how are you, what about your goals']))

(#11) ['hi','lalkrishna','!','how','are','you',',','what','about','your'...]

In [10]:
# Fastai additional functionality in Tokenizer class
tkn = Tokenizer(spacy)

In [11]:
coll_repr(tkn(txt),10)

"(#162) ['xxbos','xxmaj','feeling','xxmaj','minnesota',',','directed','by','xxmaj','steven'...]"

In [12]:
# xxmaj(beginning of stream) -> next work will start will capital letter
# or model need to forget what was said previously and focus on upcoming words.

# xxbos -> Start of the document
# xxunk -> word is unknown
# the reason why we do it, is that the cap version and lower case version 
# gone be two words in embedding matrixs.
# sometimes cap might matter.


In [13]:
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [14]:
# fix_html: replaces special Html characters with a readable version
# replace_rep: replaces any character repeated 3 times or more with a special
# token for repetition (xxrep), number of times it's repeated, then the character.

# spec_add_spaces: adds spaces around / and #
# rm_useless_spaces: removes all repetitions of the space character
# replace_all_caps: lowercases a word(caps) -> add token (xxup) in front of it
# replace_maj: lowercases a capitalized word -> add token(xxmaj)
# lowercase: lowecases all text -> add beginning xxbox or at the end xxeos

In [15]:
coll_repr(tkn('&copy;   Fast.ai www.fast.ai/INDEX'), 31)

"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup','index']"

In [16]:
coll_repr(tkn('T4wads dake 2la Ad'), 31)

"(#7) ['xxbos','xxmaj','t4wads','dake','2la','xxmaj','ad']"

## Subwork tokenization

In [17]:
# some languages don't uses space. to handle these cases
# we use subword tokenization -> language like chines, japanese

# Step 1: Analyze a corpus of documents to find the most commonly 
#occurring groups of letters. These become the vocab.

# Step 2: Tokenize the corpus using this vocab of subword units.

In [18]:
txts = L(o.open().read() for o in files[:1000])
first(txts)[:75]

'Feeling Minnesota, directed by Steven Baigelmann, and starring Keanu Reeves'

In [19]:
def subword(sz):
    sp = SubwordTokenizer(vocab_sz=sz) 
    # this will train/ find the commonly occuring groups of letter
    sp.setup(txts)
    return ' '.join(first(sp([txt]))[:40])

In [20]:
subword(6000)
# _ is a special character represent space character in the original text.

"▁F e el ing ▁Minne so ta , ▁directed ▁by ▁Steven ▁Baigelman n , ▁and ▁ starring ▁K eanu ▁Reeve s , ▁Camer on ▁Diaz ▁and ▁Vince nt ▁D ' On of ri o : ▁The ▁strain ed ▁relationship ▁between"

In [21]:
subword(200)

'▁ F e e l ing ▁ M in n es o t a , ▁d i re c t ed ▁b y ▁S t e ve n ▁ B a i g e l m an n , ▁and'

In [22]:
subword(500)

'▁F e el ing ▁M in ne s o t a , ▁ direct ed ▁by ▁S te ve n ▁B a ig el man n , ▁and ▁st ar r ing ▁K e an u ▁R e e ve'

In [23]:
# a large vocab means fewer tokens per sentence.
# downside -> large embedding matrices. also require more data to learn.

## Numericalization

In [24]:
# Numericalization with fastai
# is the process of mapping tokens to integers.
# Step1: Make a list of all possible levels of that categorical variable (the vocab).
# Step2: Replace each level with its index in the vocab.

In [22]:
toks = tkn(txt)
print(coll_repr(tkn(txt), 31))

(#162) ['xxbos','xxmaj','feeling','xxmaj','minnesota',',','directed','by','xxmaj','steven','xxmaj','baigelmann',',','and','starring','xxmaj','keanu','xxmaj','reeves',',','xxmaj','cameron','xxmaj','diaz','and','xxmaj','vincent',"d'onofrio",':','xxmaj','the'...]


In [23]:
toks400 = txts[:400].map(tkn)
first(toks400)

(#162) ['xxbos','xxmaj','feeling','xxmaj','minnesota',',','directed','by','xxmaj','steven'...]

In [24]:
num = Numericalize() #min_freq = 3 means any word apper less than 3 times is replaced with xxunk
num.setup(toks400)
coll_repr(num.vocab, 20)

"(#3312) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','and','of','to','is','i','in','it'...]"

In [25]:
nums = num(toks)[:20]
nums

TensorText([   2,    8,  919,    8,    0,   11,  587,   49,    8, 1647,    8,    0,
          11,   13,  985,    8,    0,    8,    0,   11])

In [26]:
num.vocab[2]

'xxbos'

In [27]:
' '.join(num.vocab[o] for o in nums)

'xxbos xxmaj feeling xxmaj xxunk , directed by xxmaj steven xxmaj xxunk , and starring xxmaj xxunk xxmaj xxunk ,'

## Put txts into batches

In [28]:
stream = txt
tokens = tkn(stream)
bs, seq_len = 6, 15
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])

In [29]:
d_tokens[0]

array(['xxbos', 'xxmaj', 'feeling', 'xxmaj', 'minnesota', ',', 'directed',
       'by', 'xxmaj', 'steven', 'xxmaj', 'baigelmann', ',', 'and',
       'starring'], dtype='<U12')

In [30]:
tokens[0*15:(0+1)*15]

(#15) ['xxbos','xxmaj','feeling','xxmaj','minnesota',',','directed','by','xxmaj','steven'...]

In [31]:
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False, header=None)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
xxbos,xxmaj,feeling,xxmaj,minnesota,",",directed,by,xxmaj,steven,xxmaj,baigelmann,",",and,starring
xxmaj,keanu,xxmaj,reeves,",",xxmaj,cameron,xxmaj,diaz,and,xxmaj,vincent,d'onofrio,:,xxmaj
the,strained,relationship,between,two,brothers,",",xxmaj,sam,(,d'onofrio,),and,xxmaj,jjaks
(,reeves,),",",is,pushed,to,breaking,point,when,xxmaj,jjaks,arrives,at,xxmaj
sam,'s,wedding,and,makes,off,with,the,bride,",",xxmaj,freddie,(,diaz,)
",",a,former,stripper,",",marrying,xxmaj,sam,to,repay,a,gambling,debt,owed,to


In [35]:
# We can't directly feed the df bcoz a single batch containing all the texts would fit in our GPU memory

In [36]:
#First Minibatch
bs,seq_len = 6,5
d_tokens = np.array([tokens[i*15:i*15+seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
xxbos,xxmaj,feeling,xxmaj,minnesota
xxmaj,keanu,xxmaj,reeves,","
the,strained,relationship,between,two
(,reeves,),",",is
sam,'s,wedding,and,makes
",",a,former,stripper,","


In [37]:
#Then this one
bs,seq_len = 6,5
d_tokens = np.array([tokens[i*15+seq_len:i*15+2*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
",",directed,by,xxmaj,steven
xxmaj,cameron,xxmaj,diaz,and
brothers,",",xxmaj,sam,(
pushed,to,breaking,point,when
off,with,the,bride,","
marrying,xxmaj,sam,to,repay


In [38]:
# The above functionality is achieved through fastai LMDataLoader

In [39]:
num400 = [num(i) for i in toks400]
# or toks400.map(num)

In [40]:
num400[0][:20]

TensorText([   2,    8,  919,    8,    0,   11,  587,   49,    8, 1647,    8,    0,
          11,   13,  985,    8,    0,    8,    0,   11])

In [41]:
dl = LMDataLoader(num400)

In [42]:
x, y = first(dl)
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [43]:
# this is we want x and y

In [44]:
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos xxmaj feeling xxmaj xxunk , directed by xxmaj steven xxmaj xxunk , and starring xxmaj xxunk xxmaj xxunk ,'

In [45]:
' '.join(num.vocab[o] for o in y[0][:20])

'xxmaj feeling xxmaj xxunk , directed by xxmaj steven xxmaj xxunk , and starring xxmaj xxunk xxmaj xxunk , xxmaj'

## Language model using DataBlock

In [46]:
#fastai automatically handles tokenization and numericalization 
# when TextBlock is passed in DataBlock

In [4]:
get_imdb = partial(get_text_files, folders = ['train', 'test', 'unsup'])

dls_lm = DataBlock(
    blocks = TextBlock.from_folder(path, is_lm = True),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1)
).dataloaders(path, bs=80, seq_len = 50)

In [21]:
dls_lm.show_batch(max_n=2)
# we can see that the dependent variable is offset by one

Unnamed: 0,text,text_
0,"xxbos xxmaj this wonderfully witty comedy - drama wowed the crowd at the xxmaj philadelphia xxmaj film xxmaj festival , whipping them into wild applause at its conclusion . xxmaj xxunk by adept performances by a nuanced cast , sturdy","xxmaj this wonderfully witty comedy - drama wowed the crowd at the xxmaj philadelphia xxmaj film xxmaj festival , whipping them into wild applause at its conclusion . xxmaj xxunk by adept performances by a nuanced cast , sturdy execution"
1,", do n't get mad , get even ! ah , and one more thing - just do n't get confused about xxmaj udo xxmaj kier 's performance , it 's a great cameo but just a cameo , same","do n't get mad , get even ! ah , and one more thing - just do n't get confused about xxmaj udo xxmaj kier 's performance , it 's a great cameo but just a cameo , same with"


## Fine tuning the language model

In [24]:
# We use rnn using an architecture called AWD-LSTM
# so the embeddings in the pretrained model are merged with 
# random embeddings added for words that weren't in the pretraining vocab.
# This is handled automatically inside language_model_learner


In [5]:
learn = language_model_learner(
        dls_lm, AWD_LSTM, drop_mult=0.3,
        metrics=[accuracy, Perplexity()]
).to_fp16()

In [23]:
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.066523,3.942662,0.297687,51.555634,1:21:38


## Saving model

In [25]:
learn.save('1epoch.pth')

Path('models/1epoch.pth.pth')

In [6]:
learn = learn.load('1epoch.pth')

In [7]:
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time


KeyboardInterrupt: 

In [8]:
torch.device

torch.device