# Universal Language Model Fine-tuning (ULMFiT)

In [1]:
import os
import io
import numpy as np
import pandas as pd
from functools import partial
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords 

import fastai
from fastai import *
from fastai.text import *

## Load Datasets

In [2]:
# Download the 20 Newsgroups corpus with headers, footers and quoted replies
# removed from each post; shuffling is seeded so the order is reproducible.
datasets = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
# Raw post bodies, one string per document.
documents = datasets.data

In [3]:
# Pair each document with its newsgroup id in a two-column frame.
# NOTE(review): the later fastai `from_df` calls name no columns, so they
# presumably rely on this label-then-text column order -- confirm.
df_documents = pd.DataFrame(
    {
        'label': datasets.target,
        'text': documents,
    }
)

In [4]:
# Keep only the posts whose newsgroup id is 1 or 10 (a two-class problem),
# then renumber the rows from 0 so the index is contiguous again.
df_documents = (
    df_documents
    .loc[df_documents['label'].isin([1, 10])]
    .reset_index(drop=True)
)

In [5]:
# NOTE(review): this cell repeats the previous cell's filter verbatim. Applied
# to the already-filtered frame it changes nothing, so it is safe to re-run
# but redundant.
df_documents = df_documents[df_documents['label'].isin([1,10])]
df_documents = df_documents.reset_index(drop = True)

In [6]:
# Preview the first rows of the filtered frame (raw text, before cleaning).
df_documents.head()

Unnamed: 0,label,text
0,10,"Well, I will have to change the scoring on my ..."
1,1,Archive-name: graphics/resources-list/part1\nL...
2,10,"\nAnd of course, Mike Ramsey was (at one time)..."
3,10,"As I promised, I would give you the name of th..."
4,10,GAME(S) OF 4/15\n---------------\nADIRONDACK 6...


In [7]:
# Count the documents per class to check that the two labels are balanced.
df_documents['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

## Preprocess Datasets

#### Remove non-alphabetic characters

In [8]:
# Replace every character that is not an ASCII letter with a space.
# `regex=True` is passed explicitly: pandas 2.0 changed the default of
# `Series.str.replace` to literal matching, under which this character-class
# pattern would match nothing and the text would be left untouched.
df_documents['text'] = df_documents['text'].str.replace("[^a-zA-Z]", " ", regex=True)
# Preview the cleaned text.
df_documents.head()

Unnamed: 0,label,text
0,10,Well I will have to change the scoring on my ...
1,1,Archive name graphics resources list part La...
2,10,And of course Mike Ramsey was at one time ...
3,10,As I promised I would give you the name of th...
4,10,GAME S OF ADIRONDACK C...


#### Tokenize words

In [9]:
# Split each document on whitespace into a list of word tokens.
tokenized_doc = df_documents['text'].map(str.split)

#### Remove stopwords

In [10]:
# NLTK's English stop-word list, held as a set so that the per-token
# membership test in the filtering step is a hash lookup rather than a
# linear scan of a list. Membership results are unchanged.
stop_words = set(stopwords.words('english'))

In [11]:
# Drop every token that appears in `stop_words`, document by document.
# NOTE(review): the comparison is case-sensitive, so capitalized tokens such
# as "I" or "And" are kept (visible in the preview after de-tokenizing).
# Lower-casing before the lookup would remove them too -- confirm intent.
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

#### De-tokenize words

In [12]:
# Join each token list back into a single space-separated string.
# Iterating the Series values directly (instead of looking up
# `tokenized_doc[i]` for i in range(len(...))) keeps the documents in row
# order without depending on the index being exactly 0..n-1.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

In [13]:
# Overwrite the text column with the cleaned, stop-word-free strings
# (assigned positionally from the list), then preview the result.
df_documents['text'] = detokenized_doc
df_documents.head()

Unnamed: 0,label,text
0,10,Well I change scoring playoff pool Unfortunate...
1,1,Archive name graphics resources list part Last...
2,10,And course Mike Ramsey one time captain Buffal...
3,10,As I promised I would give name Panther presid...
4,10,GAME S OF ADIRONDACK CDI Adirondack leads seri...


## Split Datasets

In [14]:
# Hold out 40% of the documents for validation. Stratifying on the label keeps
# the class proportions the same in both parts; the seed makes the split
# reproducible.
df_train, df_valid = train_test_split(
    df_documents,
    test_size=0.4,
    random_state=9,
    stratify=df_documents['label'],
)

In [15]:
# Report how many documents (rows) ended up in each split.
print(f'Train size: {df_train.shape[0]} text')
print(f'Valid size: {df_valid.shape[0]} text')

Train size: 710 text
Valid size: 474 text


In [34]:
# Count missing values per column of the training frame. `.sum()` collapses
# the element-wise boolean mask into one count per column, which is the form
# of the output recorded for this cell.
df_train.isnull().sum()

label    0
text     0
dtype: int64

## Separate Datasets for Language & Classification Model Using [fast.ai](https://docs.fast.ai/text.html)

In [None]:
# Data bunch for language-model fine-tuning, built from the train and
# validation frames.
# NOTE(review): path='' presumably makes fastai use the current working
# directory for anything it writes -- confirm.
data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_valid, path='')

# Data bunch for the classifier. It reuses the language model's vocabulary
# (vocab=data_lm.train_ds.vocab) so both models share the same token ids;
# bs=32 is presumably the batch size.
data_class = TextClasDataBunch.from_df(train_df=df_train, valid_df=df_valid, vocab=data_lm.train_ds.vocab, bs=32, path='')

## Create Pre-trained Language Model

In [None]:
# Build a language-model learner on `data_lm`, initialised from the WT103
# pretrained model, with drop_mult=0.7.
# NOTE(review): the `pretrained_model=` keyword belongs to an early fastai v1
# API; later v1 releases appear to take an architecture (e.g. AWD_LSTM) as the
# second argument instead -- confirm against the installed fastai version.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)

In [None]:
# Fine-tune the language model with the one-cycle policy; the arguments are
# presumably 1 epoch and a maximum learning rate of 1e-2 -- confirm against
# fastai's `fit_one_cycle` signature.
learn.fit_one_cycle(1, 1e-2)

In [None]:
# Run the learning-rate finder and plot what the recorder captured.
# NOTE(review): this runs after the fit, so its result did not inform the
# 1e-2 learning rate used there.
learn.lr_find()
learn.recorder.plot()

## Save Language Model As The Encoder

In [None]:
# Save the fine-tuned language model's encoder under the name 'ft_enc'; the
# classifier loads it by the same name later.
learn.save_encoder('ft_enc')

## Create Pre-trained Classifier Model

In [None]:
# Build a classifier learner on `data_class` with drop_mult=0.7. Rebinding
# `learn` replaces the language-model learner from the previous section.
# NOTE(review): later fastai v1 releases appear to require an architecture
# argument (e.g. AWD_LSTM) here -- confirm against the installed version.
learn = text_classifier_learner(data_class, drop_mult=0.7)

In [None]:
# Load the encoder saved earlier as 'ft_enc' into the classifier.
learn.load_encoder('ft_enc')

In [None]:
# Train the classifier with the one-cycle policy; the arguments are presumably
# 1 epoch and a maximum learning rate of 1e-2 -- confirm against fastai's
# `fit_one_cycle` signature.
learn.fit_one_cycle(1, 1e-2)

In [None]:
# Run the learning-rate finder on the classifier and plot the recorded curve.
# NOTE(review): as with the language model, this runs after the fit, so it
# did not inform the learning rate used there.
learn.lr_find()
learn.recorder.plot()

## Get Predictions

In [None]:
# Collect the classifier's outputs together with the true labels.
# NOTE(review): presumably computed on the validation set -- confirm against
# fastai's `get_preds` default.
preds, targets = learn.get_preds()
# Predicted class = position of the largest value along axis 1 of `preds`
# (assumes `preds` is (n_samples, n_classes) -- confirm).
predictions = np.argmax(preds, axis=1)
# Cross-tabulate predicted against true classes (a confusion matrix).
pd.crosstab(predictions, targets)

---