<a href="https://colab.research.google.com/github/parvathysarat/pretrained-language-models/blob/master/ULMFiT_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

In [None]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io, os

## Loading the 20 Newsgroups dataset

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups corpus in shuffled order (fixed seed), asking the
# loader to remove headers, footers and quotes from every post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
# Raw post texts.
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Build a two-column frame: the integer newsgroup label and the post text.
df = pd.DataFrame({'label': dataset.target, 'text': dataset.data})
# For demonstration of the classification task, keep only 2 of the 20 newsgroups
# (labels 1 and 10), then renumber the remaining rows 0..n-1.
# NOTE(review): labels 1 and 10 presumably map to comp.graphics and
# rec.sport.hockey - confirm against dataset.target_names.
df = df[df['label'].isin([1, 10])].reset_index(drop=True)
# Show how many posts remain per class.
df['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

### Preprocessing

In [None]:
# Replace every character that is not an ASCII letter with a space.
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this step into a silent no-op.
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ", regex=True)
# Split on whitespace and re-join with single spaces, which collapses runs of
# blanks and trims leading/trailing whitespace.
tokenized_doc = df['text'].str.split()
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['text'] = detokenized_doc

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for validation; stratifying on the label keeps the
# class proportions the same in both parts. The seed makes the split repeatable.
df_trn, df_val = train_test_split(
    df,
    test_size=0.4,
    stratify=df['label'],
    random_state=12,
)


### Preparing data for the language model and for the classification model separately

In [None]:
# Data for fine-tuning the language model.
data_lm = TextLMDataBunch.from_df(path="", train_df=df_trn, valid_df=df_val)

# Data for the classifier, built with the vocabulary of the language-model data
# and a batch size of 32.
data_clas = TextClasDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
    vocab=data_lm.train_ds.vocab,
    bs=32,
)

## Fine-tuning pre-trained LM and saving encoder

In [None]:
# Create a language-model learner on the pretrained AWD-LSTM architecture.
learn = language_model_learner(
    data_lm,
    arch=AWD_LSTM,
    pretrained=True,
    drop_mult=0.7,
)
# Run a single one-cycle epoch with learning rate 0.01.
learn.fit_one_cycle(1, 1e-2)
# Save the encoder under the name 'ft_enc' so the classifier can load it.
learn.save_encoder('ft_enc')


epoch,train_loss,valid_loss,accuracy,time
0,5.017286,4.353529,0.250614,00:13


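Accuracy here is the fraction of positions at which the language model predicts the next word correctly, i.e. its top choice from the vocabulary matches the actual next word. Each prediction can be viewed as one Bernoulli trial, so the number of trials equals the number of words in the corpus.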

## Build classifier with fine-tuned encoder, making predictions

In [None]:
# Create the text classifier on the AWD-LSTM architecture.
learn = text_classifier_learner(
    data_clas,
    arch=AWD_LSTM,
    drop_mult=0.7,
)
# Load the encoder that was saved as 'ft_enc'.
learn.load_encoder('ft_enc')
# Run a single one-cycle epoch with learning rate 0.01.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.415381,0.248114,0.915612,00:41


In [None]:
# Collect the model outputs together with the matching targets.
preds, targets = learn.get_preds()

# The predicted class is the index of the largest score in each row.
predictions = np.argmax(preds, axis=1)
# Cross-tabulate predicted classes (rows) against targets (columns).
pd.crosstab(index=predictions, columns=targets)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,226,32
1,8,208
