# Vietnamese ULMFiT from scratch

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *
# Make CUDA device index 1 the current device for this kernel.
# NOTE(review): the index is hard-coded — adjust it for the machine you run on.
torch.cuda.set_device(1)

In [2]:
# Batch size used for every DataBunch below; the learning rates are later
# rescaled by bs/48, so 48 is the reference value.
bs=48
# Alternative batch sizes (disabled):
# bs=24
#bs=128

In [3]:
# Same call as at the end of the first cell: selects CUDA device index 1 again.
torch.cuda.set_device(1)

In [4]:
# Root data folder from the fastai config; the directory listings recorded in
# this notebook show the language folder living directly under it.
data_path = Config.data_path()

This will create a `viwiki` folder, containing a `viwiki` text file with the Wikipedia contents. (For other languages, replace `vi` with the appropriate Wikipedia language code from the [list of wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).)

In [5]:
# Wikipedia language code; every folder and file name below is derived from it.
lang = 'vi'
# lang = 'zh'

In [6]:
# `name` is both the working-folder name and the name of the extracted text file.
name = f'{lang}wiki'
# Working folder for this language, created (with parents) if missing.
path = data_path/name
path.mkdir(exist_ok=True, parents=True)
# Base file names (no extension) for the pretrained LM weights and its vocab;
# used when saving the model and again as `pretrained_fnames` when fine-tuning.
lm_fns = [f'{lang}_wt', f'{lang}_wt_vocab']

## Vietnamese wikipedia model

### Download data

In [23]:
get_wiki??

In [26]:
!ls -alh /home/molly/.fastai/data/viwiki/

total 6.6G
drwxrwxr-x 1 molly molly  188 Jul 21 03:37 .
drwxrwxr-x 1 molly molly 2.1K Jul 21 01:57 ..
drwxrwxr-x 1 molly molly 4.2M Jul 21 03:43 docs
-rw-rw-r-- 1 molly molly    0 Jul 21 03:20 log
-rw-rw-r-- 1 molly molly 690M Jul 21 03:37 viwiki
-rw-rw-r-- 1 molly molly 5.2G Jul 21 02:03 viwiki-latest-pages-articles.xml
-rw-rw-r-- 1 molly molly 771M Jul 21 02:00 viwiki-latest-pages-articles.xml.bz2
drwxrwxr-x 1 molly molly  166 Jul 21 03:20 wikiextractor


In [24]:
# Show the current contents of the working folder.
path.ls()

[PosixPath('/home/molly/.fastai/data/viwiki/viwiki-latest-pages-articles.xml.bz2'),
 PosixPath('/home/molly/.fastai/data/viwiki/viwiki-latest-pages-articles.xml'),
 PosixPath('/home/molly/.fastai/data/viwiki/wikiextractor'),
 PosixPath('/home/molly/.fastai/data/viwiki/log'),
 PosixPath('/home/molly/.fastai/data/viwiki/viwiki'),
 PosixPath('/home/molly/.fastai/data/viwiki/docs')]

In [9]:
from nlputils import split_wiki,get_wiki

In [10]:
# CUDA version reported by the installed PyTorch build.
torch.version.cuda

'10.1'

In [11]:
# Fetch and extract the Wikipedia dump for `lang` into `path`. As the recorded
# output shows, nothing is downloaded when the extracted file already exists.
get_wiki(path,lang)

/home/molly/.fastai/data/viwiki/viwiki already exists; not downloading


In [12]:
# Contents of the working folder after `get_wiki`.
path.ls()

[PosixPath('/home/molly/.fastai/data/viwiki/viwiki-latest-pages-articles.xml.bz2'),
 PosixPath('/home/molly/.fastai/data/viwiki/viwiki-latest-pages-articles.xml'),
 PosixPath('/home/molly/.fastai/data/viwiki/wikiextractor'),
 PosixPath('/home/molly/.fastai/data/viwiki/log'),
 PosixPath('/home/molly/.fastai/data/viwiki/viwiki'),
 PosixPath('/home/molly/.fastai/data/viwiki/docs')]

In [13]:
# Peek at the first four lines of the extracted text file.
!head -n4 {path}/{name}

<doc id="13" url="https://vi.wikipedia.org/wiki?curid=13" title="Tiếng Việt">
Tiếng Việt

Tiếng Việt (chữ Nôm: 㗂越), cũng gọi là tiếng Việt Nam (㗂越南), tiếng Kinh (㗂京) hay Việt ngữ (chữ Hán: 越語) là ngôn ngữ của người Việt và là ngôn ngữ chính thức tại Việt Nam. Đây là tiếng mẹ đẻ của khoảng 85% dân cư Việt Nam cùng với hơn 4 triệu Việt kiều. Tiếng Việt còn là ngôn ngữ thứ hai của các dân tộc thiểu số tại Việt Nam và là ngôn ngữ dân tộc thiểu số tại Cộng hòa Séc.


`split_wiki` splits the single Wikipedia file into a separate file per article. This is often easier to work with.

In [14]:
# `dest` is the folder of per-article files returned by `split_wiki`; per the
# recorded output it is `{path}/docs`, and splitting is skipped if it exists.
dest = split_wiki(path,lang)

/home/molly/.fastai/data/viwiki/docs already exists; not splitting


In [15]:
# First five per-article files.
dest.ls()[:5]

[PosixPath('/home/molly/.fastai/data/viwiki/docs/Tiếng Việt.txt'),
 PosixPath('/home/molly/.fastai/data/viwiki/docs/Ohio.txt'),
 PosixPath('/home/molly/.fastai/data/viwiki/docs/California.txt'),
 PosixPath('/home/molly/.fastai/data/viwiki/docs/Thụy Điển.txt'),
 PosixPath('/home/molly/.fastai/data/viwiki/docs/Thành phố Hồ Chí Minh.txt')]

In [16]:
# Use this to convert Chinese traditional to simplified characters
# ls *.txt | parallel -I% opencc -i % -o ../zhsdocs/% -c t2s.json

### Create pretrained model

In [17]:
# Build the language-model DataBunch from the per-article files in `dest`,
# holding out a random 10% (fixed seed) for validation.
wiki_texts = TextList.from_folder(dest)
wiki_split = wiki_texts.split_by_rand_pct(0.1, seed=42)
data = wiki_split.label_for_lm().databunch(bs=bs, num_workers=1)

# Persist the processed data, then report (vocab size, number of training documents).
data.save(f'{lang}_databunch')
len(data.vocab.itos),len(data.train_ds)

(60000, 77902)

In [18]:
# Contents of the working folder after building and saving the DataBunch.
path.ls()

[PosixPath('/home/molly/.fastai/data/viwiki/viwiki-latest-pages-articles.xml.bz2'),
 PosixPath('/home/molly/.fastai/data/viwiki/viwiki-latest-pages-articles.xml'),
 PosixPath('/home/molly/.fastai/data/viwiki/wikiextractor'),
 PosixPath('/home/molly/.fastai/data/viwiki/log'),
 PosixPath('/home/molly/.fastai/data/viwiki/viwiki'),
 PosixPath('/home/molly/.fastai/data/viwiki/docs')]

In [19]:
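#data = load_data(path, f'{lang}_databunch', bs=bs)
# NOTE(review): `data.save` above writes relative to `data.path`; for a TextList built with
# `from_folder(dest)` that is presumably `dest`, not `path` (the `path.ls()` output above shows
# no databunch file) — confirm the folder before uncommenting.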

In [20]:
# AWD_LSTM language-model learner trained from scratch (`pretrained=False`),
# switched to mixed precision with `to_fp16()`.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()

In [21]:
# 1e-2 at the reference batch size of 48, scaled linearly with the actual `bs`.
lr = 1e-2 * (bs/48)

In [22]:
# Train all layers with a 10-epoch one-cycle schedule.
# NOTE: in the recorded run each epoch took about 1h12m and the run was
# interrupted (KeyboardInterrupt) after epoch 8.
learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.390507,3.413331,0.381884,1:12:54
1,3.528426,3.47103,0.373764,1:12:54
2,3.466977,3.466276,0.374455,1:12:54
3,3.404703,3.43546,0.377115,1:12:22
4,3.444111,3.392356,0.381717,1:12:33
5,3.397098,3.332469,0.389263,1:12:39
6,3.329429,3.248087,0.398962,1:12:36
7,3.254677,3.156741,0.411036,1:12:28
8,3.089688,3.074162,0.421645,1:12:42


KeyboardInterrupt: 

Save the pretrained model and vocab:

In [None]:
# Write the weights and vocab into <path>/models under the names in `lm_fns`,
# so they can later be referenced through `pretrained_fnames`.
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
weights_fn, vocab_fn = lm_fns
# Convert back to full precision before saving; optimizer state is not stored.
learn.to_fp32().save(mdl_path/weights_fn, with_opt=False)
learn.data.vocab.save(mdl_path/f'{vocab_fn}.pkl')

## Vietnamese sentiment analysis

### Language model

- [Data](https://github.com/ngxbac/aivivn_phanloaisacthaibinhluan/tree/master/data)
- [Competition details](https://www.aivivn.com/contests/1)
- Top 3 f1 scores: 0.900, 0.897, 0.897

In [None]:
# Load the labelled training comments; missing comments become the literal string 'NA'.
train_df = pd.read_csv(path/'train.csv')
train_df['comment'] = train_df.comment.fillna('NA')
train_df.head()

In [None]:
# Load the test comments; missing comments become the literal string 'NA'.
test_df = pd.read_csv(path/'test.csv')
test_df['comment'] = test_df.comment.fillna('NA')
test_df.head()

In [None]:
# Stack train and test rows into one frame; it is used below only for
# language-model fine-tuning, which reads the `comment` column.
df = pd.concat([train_df,test_df], sort=False)

In [None]:
# Language-model DataBunch over the `comment` column of the combined frame,
# with a random 10% validation split (fixed seed).
data_lm = (TextList.from_df(df, path, cols='comment')
    .split_by_rand_pct(0.1, seed=42)
    .label_for_lm()           
    .databunch(bs=bs, num_workers=1))

In [None]:
# Start from the Wikipedia model: `pretrained_fnames=lm_fns` names the weights
# and vocab files saved earlier in this notebook.
learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0)

In [None]:
# 1e-3 at the reference batch size of 48, scaled linearly with the actual `bs`.
lr = 1e-3 * (bs/48)

In [None]:
# Two warm-up epochs at 10x the base rate before `unfreeze()` is called in the
# next cell (presumably only the last layer group trains here — confirm).
learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))

In [None]:
# Unfreeze every layer and fine-tune for 8 epochs at the base rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))

In [None]:
# Save the full fine-tuned language model, plus its encoder on its own; the
# encoder file is what the classifier loads via `load_encoder` below.
learn_lm.save(f'{lang}fine_tuned')
learn_lm.save_encoder(f'{lang}fine_tuned_enc')

### Classifier

In [None]:
# Classifier DataBunch from the labelled training frame only. It reuses the
# language model's vocab (`vocab=data_lm.vocab`) and takes labels from the
# `label` column; 10% random validation split with a fixed seed.
data_clas = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
    .split_by_rand_pct(0.1, seed=42)
    .label_from_df(cols='label')
    .databunch(bs=bs, num_workers=1))

# Persist the processed classifier data for later reloading with `load_data`.
data_clas.save(f'{lang}_textlist_class')

In [None]:
#data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)

In [None]:
from sklearn.metrics import f1_score

@np_func
def f1(inp,targ):
    "F1 score of the arg-max class predictions in `inp` against the targets `targ`."
    # The predicted class is the index of the largest score along the last axis.
    predicted = np.argmax(inp, axis=-1)
    return f1_score(targ, predicted)

In [None]:
# AWD_LSTM classifier in mixed precision, reporting accuracy and F1.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
# Initialise the encoder from the fine-tuned language model saved above.
learn_c.load_encoder(f'{lang}fine_tuned_enc')
# Start with the model frozen; layer groups are unfrozen in stages in the cells below.
learn_c.freeze()

In [None]:
# 2e-2 at the reference batch size of 48, scaled linearly with the actual `bs`.
lr = 2e-2 * (bs/48)

In [None]:
# First two epochs, run right after `learn_c.freeze()`.
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

In [None]:
# A second, identical two-epoch cycle, still before any `freeze_to`/`unfreeze` call.
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

In [None]:
# Unfreeze the last two layer groups; learning rates range from lr/2.6**4 for
# the earliest trainable group up to lr for the last one.
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

In [None]:
# Unfreeze the last three layer groups and halve the learning-rate range.
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

In [None]:
# Unfreeze everything for one final epoch at a tenth of the original range.
learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

In [None]:
# Save the trained classifier; it is reloaded under this name in the Ensemble section.
learn_c.save(f'{lang}clas')

Competition top 3 f1 scores: 0.900, 0.897, 0.897. Winner used an ensemble of 4 models: TextCNN, VDCNN, HARNN, and SARNN.

## Ensemble

In [None]:
# Optionally reload the saved classifier data instead of rebuilding it:
#data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)
# Recreate the forward classifier and load the weights saved as `{lang}clas`.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
# The trailing `;` suppresses the cell's output.
learn_c.load(f'{lang}clas', purge=False);

In [None]:
# Predictions and targets from the forward model. `ordered=True` asks for them
# in the dataset's original order (presumably the validation set, the
# `get_preds` default — confirm).
preds,targs = learn_c.get_preds(ordered=True)
accuracy(preds,targs),f1(preds,targs)

In [None]:
# `data_clas_bwd` is not created anywhere in this notebook, so it has to be
# loaded here; without this line the cell raises NameError on a fresh kernel.
# Both the saved DataBunch and the `{lang}clas_bwd` weights are expected to
# exist under `path` already (presumably produced by a separate
# backwards-model run — confirm).
data_clas_bwd = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)
# Recreate the backwards classifier and load its saved weights.
learn_c_bwd = text_classifier_learner(data_clas_bwd, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c_bwd.load(f'{lang}clas_bwd', purge=False);

In [None]:
# Predictions and targets from the backwards model, again in original order so
# they can be combined row by row with the forward predictions.
preds_b,targs_b = learn_c_bwd.get_preds(ordered=True)
accuracy(preds_b,targs_b),f1(preds_b,targs_b)

In [None]:
# Ensemble by averaging the two models' predictions element-wise.
# Assumes both were computed over the same items in the same order — TODO confirm.
preds_avg = (preds+preds_b)/2

In [None]:
# Accuracy and F1 of the averaged predictions, scored against the backwards model's targets.
accuracy(preds_avg,targs_b),f1(preds_avg,targs_b)