# Source Code

In [1]:
# Reload edited modules automatically before each cell runs, and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *
import pandas as pd

In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings (deprecations, notices emitted by fastai) — confirm that is intended.
warnings.filterwarnings('ignore')

## Preparing fastai storage locations

In [4]:
# Directory intended to hold the checkpoints saved further down in this notebook.
model_path = Path("/tf/data/models/")

## Language model Pretraining

Note that language models can use a lot of GPU memory, so you may need to decrease the batch size here.

In [None]:
# Batch size used when building and loading the language-model DataBunch.
bs=128

Now let's grab the full dataset for what follows.

In [None]:
# Root folder of the Java dataset; the cell displays its contents.
path = Path("/tf/data/datasets/raw/raw_java/data00m_god-r")
path.ls()

In [None]:
# Build the language-model DataBunch from the .java files found under `path`.
data_lm = (TextList.from_folder(path, extensions={".java"},
                                processor = [OpenFileProcessor(),
                                             SPProcessor(lang="en")])
           #Inputs: all the .java files in path, run through OpenFileProcessor and then SPProcessor
            .filter_by_folder(include=['sm_train', 'sm_valid', 'sm_test']) 
           #Other folders under path may also contain .java files, so we only keep these three
            .split_by_folder(valid='sm_valid', train='sm_train')
           #Split by folder: sm_train is the training set and sm_valid the validation set
            .label_for_lm()           
           #We want to do a language model so we label accordingly
            .databunch(bs=bs))
# Cache the processed DataBunch so later sessions can reload it with load_data instead of rebuilding it.
data_lm.save('data_lm.pkl')

We have to use a special kind of `TextDataBunch` for the language model, that ignores the labels (that's why we put 0 everywhere), will shuffle the texts at each epoch before concatenating them all together (only for training, we don't shuffle for the validation set) and will send batches that read that text in order with targets that are the next word in the sentence.

Because the previous cell takes a while to run, we can quickly reload the saved, already-processed data with the following cell instead.

In [None]:
# Reload the cached language-model DataBunch from `path` with the current batch size.
data_lm = load_data(path, 'data_lm.pkl', bs=bs)

In [None]:
# Show a sample batch; the cell's value is a tuple whose last two entries are the training and validation set sizes.
data_lm.show_batch(), len(data_lm.train_ds), len(data_lm.valid_ds)

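We can then put this in a learner object very easily. When pretrained weights are available for the chosen architecture, they are downloaded the first time you execute the following line and stored in `~/.fastai/models/` (or elsewhere if you specified different paths in your config file). Check whether fastai actually publishes pretrained weights for `TransformerXL`; if it does not, the model below is trained from scratch.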

In [None]:
# Create the language-model learner with a TransformerXL architecture and dropout scaled by 0.3.
# NOTE(review): no `pretrained` argument is passed; confirm whether fastai has pretrained weights for TransformerXL
# or whether this silently starts from random weights (warnings are suppressed in this notebook).
learn = language_model_learner(data_lm, TransformerXL, drop_mult=0.3)

In [None]:
# Run the learning-rate finder on the language model.
learn.lr_find()

In [None]:
# Plot the learning-rate finder results recorded by the previous cell.
learn.recorder.plot()

In [None]:
# Set hyperparameters
max_lr = 5e-2
moms = (0.5, 0.75)
pct_strt = 0.02

In [None]:
# One epoch with the hyperparameters defined above.
learn.fit_one_cycle(1, max_lr, moms=moms, pct_start = pct_strt)

In [None]:
# Five more epochs with the same hyperparameters.
learn.fit_one_cycle(5, max_lr, moms=moms, pct_start = pct_strt)

In [None]:
# Plot the losses recorded during the most recent training run.
learn.recorder.plot_losses()

In [None]:
learn.save(model_path, 'fit_head')

In [None]:
learn.load(model_path, 'fit_head');

To complete the fine-tuning, we can then gradually unfreeze the model and launch a new training.

In [None]:
# Make only the last two layer groups trainable.
# Learner.unfreeze() accepts no argument (it unfreezes everything); partial unfreezing is done with freeze_to.
learn.freeze_to(-2)

In [None]:
# Commented-out alternative schedule:
# learn.fit_one_cycle(10, 5e-4, moms=(0.8,0.7), pct_start = 0.02)
# One epoch with a learning-rate slice from 1e-2/2.6**4 up to 1e-2.
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))

In [None]:
# Checkpoint the model as <model_path>/first.pth.
# Learner.save takes a single file argument (a second positional argument is `return_path`),
# so the directory and the name are joined into one absolute path.
model_path.mkdir(parents=True, exist_ok=True)  # torch cannot write into a missing directory
learn.save(model_path/'first')

In [None]:
# Make the last three layer groups trainable, then train one epoch with a learning-rate slice up to 5e-3.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

In [None]:
# Checkpoint under the learner's default model directory.
# NOTE(review): the classifier section later also saves a checkpoint named 'third' — confirm the two do not overwrite each other.
learn.save('third')

In [None]:
# Unfreeze the whole model and train two epochs with a learning-rate slice up to 1e-3.
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))

In [None]:
# Save the fine-tuned language model under the learner's default model directory,
# which is where the later `learn.load('fine_tuned')` cell looks for it.
# (Learner.save takes a single file argument; a second positional argument is `return_path`, not a file name.)
learn.save('fine_tuned')

In [None]:
! curl -X POST -H 'Content-type: application/json' --data '{"text":"from: semeru tower 2\nstatus: finished training TransformerXL"}' https://hooks.slack.com/services/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

In [None]:
# Plot the metrics recorded during training.
# NOTE(review): no `metrics` argument is passed when the learner is created above — confirm there is something to plot.
learn.recorder.plot_metrics()

How good is our model? Well let's try to see what it predicts after a few given words.

In [None]:
# Reload the 'fine_tuned' checkpoint from the learner's default model directory (trailing `;` hides the returned learner).
learn.load('fine_tuned');

In [None]:
# Settings for the text-generation check below.
TEXT = "public String get"  # prompt passed to learn.predict
N_WORDS = 40                # second argument passed to learn.predict
N_SENTENCES = 2             # how many independent generations to print

In [None]:
# Generate N_SENTENCES continuations of TEXT and print them one per line.
generated = [learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)]
print("\n".join(generated))

We have to save not only the model, but also its encoder, the part that's responsible for creating and updating the hidden state. For the next part, we don't care about the part that tries to guess the next word.

In [None]:
# Save only the encoder as 'fine_tuned_enc'; the classifier section loads it by the same name.
learn.save_encoder('fine_tuned_enc')

## Classifier

Now, we'll create a new data object that only grabs the labelled data and keeps those labels. Again, this line takes a bit of time.

In [None]:
# Use the same Java dataset root as the language model.
# The classifier DataBunch built in the next cell reads .java files from the sm_train/sm_valid folders
# and is saved to (and later reloaded from) this directory; the IMDB download that used to be here
# pointed `path` at an unrelated dataset containing none of those files.
path = Path("/tf/data/datasets/raw/raw_java/data00m_god-r")

In [None]:
# Build the classifier DataBunch from the .java files under `path`, reusing the language model's vocabulary.
data_clas = (TextList.from_folder(path, vocab=data_lm.vocab, extensions={".java"},
                                processor = [OpenFileProcessor(),
                                             SPProcessor(lang="en")])
             .filter_by_folder(include=['sm_train', 'sm_valid'])
             #keep only the files under sm_train and sm_valid
             .split_by_folder(train='sm_train', valid='sm_valid')
             #split by folder: sm_train is the training set, sm_valid the validation set
             .label_from_folder(classes=['before', 'after'])
             #label each file from its folder, with the classes given explicitly as 'before' and 'after'
             .databunch(bs=bs))

# Cache the processed DataBunch so it can be reloaded with load_data.
data_clas.save('data_clas.pkl')

In [10]:
# Point `path` at the Java dataset root and list its contents.
path = Path("/tf/data/datasets/raw/raw_java/data00m_god-r")
path.ls()

[PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/data_clas.pkl'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/data_lm.pkl'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/tmp'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_test'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_train'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/train'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/models'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/tmp.sh'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/valid'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_valid'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/rename.sh'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/test')]

In [5]:
# Batch size for the classifier (half of the 128 used for the language model).
bs = 64

In [None]:
# Development-time introspection: shows the source of ItemList. It has no effect on the rest of the notebook.
ItemList??

In [None]:
# Alternative classifier DataBunch built from a CSV with 'code' and 'label' columns; this rebinds `data_clas`.
# NOTE(review): unlike the folder-based pipeline above, no `vocab` or `bs` is passed here, so it does not reuse
# the language model's vocabulary — confirm whether this cell is still meant to be run.
data_clas = TextClasDataBunch.from_csv(path, 'security-training.csv',
                                       text_cols = 'code', label_cols = 'label')

In [None]:
# Cache whichever `data_clas` is currently bound under the same file name used by the folder-based pipeline above.
data_clas.save('data_clas.pkl')

In [11]:
# Reload the cached classifier DataBunch from `path` with the current batch size.
data_clas = load_data(path, 'data_clas.pkl', bs=bs)

In [12]:
# Display a few tokenized examples together with their target labels.
data_clas.show_batch()

text,target
"▁xxbos ▁@ java . lang . suppress warnings ( value ▁= ▁"" unchecked "") ▁private ▁void ▁initcomponents () ▁{ ▁ jframe help dialog ▁= ▁new ▁javax . swing . jframe (); ▁jpanel help dialog ▁= ▁new ▁javax . swing . jpanel (); ▁jscrollpane 8 ▁= ▁new ▁javax . swing . jscrollpane (); ▁ jtextarea help file ▁= ▁new ▁javax . swing . jtextarea (); ▁jpanel 1 ▁= ▁new ▁javax .",before
"▁xxbos ▁private ▁static ▁void ▁init () ▁{ ▁ timezone mapper . poly [ 18 00 ] ▁= ▁new ▁ timezone mapper . tz polygon ( 35 . 499 57 3 f , ▁xxup ▁61 . 2 590 2 f , ▁xxup ▁35 . 52 172 f , ▁xxup ▁61 . 27 1 64 f , ▁xxup ▁35 . 61 3 167 f , ▁xxup ▁61 . 276 638 f ,",after
"▁xxbos ▁@ java . lang . override ▁public ▁boolean ▁oncommand ( final ▁org . bukkit . command . commandsender ▁sender , ▁final ▁org . bukkit . command . command ▁command , ▁final ▁java . lang . string ▁label , ▁final ▁java . lang . string [] ▁split ) ▁{ ▁if ▁(!( sender ▁instanceof ▁org . bukkit . entity . player )) ▁{ ▁return ▁false ; ▁} ▁final ▁org . bukkit",after
"▁xxbos ▁@ java . lang . override ▁public ▁java . lang . string ▁ getdestination ( java . lang . string ▁function , ▁com . strate lia . webactiv . kmelia . control . kmelia session controller ▁ kmelia , ▁org . silverpeas . servlet . httprequest ▁request ) ▁{ ▁com . strate lia . silverpeas . silver trace . silver trace . info ("" kmelia "", ▁"" kmelia request",before
"▁xxbos ▁@ java . lang . override ▁protected ▁void ▁build ns d _ r ( lu . fisch . structorizer . parsers . reduction ▁_ reduction , ▁ lu . fisch . structorizer . elements . sub queue ▁_ parentnode ) ▁{ ▁if ▁(( _ reduction . size ()) ▁> ▁0) ▁{ ▁java . lang . string ▁rule ▁= ▁_ reduction . getparent (). tostring (); ▁java . lang .",before


In [13]:
# Number of items in the classifier training set.
len(data_clas.train_ds)

50000

We can then create a model to classify those Java code snippets and load the encoder we saved before.

In [14]:
# Create the classifier learner (this rebinds `learn`, replacing the language-model learner)
# and load the encoder saved earlier as 'fine_tuned_enc'.
learn = text_classifier_learner(data_clas, TransformerXL, drop_mult=0.5)
learn.load_encoder('fine_tuned_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (50000 items)
x: TextList
▁xxbos ▁public ▁void ▁return results ( java . util . arraylist < com . example . pi tur . new track om m ender . trackinfo > ▁query results ) ▁{ ▁if ▁( query results ▁!= ▁null ) ▁{ ▁this . query results ▁= ▁query results ; ▁adapter ▁= ▁new ▁com . example . pi tur . new track om m ender . trackinfo adapter ( this , ▁xxmaj ▁r . layout . list _ item , ▁query results ); ▁set listadapter ( adapter ); ▁set item clicklistener s ( this . get listview ()); ▁} else ▁{ ▁android . widget . toast . maketext ( this , ▁" your ▁query ▁g ave ▁no ▁results . ", ▁toast . length _ long ) . show (); ▁} ▁},▁xxbos ▁public ▁static ▁synchronized ▁jsonobject ▁ handling exchange ( java . util . arraylist < java . lang . string > ▁server list , ▁java . util . arraylist < java . lang . string > ▁server list _ exchange , ▁java . util . arraylist < java . lang . string > ▁hostname list _ exchange , ▁java . util . arraylist < java . lang . 

In [None]:
# Run the learning-rate finder on the classifier.
learn.lr_find()

In [None]:
# Plot the learning-rate finder results recorded by the previous cell.
learn.recorder.plot()

In [None]:
# Five epochs at a maximum learning rate of 2e-2.
learn.fit_one_cycle(5, 2e-2, moms=(0.8,0.7))

In [None]:
# Checkpoint the classifier as 'first' under the learner's default model directory.
learn.save('first')

In [15]:
# Reload the 'first' checkpoint (trailing `;` hides the returned learner).
learn.load('first');

In [16]:
# Make only the last two layer groups trainable.
learn.freeze_to(-2)


In [17]:
# Run the learning-rate finder again for the partially unfrozen classifier.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error — a smaller `bs` may be needed; confirm.
learn.lr_find()

RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 23.65 GiB total capacity; 20.35 GiB already allocated; 30.56 MiB free; 1.33 GiB cached)

In [None]:
# Plot the learning-rate finder results recorded by the previous cell.
learn.recorder.plot()

In [None]:
# One epoch with a learning-rate slice from 1e-2/2.6**4 up to 1e-2.
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))

In [None]:
# Checkpoint the classifier as 'second'.
learn.save('second')

In [None]:
# Reload the 'second' checkpoint (trailing `;` hides the returned learner).
learn.load('second');

In [None]:
# Make the last three layer groups trainable, then train one epoch with a learning-rate slice up to 5e-3.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

In [None]:
# Checkpoint the classifier as 'third'.
# NOTE(review): the language-model section also saves a checkpoint named 'third' — confirm they do not share a directory.
learn.save('third')

In [None]:
# Reload the 'third' checkpoint (trailing `;` hides the returned learner).
learn.load('third');

In [None]:
# Unfreeze the whole classifier and train two epochs with a learning-rate slice up to 1e-3.
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))

In [None]:
# Sanity-check the classifier on a sample from its own domain: it was trained on Java methods
# labelled 'before'/'after', so an English movie review would not be a meaningful input.
learn.predict("public String getName() { return this.name; }")