## IMDb

At Fast.ai we have introduced a new module called fastai.text which replaces the torchtext library that was used in our 2018 dl1 course. The fastai.text module also supersedes the fastai.nlp library but retains many of the key functions.

In [1]:
from fastai.text import *
from fastai import *
import html
import sklearn.model_selection

# The Fastai.text module introduces several custom tokens.

We need to download the IMDB large movie reviews from this site: http://ai.stanford.edu/~amaas/data/sentiment/
Direct link : [Link](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) and untar it into the PATH location. We use pathlib which makes directory traveral a breeze.

In [2]:
DATA_PATH=Path('/data/aclImdb')
DATA_PATH.mkdir(exist_ok=True)

In [None]:
!curl -O http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 
    

In [None]:
!tar -xzf aclImdb_v1.tar.gz -C {DATA_PATH}

In [3]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

PATH=Path('/data/aclImdb/aclImdb/')
CLAS_PATH=Path('/data/aclImdb/imdb_clas/')
CLAS_PATH.mkdir(exist_ok=True)

LM_PATH=Path('/data/aclImdb/imdb_lm/')
LM_PATH.mkdir(exist_ok=True)

## Standardize format

The imdb dataset has 3 classes. positive, negative and unsupervised(sentiment is unknown). 
There are 75k training reviews(12.5k pos, 12.5k neg, 50k unsup)
There are 25k validation reviews(12.5k pos, 12.5k neg & no unsup)

Refer to the README file in the imdb corpus for further information about the dataset.

In [6]:
CLASSES = ['neg', 'pos', 'unsup']

def get_texts(path):
    texts,labels = [],[]
    for idx,label in enumerate(CLASSES):
        for fname in (path/label).glob('*.*'):
            texts.append(fname.open('r', encoding='utf-8').read())
            labels.append(idx)
    return np.array(texts),np.array(labels)
trn_texts,trn_labels = get_texts(PATH/'train')
val_texts,val_labels = get_texts(PATH/'test')

In [7]:
len(trn_texts),len(val_texts)

(75000, 25000)

In [10]:
c = np.concatenate([trn_texts,val_texts])


In [12]:
trn_texts,val_texts = sklearn.model_selection.train_test_split(c, test_size=0.1)

In [13]:
len(trn_texts), len(val_texts)


(90000, 10000)

In [16]:
col_names = ['labels','text']


In [8]:
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_texts))
val_idx = np.random.permutation(len(val_texts))

In [9]:
trn_texts = trn_texts[trn_idx]
val_texts = val_texts[val_idx]

trn_labels = trn_labels[trn_idx]
val_labels = val_labels[val_idx]

In [10]:
df_trn = pd.DataFrame({'text':trn_texts, 'labels':trn_labels}, columns=col_names)
df_val = pd.DataFrame({'text':val_texts, 'labels':val_labels}, columns=col_names)

In [11]:
df_trn[df_trn['labels']!=2].to_csv(CLAS_PATH/'train.csv', header=False, index=False)
df_val.to_csv(CLAS_PATH/'test.csv', header=False, index=False)

(CLAS_PATH/'classes.txt').open('w', encoding='utf-8').writelines(f'{o}\n' for o in CLASSES)

In [None]:
trn_texts,val_texts = sklearn.model_selection.train_test_split(
    np.concatenate([trn_texts,val_texts]), test_size=0.1)

In [17]:
df_trn = pd.DataFrame({'text':trn_texts, 'labels':[0]*len(trn_texts)}, columns=col_names)
df_val = pd.DataFrame({'text':val_texts, 'labels':[0]*len(val_texts)}, columns=col_names)

df_trn.to_csv(LM_PATH/'train.csv', header=False, index=False)
df_val.to_csv(LM_PATH/'test.csv', header=False, index=False)

# Language model tokens

In [18]:
chunksize=24000


In [21]:
re1  =  re.compile(r'  +')

def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [38]:
def get_texts(df, n_lbls=1):
    labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
    texts = list(texts.apply(fixup).values)

    tok = Tokenizer().process_all(texts)
    return tok, list(labels)

In [39]:
def get_all(df, n_lbls):
    tok, labels = [], []
    for i, r in enumerate(df):
        print(i)
        tok_, labels_ = get_texts(r, n_lbls)
        tok += tok_;
        labels += labels_
    return tok, labels

In [40]:
df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'test.csv', header=None, chunksize=chunksize)

In [41]:
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

0
1
2
3
0


In [42]:
(LM_PATH/'tmp').mkdir(exist_ok=True)


In [43]:
np.save(LM_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)

In [44]:
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy')
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy')

In [45]:
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

[('the', 1208705),
 ('.', 993432),
 (',', 986386),
 ('and', 587843),
 ('a', 584387),
 ('of', 524939),
 ('to', 485913),
 ('is', 393599),
 ('it', 341989),
 ('in', 337877),
 ('i', 308544),
 ('this', 270905),
 ('that', 261620),
 ('"', 236786),
 ("'s", 221873),
 ('-', 188238),
 ('was', 180380),
 ('\n\n', 179210),
 ('as', 166434),
 ('with', 159366),
 ('for', 158902),
 ('movie', 157971),
 ('but', 150466),
 ('film', 144229),
 ('you', 124580)]

In [46]:
max_vocab = 60000
min_freq = 2

In [47]:
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [48]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

60002

In [49]:
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [50]:
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)
pickle.dump(itos, open(LM_PATH/'tmp'/'itos.pkl', 'wb'))

In [5]:
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))

FileNotFoundError: [Errno 2] No such file or directory: '/data/aclImdb/imdb_lm/tmp/trn_ids.npy'

In [14]:
vs=len(itos)
vs,len(trn_lm)

(60002, 90000)

# WT 103 Dl

In [11]:
def download_wt103_model():
    model_path = PATH/'models'
    os.makedirs(model_path, exist_ok=True)
    download_url('http://files.fast.ai/models/wt103_v1/lstm_wt103.pth', model_path/'lstm_wt103.pth')
    download_url('http://files.fast.ai/models/wt103_v1/itos_wt103.pkl', model_path/'itos_wt103.pkl')

In [None]:
download_wt103_model()

# Language Model

In [4]:
wd=1e-7
bptt=70
bs=14
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [5]:
data_lm = text_data_from_csv(Path(LM_PATH), data_func=lm_data, valid='test', bs=bs)


In [6]:
learn = RNNLearner.language_model(data_lm, pretrained_fnames=['lstm_wt103', 'itos_wt103'])

In [7]:
learn.metrics= [accuracy]
learn.unfreeze()
learn.fit(2, slice(1e-4,1e-2))

VBox(children=(HBox(children=(IntProgress(value=0, max=2), HTML(value='0.00% [0/2 00:00<00:00]'))), HTML(value…

Total time: 11:22:51
epoch  train loss  valid loss  accuracy
0      4.767652    4.575449    0.261003  (5:40:52)
1      4.674298    4.506974    0.268821  (5:41:59)



In [8]:
learn.save('fwd_lm_4.507')
learn.save_encoder('fl_lm_4.507.enc')

In [11]:
from torch import tensor as T

In [129]:
[data_lm.train_ds.vocab.stoi[x] for x in "the meaning of life is the".split()]

[2, 1187, 7, 135, 9]

In [134]:
inp = T([[2, 1187, 7, 135, 9, 2]])
print(inp.size())
print(inp.t().size())


res= learn.model.forward(inp.t().cuda())[0][-1]
print(res.argmax(), res.size())

torch.Size([1, 6])
torch.Size([6, 1])
tensor(115, device='cuda:0') torch.Size([60002])


In [133]:
data_lm.train_ds.vocab.itos[15]

'that'

In [54]:
edp = learn.model._modules['0']._modules['encoder_dp']
learn.model.train(False)

SequentialRNN(
  (0): RNNCore(
    (encoder): Embedding(60002, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60002, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60002, bias=True)
    (output_dp): RNNDropout()
  )
)

In [None]:
learn.model._modules['0']._modules['encoder_dp'].__call__

In [None]:
learn.model._modules['0']._modules['encoder_dp'].__call__

In [None]:
learn.model._modules['0']._modules['encoder_dp'].__call__