## IMDb

In [1]:
from fastai.text import *
import html

In [2]:
# Marker tokens prepended to every document by get_texts(df, ...) further down.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

# Root of the raw dataset; the 'train' and 'test' sub-folders are read from here.
PATH=Path('data/aclImdb/')

## Standardize format

In [3]:
# Sub-folder names that double as class names; the list index is the integer label.
CLASSES = ['neg', 'pos']

def get_texts(path):
    """Read every labelled review under `path`.

    Args:
        path: a `pathlib.Path` containing one sub-folder per entry of `CLASSES`.

    Returns:
        (texts, labels): two parallel lists. `texts[i]` is the full content of one
        file and `labels[i]` is the index in `CLASSES` of the folder it came from.
    """
    texts,labels = [],[]
    for idx,label in enumerate(CLASSES):
        for fname in (path/label).glob('*.*'):
            # read_text opens *and closes* the file (the original leaked one handle
            # per review) and the explicit encoding keeps the result independent of
            # the platform's default locale.
            texts.append(fname.read_text(encoding='utf-8'))
            labels.append(idx)
    return texts,labels

# Labelled reviews: the dataset's train/ folder is used for training and its
# test/ folder is used as the validation set.
trn_texts,trn_labels = get_texts(PATH/'train')
val_texts,val_labels = get_texts(PATH/'test')

In [4]:
len(trn_texts),len(val_texts)

(25000, 25000)

In [5]:
# Column order used for every DataFrame/CSV written below: label first, then text.
col_names = ['labels','text']

In [6]:
# Output folder for the classifier CSVs and their cached token/id arrays.
CLAS_PATH=Path('data/imdb_clas/')
CLAS_PATH.mkdir(exist_ok=True)

In [7]:
# Labelled frames in (labels, text) column order.
df_trn = pd.DataFrame({'text':trn_texts, 'labels':trn_labels}, columns=col_names)
df_val = pd.DataFrame({'text':val_texts, 'labels':val_labels}, columns=col_names)

# Written without header or index, so they are read back later with header=None
# and addressed by integer column position.
df_trn.to_csv(CLAS_PATH/'train.csv', header=False, index=False)
df_val.to_csv(CLAS_PATH/'test.csv', header=False, index=False)

# One class name per line. The context manager guarantees the file is flushed
# and closed instead of relying on garbage collection of an anonymous handle.
with (CLAS_PATH/'classes.txt').open('w') as f:
    f.writelines(f'{o}\n' for o in CLASSES)

In [8]:
display(df_trn.head())
display(df_val.tail())

Unnamed: 0,labels,text
0,0,Story of a man who has unnatural feelings for ...
1,0,Airport '77 starts as a brand new luxury 747 p...
2,0,This film lacked something I couldn't put my f...
3,0,"Sorry everyone,,, I know this is supposed to b..."
4,0,When I was little my parents took me along to ...


Unnamed: 0,labels,text
24995,1,I was extraordinarily impressed by this film. ...
24996,1,"Although I'm not a golf fan, I attended a snea..."
24997,1,"From the start of ""The Edge Of Love"", the view..."
24998,1,"This movie, with all its complexity and subtle..."
24999,1,I've seen this story before but my kids haven'...


In [9]:
def get_texts(path):
    """Return the content of every file in `path/'all'` as a list of strings.

    NOTE: this deliberately shadows the labelled `get_texts` defined earlier;
    here no labels are produced because the texts feed the language model.
    """
    # read_text closes each file after reading (the original left every handle
    # open) and pins the encoding rather than depending on the locale default.
    return [fname.read_text(encoding='utf-8') for fname in (path/'all').glob('*.*')]

# Pool every review found in train/all and test/all into one corpus for the
# language model (train part first, then test part).
all_texts = get_texts(PATH/'train') + get_texts(PATH/'test')

In [10]:
len(all_texts)

100000

In [11]:
# Hold out 10% of the pooled corpus for language-model validation. The fixed
# random_state makes the split identical after a kernel restart, so the cached
# token/id files written below always correspond to the same documents.
trn_texts,val_texts = sklearn.model_selection.train_test_split(
    all_texts, test_size=0.1, random_state=42)

In [12]:
len(trn_texts), len(val_texts)

(90000, 10000)

In [13]:
# Output folder for the language-model CSVs and their cached token/id arrays.
LM_PATH=Path('data/imdb_lm/')
LM_PATH.mkdir(exist_ok=True)

In [14]:
# The language model has no targets, but the CSV layout (label column first,
# then text) is kept identical to the classifier files, so every row gets a
# placeholder label of 0.
df_trn = pd.DataFrame({'labels': [0]*len(trn_texts), 'text': trn_texts}, columns=col_names)
df_val = pd.DataFrame({'labels': [0]*len(val_texts), 'text': val_texts}, columns=col_names)

# Header-less, index-less CSVs.
df_trn.to_csv(LM_PATH/'train.csv', index=False, header=False)
df_val.to_csv(LM_PATH/'test.csv', index=False, header=False)

In [15]:
[0] * 10

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [16]:
df_trn.tail()

Unnamed: 0,labels,text
89995,0,"I would like to say i have seen Many film's , ..."
89996,0,After his father was killed a young man named ...
89997,0,This is a nice piece of work. Very sexy and en...
89998,0,"This is, without any doubt, the WORST movie in..."
89999,0,I got to see this just this last Friday at the...


## Language model tokens

In [17]:
# Number of CSV rows per chunk; passed as `chunksize=` to pd.read_csv below.
chunksize=24000

import pdb

In [18]:
# Matches any run of two or more spaces.
re1 = re.compile(r'  +')

# (old, new) substring replacements, applied strictly in this order.
_FIXUP_TABLE = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
)

def fixup(x):
    """Clean one raw document string.

    Applies the literal replacements in `_FIXUP_TABLE` one after another, then
    HTML-unescapes the result and collapses each run of 2+ spaces to one space.
    """
    for old, new in _FIXUP_TABLE:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

In [19]:
def get_texts(df, n_lbls=1):
    """Tokenize one header-less DataFrame chunk.

    The first `n_lbls` columns are taken as integer labels; every remaining
    column is a text field. Fields are joined into one string per row as
    '\\n{BOS} {FLD} 1 <field 1> {FLD} 2 <field 2> ...', cleaned with `fixup`,
    and then tokenized.

    Returns:
        (tok, labels): the tokenizer output and a list with one int64 label
        row per document.
    """
    label_arr = df.iloc[:, range(n_lbls)].values.astype(np.int64)

    # First text field, prefixed with the document and field markers.
    merged = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    # Any further text fields are appended with their own numbered field marker.
    for col in range(n_lbls+1, len(df.columns)):
        merged += f' {FLD} {col-n_lbls} ' + df[col].astype(str)

    cleaned = merged.apply(fixup).values.astype(str)
    batches = partition_by_cores(cleaned)
    tok = Tokenizer().proc_all_mp(batches)
    return tok, list(label_arr)

In [20]:
def get_all(df, n_lbls):
    """Tokenize every chunk yielded by `df` and concatenate the results.

    Args:
        df: an iterable of DataFrame chunks (e.g. a chunked CSV reader).
        n_lbls: number of leading label columns, forwarded to `get_texts`.

    Returns:
        (tok, labels): flat lists covering all chunks, in iteration order.
        The index of each chunk is printed as it is processed.
    """
    all_tok, all_labels = [], []
    for chunk_no, chunk in enumerate(df):
        print(chunk_no)
        chunk_tok, chunk_labels = get_texts(chunk, n_lbls)
        all_tok.extend(chunk_tok)
        all_labels.extend(chunk_labels)
    return all_tok, all_labels

In [21]:
# With `chunksize=` set, read_csv returns an iterator of DataFrame chunks rather
# than a DataFrame. The iterator is consumed by get_all, so this cell must be
# re-run before tokenizing again.
df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'test.csv', header=None, chunksize=chunksize)

In [22]:
# Tokenize the language-model corpus chunk by chunk (one label column).
# get_all prints the index of each chunk as it is processed.
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

0
1
2
3
0


In [23]:
len(tok_trn), len(trn_labels)

(90000, 90000)

In [24]:
tok_trn[0]

['\n',
 'xbos',
 'xfld',
 '1',
 'sophisticated',
 'sex',
 'comedies',
 'are',
 'always',
 'difficult',
 'to',
 'pull',
 'off',
 '.',
 'look',
 'at',
 'the',
 'films',
 'of',
 'blake',
 'edwards',
 ',',
 'who',
 'is',
 'arguably',
 'the',
 'master',
 'of',
 'the',
 'genre',
 ',',
 'and',
 'you',
 'will',
 'find',
 'just',
 'as',
 'many',
 'misses',
 'as',
 'hits',
 '.',
 'for',
 ',',
 'if',
 'a',
 'film',
 'of',
 'this',
 'nature',
 'ever',
 'fails',
 'to',
 'work',
 ',',
 'it',
 'can',
 'never',
 'fall',
 'back',
 'on',
 'the',
 'tried',
 'and',
 'true',
 'toilet',
 'humor',
 'of',
 'a',
 'teen',
 'sex',
 'comedy',
 '[',
 'i.e.',
 '"',
 'american',
 'pie',
 '"',
 ']',
 ',',
 'or',
 'warm',
 'the',
 'audience',
 'with',
 'the',
 'sentimentality',
 'of',
 'a',
 'romantic',
 'comedy',
 '[',
 'i.e.',
 'julia',
 'roberts',
 "'",
 'entire',
 'career',
 ']',
 '.',
 'it',
 'can',
 'only',
 'maintain',
 'a',
 'push',
 'to',
 'the',
 'end',
 ',',
 'and',
 'hope',
 'that',
 'the',
 'audience',
 '

In [25]:
# Cache folder for the tokenized / numericalized language-model arrays.
(LM_PATH/'tmp').mkdir(exist_ok=True)

In [26]:
# Cache the token lists. Documents have different lengths, so the data is a
# ragged list of lists; it is wrapped in an explicit object array because
# NumPy >= 1.24 refuses to build ragged arrays implicitly (ValueError).
np.save(LM_PATH/'tmp'/'tok_trn.npy', np.array(tok_trn, dtype=object))
np.save(LM_PATH/'tmp'/'tok_val.npy', np.array(tok_val, dtype=object))

In [27]:
# trn_joined = [' '.join(o) for o in tok_trn]
# mdl_fn = f'{PATH}tmp/{pr_abbr}_joined.txt'
# open(mdl_fn, 'w', encoding='utf-8').writelines(trn_joined)

In [28]:
# Reload the cached token lists. They are stored as object arrays, which
# np.load only unpickles when allow_pickle=True (the default became False in
# NumPy 1.16.3). The files are produced by this notebook, so they are trusted.
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy', allow_pickle=True)

In [29]:
# Token frequencies over the whole language-model training set.
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

[('the', 1208033),
 ('.', 991897),
 (',', 984093),
 ('and', 587619),
 ('a', 583275),
 ('of', 524479),
 ('to', 485475),
 ('is', 393321),
 ('it', 341241),
 ('in', 337750),
 ('i', 307977),
 ('this', 270649),
 ('that', 260925),
 ('"', 236575),
 ("'s", 221012),
 ('-', 187782),
 ('was', 180429),
 ('\n\n', 178930),
 ('as', 165752),
 ('with', 159247),
 ('for', 158772),
 ('movie', 157584),
 ('but', 150310),
 ('film', 144079),
 ('you', 124187)]

In [30]:
# Vocabulary limits used when building `itos` below.
max_vocab = 60000  # consider at most this many of the most frequent tokens
# NOTE: the filter below is `c>min_freq` (strict), so a token must occur MORE
# than min_freq times to be kept.
min_freq = 2

In [31]:
# int -> string vocabulary: index 0 is the unknown token, index 1 is padding,
# followed by the most frequent tokens that occur more than `min_freq` times.
itos = ['_unk_', '_pad_'] + [tok for tok, cnt in freq.most_common(max_vocab) if cnt > min_freq]

# string -> int lookup; anything not in the vocabulary maps to 0 ('_unk_').
stoi = collections.defaultdict(lambda: 0, ((tok, idx) for idx, tok in enumerate(itos)))
len(itos)

60002

In [32]:
itos[:10]

['_unk_', '_pad_', 'the', '.', ',', 'and', 'a', 'of', 'to', 'is']

In [33]:
stoi['freddy']

2808

In [34]:
# Numericalize: replace every token by its vocabulary index (0 for unknown).
# Documents differ in length, so request an object array explicitly;
# NumPy >= 1.24 raises ValueError on implicit ragged-array creation.
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn], dtype=object)
val_lm = np.array([[stoi[o] for o in p] for p in tok_val], dtype=object)

In [35]:
len(trn_lm), len(val_lm)

(90000, 10000)

In [36]:
# Cache the numericalized corpus and the vocabulary it was encoded with.
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)
# Context manager so the pickle is flushed and the handle closed immediately
# (the original passed an anonymous open() that was never closed explicitly).
with open(LM_PATH/'tmp'/'itos.pkl', 'wb') as f:
    pickle.dump(itos, f)

## Language model

In [37]:
# Language-model hyperparameters.
wd=1e-7    # passed as `wds=` to learner.fit below
bptt=70    # passed to LanguageModelLoader / LanguageModelData below
bs=52      # batch size for the language-model loaders
# Passed positionally to md.get_model below.
# presumably embedding size, hidden units per layer, number of layers — confirm against fastai
em_sz,nh,nl = 400,1150,3
# Adam with non-default betas.
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [38]:
# Reload the cached id arrays. They are object arrays (one id list per
# document), so allow_pickle=True is required on NumPy >= 1.16.3.
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy', allow_pickle=True)
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy', allow_pickle=True)
# Join all documents into one long 1-D stream of token ids.
trn_lm = np.concatenate(trn_lm)
val_lm = np.concatenate(val_lm)

# Vocabulary written by this notebook (trusted pickle); close the file promptly.
with open(LM_PATH/'tmp'/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)
vs = len(itos)  # vocabulary size

trn_dl = LanguageModelLoader(trn_lm, bs, bptt)
val_dl = LanguageModelLoader(val_lm, bs, bptt)
# NOTE(review): the literal 1 is presumably the pad index ('_pad_' is itos[1]) — confirm against fastai.
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [None]:
# Dropout rates, consumed below in the order dropouti, dropout, wdrop, dropoute,
# dropouth. The trailing `*1.` is a global scaling factor (currently a no-op).
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*1.

In [None]:
# Build the language-model learner from the data object and the dropout table.
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
# presumably makes every layer group trainable — confirm against fastai
learner.unfreeze()

In [None]:
# Learning-rate range test.
# NOTE(review): end_lr=1e12 is an unusually large upper bound for a learning-rate sweep — confirm it is intentional.
learner.lr_find(start_lr=1e-6,end_lr=1e12)

In [None]:
learner.sched.plot()

In [None]:
# Base learning rate for the language-model fits below.
lr=2e-3

In [None]:
# First language-model run: one cycle with cycle_len=5.
learner.fit(lr, 1, wds=wd, use_clr=(32,5), cycle_len=5)

In [None]:
# Second run, continuing from the weights above: one cycle with cycle_len=10.
learner.fit(lr, 1, wds=wd, use_clr=(32,10), cycle_len=10)

In [None]:
# Checkpoint the full model as 'lm' and the encoder alone as 'lm_enc';
# 'lm_enc' is the name the classifier section loads with load_encoder.
learner.save('lm')
learner.save_encoder('lm_enc')

In [None]:
# Further training at half the learning rate.
# NOTE(review): no save follows this fit in the visible notebook, so the
# classifier's load_encoder('lm_enc') uses the weights saved BEFORE this run —
# confirm whether a save_encoder call is missing here.
learner.fit(lr/2, 1, wds=wd, use_clr=(32,10), cycle_len=20)

## Classifier tokens

In [39]:
# Chunked readers over the labelled classifier CSVs. These are single-use
# iterators: re-run this cell before calling get_all on them again.
df_trn = pd.read_csv(CLAS_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(CLAS_PATH/'test.csv', header=None, chunksize=chunksize)

In [40]:
# Tokenize the labelled reviews; this time the returned labels are the real
# neg/pos class indices read from column 0.
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

0
1
0
1


In [41]:
# Cache folder for the classifier's tokenized / numericalized arrays.
(CLAS_PATH/'tmp').mkdir(exist_ok=True)

# Token lists are ragged (documents differ in length), so wrap them in an
# explicit object array; NumPy >= 1.24 raises ValueError on implicit ragged arrays.
np.save(CLAS_PATH/'tmp'/'tok_trn.npy', np.array(tok_trn, dtype=object))
np.save(CLAS_PATH/'tmp'/'tok_val.npy', np.array(tok_val, dtype=object))

# Labels are regular (one int64 row per document) and need no special handling.
np.save(CLAS_PATH/'tmp'/'trn_labels.npy', trn_labels)
np.save(CLAS_PATH/'tmp'/'val_labels.npy', val_labels)

In [42]:
# Token arrays are object arrays, so allow_pickle=True is required on
# NumPy >= 1.16.3. The files are produced by this notebook and are trusted.
tok_trn = np.load(CLAS_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(CLAS_PATH/'tmp'/'tok_val.npy', allow_pickle=True)

# Label arrays are plain integer arrays and load without pickling.
trn_labels = np.load(CLAS_PATH/'tmp'/'trn_labels.npy')
val_labels = np.load(CLAS_PATH/'tmp'/'val_labels.npy')

In [43]:
# Token frequencies over the classifier training set (inspection only here:
# the vocabulary used below is reloaded from the language-model pickle).
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

[('the', 335844),
 ('.', 277583),
 (',', 275297),
 ('and', 163775),
 ('a', 162489),
 ('of', 145813),
 ('to', 135629),
 ('is', 110387),
 ('it', 95826),
 ('in', 93847),
 ('i', 86730),
 ('this', 75735),
 ('that', 73495),
 ('"', 65053),
 ("'s", 62103),
 ('-', 52852),
 ('was', 50493),
 ('\n\n', 49832),
 ('as', 46849),
 ('for', 44290),
 ('with', 44076),
 ('movie', 43840),
 ('but', 42441),
 ('film', 40027),
 (')', 34632)]

In [44]:
# Reuse the language model's vocabulary so classifier ids index the same
# embedding rows. The pickle is written by this notebook (trusted); the context
# manager closes the handle, which the original left open.
with (LM_PATH/'tmp'/'itos.pkl').open('rb') as f:
    itos = pickle.load(f)
# Tokens missing from the vocabulary map to 0 ('_unk_').
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

60002

In [45]:
# Numericalize the labelled reviews with the language-model vocabulary.
# dtype=object keeps the ragged per-document lists valid on NumPy >= 1.24,
# which no longer builds ragged arrays implicitly.
trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn], dtype=object)
val_clas = np.array([[stoi[o] for o in p] for p in tok_val], dtype=object)

In [46]:
trn_clas[0]

[40,
 41,
 42,
 39,
 81,
 7,
 6,
 144,
 48,
 60,
 7154,
 1427,
 22,
 6,
 4469,
 3,
 529,
 59,
 21,
 6,
 650,
 149,
 14,
 9,
 6,
 1351,
 510,
 7,
 1895,
 222,
 3,
 6,
 11601,
 7125,
 327,
 9,
 674,
 103,
 47,
 2018,
 4,
 1074,
 2492,
 46,
 2,
 952,
 0,
 7,
 10,
 16,
 6073,
 3,
 492,
 10,
 2890,
 1895,
 2,
 31,
 240,
 74,
 21,
 73,
 768,
 1406,
 868,
 253,
 10,
 55,
 116,
 141,
 1513,
 3,
 75,
 164,
 50,
 2,
 962,
 153,
 37,
 674,
 141,
 3,
 2,
 13038,
 428,
 72,
 111,
 2301,
 329,
 745,
 8,
 6,
 836,
 13203,
 3,
 28,
 6,
 1983,
 649,
 10,
 16,
 145,
 92,
 26,
 251,
 121,
 21,
 64,
 66,
 671,
 46,
 711,
 100,
 38817,
 40892,
 3,
 711,
 423,
 3747,
 21302,
 5,
 11400,
 6329,
 77,
 37,
 130,
 3371,
 3]

In [47]:
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_clas)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_clas)

## Classifier

In [48]:
trn_clas = np.load(LM_PATH/'tmp'/'trn_ids.npy')
val_clas = np.load(LM_PATH/'tmp'/'val_ids.npy')

In [49]:
# Classifier hyperparameters; bptt, em_sz, nh, nl repeat the values used for
# the language model above.
bptt,em_sz,nh,nl = 70,400,1150,3
vs = len(itos)  # vocabulary size
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
bs = 48  # validation batch size; the training loader below uses bs//2

In [50]:
# Reload the labels and drop any length-1 axes with np.squeeze.
trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'trn_labels.npy'))
val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'val_labels.npy'))

In [51]:
# NOTE(review): the first slice is the last ten labels, but the second is
# `[10:]` (everything after the first ten); `[-10:]` may have been intended — confirm.
trn_labels[-10:], val_labels[10:]

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), array([0, 0, 0, ..., 1, 1, 1]))

In [52]:
trn_labels.min(), trn_labels.max()

(0, 1)

In [53]:
# Shift labels so the smallest class id is 0 (in-place; a no-op when the
# minimum is already 0, as the output above shows for trn_labels).
trn_labels -= trn_labels.min()
val_labels -= val_labels.min()
# Number of classes = largest label + 1.
c=int(trn_labels.max())+1

In [54]:
# Datasets pairing each id sequence with its label.
trn_ds = TextDataset(trn_clas, trn_labels)
val_ds = TextDataset(val_clas, val_labels)
# Both samplers are keyed on document length.
# presumably so that each batch holds similar-length documents — confirm against fastai
trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)
val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))
# Training uses half the batch size of validation (bs//2 vs bs).
# pad_idx=1 matches the position of '_pad_' in itos.
trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)
md = ModelData(PATH, trn_dl, val_dl)

In [55]:
len(trn_ds), trn_dl.batch_size, len(trn_dl), len(trn_dl.dataset), len(trn_ds[0][0]), trn_ds[0][1]

(25000, 24, 1042, 25000, 128, 0)

In [56]:
x, y = next(iter(trn_dl))

In [57]:
x.size(), y.size(), bs

(torch.Size([688, 24]), torch.Size([24]), 48)

In [58]:
x, y

(
      1      1      1  ...       1      1     40
      1      1      1  ...       1     40     41
      1      1      1  ...       1     41     42
         ...            ⋱           ...         
   1215      9     93  ...     223     38   8729
   1146    367    145  ...     124    149      4
      3      3      3  ...     183      3   3213
 [torch.LongTensor of size 688x24], 
  0
  0
  1
  0
  0
  1
  1
  1
  0
  0
  0
  0
  0
  1
  0
  1
  1
  0
  1
  1
  1
  0
  1
  1
 [torch.LongTensor of size 24])

In [None]:
# Classifier dropout table, scaled by 0.5. Consumed below as
# dps[0]=dropouti, dps[1]=wdrop, dps[2]=dropoute, dps[3]=dropouth, dps[4]=first head dropout.
dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.5

In [None]:
# Build the RNN classifier. `get_rnn_classifer` is spelled exactly as called here.
# NOTE(review): 20*70 is presumably the maximum sequence length (20 windows of
# bptt=70) and em_sz*3 the size of the pooled encoder output — confirm against fastai.
m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
          layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

In [None]:
# Wrap the model in a learner and configure regularization, clipping and metrics.
learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learn.clip=25.  # presumably the gradient-clipping threshold — confirm against fastai
learn.metrics = [accuracy]

In [None]:
# Five learning rates: starting from `lr` for the last entry, each earlier
# entry is divided by a further factor of `lrm`.
lr = 3e-3
lrm = 2.6
lrs = np.array([lr/(lrm**k) for k in (4, 3, 2, 1, 0)])

In [None]:
# Weight decay for the classifier fits (passed as `wds=` below).
wd = 1e-6
# Start from the encoder saved by learner.save_encoder('lm_enc') in the
# language-model section.
learn.load_encoder('lm_enc')

In [None]:
# presumably leaves only the last layer group trainable — confirm against fastai
learn.freeze_to(-1)

In [None]:
# Learning-rate range test starting from a thousandth of the chosen rates.
learn.lr_find(lrs/1000)
learn.sched.plot()

In [None]:
# One cycle of length 1 at the current freeze level.
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))

In [None]:
# Checkpoint after the first (partially frozen) fit.
learn.save('clas_0')

In [None]:
# Reload the 'clas_0' checkpoint, so the notebook can resume from here
# without repeating the fit above.
learn.load('clas_0')

In [None]:
# presumably makes every layer group trainable for the final fine-tuning — confirm against fastai
learn.unfreeze()

In [None]:
# Lower base rate for the fully unfrozen fit; same geometric spacing by `lrm`.
lr = 2e-3
lrs = np.array([lr/(lrm**k) for k in (4, 3, 2, 1, 0)])

In [None]:
# Final fine-tuning: one cycle with cycle_len=20.
learn.fit(lrs, 1, wds=wd, cycle_len=20, use_clr=(32,10))

In [None]:
# Checkpoint the fine-tuned classifier.
learn.save('clas_1')