In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *
import html
import dill as pickle

import pdb
from collections import Counter, defaultdict

import spacy
spacy_en = spacy.load('en')

# pandas and plotting config
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', -1)

In [3]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [4]:
def texts_labels_from_folders(path, classes):
    texts,labels = [],[]
    for idx,label in enumerate(classes):
        for fname in (path/label).glob('*.*'):
            texts.append(fname.open('r').read())
            labels.append(idx)
            
    return np.array(texts),np.array(labels)

In [5]:
re1 = re.compile(r'  +')

def clean_text(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [6]:
def process_example(row, txt_cols, lbl_cols=[], lbl_dtype=np.int64):
    n_txt_cols = len(txt_cols)
    n_label_cols = len(lbl_cols)
    
    labels = row[lbl_cols].values.astype(lbl_dtype) if (n_label_cols > 0) else []
    
    docs = f'\n{BOS} {FLD} 1 ' + row[txt_cols[0]].astype(str)
    for i, col in enumerate(txt_cols[1:]):
        docs += f' {FLD} {i+2} ' + row[col].astype(str)

    docs = docs.apply(clean_text).values.astype(str)
    tok = Tokenizer().proc_all_mp(partition_by_cores(docs))
    
    return tok, list(labels)

In [7]:
def process_examples(df, txt_cols, lbl_cols=[], lbl_dtype=np.int64):
    tok, labels = [], []
    
    for i, r in enumerate(df):
        print(i)
        tok_, labels_ = process_example(r, txt_cols, lbl_cols, lbl_dtype)
        tok += tok_
        labels += labels_
        
    return tok, labels

In [8]:
class Vocab:
    def __init__(self, tokens, min_freq=1, max_size=None, 
                 specials=['_unk_', '_pad_', '_bos_', '_eos_'], unk_idx=0):
        
        self.min_freq = max(min_freq, 1)
        self.specials = specials
        self.unk_idx = unk_idx
        
        self.tokens = list(specials)
        self.max_size = None if max_size is None else max_size + len(self.tokens)
        
        self.token_freqs = Counter(tokens)
        for t in self.specials: del self.token_freqs[t]
            
        self.tokens = [ t for t, c in self.token_freqs.most_common(self.max_size) if c > min_freq ]
        
        #itos
        self.itos = self.tokens
        
        #stoi
        self.stoi = collections.defaultdict(lambda: self.unk_idx, { tok:i for i, tok in enumerate(self.tokens) })
        
    def get_unk_idx(self):
        return self.unk_idx
    
    def token_freq(self, token):
        return self.token_freqs.get(token, 0)

In [9]:
class LanguageDataset(torch.utils.data.Dataset):
    def __init__(self, docs, vocab=None, min_freq=1, max_size=None):
        self.tokens = []
        for d in docs: self.tokens += d
        
        if (vocab):
            self.vocab = vocab
        else:
            self.vocab = Vocab(self.tokens, min_freq, max_size)
        
        self.data = np.array([[ self.vocab.stoi[t] for t in self.tokens ]])
        
    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)

## IMDB - Multi-classification problem

### Language Model

In [None]:
PATH= Path('data/aclImdb')
TRN_PATH = PATH/'train'
VAL_PATH = PATH/'test'

LM_PATH = PATH/'imdb_lm'
CLS_PATH = PATH/'imdb_class'

(LM_PATH/'models').mkdir(parents=True, exist_ok=True)
(LM_PATH/'tmp').mkdir(exist_ok=True)

(CLS_PATH/'models').mkdir(parents=True, exist_ok=True)
(CLS_PATH/'tmp').mkdir(exist_ok=True)

# [child for child in PATH.iterdir()]

In [None]:
%ls {str(PATH)}

Get list of documents and labels

In [None]:
CLASSES = ['neg', 'pos', 'unsup']

In [None]:
trn_docs, trn_labels = texts_labels_from_folders(TRN_PATH, CLASSES)
val_docs, val_labels = texts_labels_from_folders(VAL_PATH, CLASSES)

In [None]:
len(trn_docs), len(val_docs), len(trn_labels[trn_labels == 1]), len(trn_labels[trn_labels == 0])

Randomize ordering of everythig

In [None]:
np.random.seed(42)
trn_idxs = np.random.permutation(len(trn_docs))
val_idxs = np.random.permutation(len(val_docs))

In [None]:
trn_docs = trn_docs[trn_idxs]
trn_labels = trn_labels[trn_idxs]

val_docs = val_docs[val_idxs]
val_labels = val_labels[val_idxs]

Put documents and labels into dataframes

In [None]:
col_names = ['labels', 'text']

In [None]:
trn_df = pd.DataFrame({'text':trn_docs, 'labels':trn_labels}, columns=col_names)
val_df = pd.DataFrame({'text':val_docs, 'labels':val_labels}, columns=col_names)

In [None]:
display(trn_df.head(1))
display(trn_df.tail(1))

In [None]:
trn_df[trn_df['labels'] != 2].to_csv(CLS_PATH/'train.csv', index=False)
val_df.to_csv(CLS_PATH/'test.csv', index=False)

(CLS_PATH/'classes.txt').open('w').writelines(f'{c}\n' for c in CLASSES)

For a language model, we want to use the entire corpus.  In the case of IMDB, only 50k of the 100k documents are labeled and so we look to the `train/all` and `test/all` folders to graby all 100k

In [None]:
trn_docs, val_docs = sklearn.model_selection.train_test_split(
    np.concatenate([trn_docs, val_docs]), test_size=0.1)

In [None]:
len(trn_docs), len(val_docs)

In [None]:
trn_df = pd.DataFrame({'text':trn_docs}, columns=col_names[1:])
val_df = pd.DataFrame({'text':val_docs}, columns=col_names[1:])

trn_df.to_csv(LM_PATH/'train.csv', index=False)
val_df.to_csv(LM_PATH/'test.csv', index=False)

In [None]:
trn_df.head(2)

Clean and tokenize the text data

In [None]:
chunksize=24000

In [None]:
trn_df = pd.read_csv(LM_PATH/'train.csv', chunksize=chunksize)
val_df = pd.read_csv(LM_PATH/'test.csv', chunksize=chunksize)

In [None]:
trn_toks, _ = process_examples(trn_df, col_names[1:])
val_toks, _ = process_examples(val_df, col_names[1:])

In [None]:
len(trn_toks), len(val_toks)

In [None]:
print(trn_toks[0][:15])

In [None]:
np.save(LM_PATH/'tmp'/'trn_toks.npy', trn_toks)
np.save(LM_PATH/'tmp'/'val_toks.npy', val_toks)

In [None]:
trn_toks = np.load(LM_PATH/'tmp'/'trn_toks.npy')
val_toks = np.load(LM_PATH/'tmp'/'val_toks.npy')

Define the vocab and build the datasets

In [None]:
max_vocab = 60000
min_freq = 2

In [None]:
%time trn_ds = LanguageDataset(np.concatenate((trn_toks, val_toks)), min_freq=min_freq, max_size=max_vocab)
%time val_ds = LanguageDataset(val_toks, vocab=trn_ds.vocab)

In [None]:
print(len(trn_ds[0]), len(trn_ds.tokens), len(trn_ds.vocab.tokens), len(trn_ds))
print(len(val_ds[0]), len(val_ds.tokens), len(val_ds.vocab.tokens), len(val_ds))

In [None]:
print(trn_ds[0][:10])
print([ trn_ds.vocab.itos[idx] for idx in trn_ds[0][:10] ])

In [None]:
%time pickle.dump(trn_ds, open(LM_PATH/'tmp'/'trn_ds.pkl', 'wb'))
%time pickle.dump(val_ds, open(LM_PATH/'tmp'/'val_ds.pkl', 'wb'))
%time pickle.dump(trn_ds.vocab, open(LM_PATH/'tmp'/'vocab.pkl', 'wb'))

In [None]:
%%time
trn_ds = pickle.load(open(LM_PATH/'tmp'/'trn_ds.pkl', 'rb'))
val_ds = pickle.load(open(LM_PATH/'tmp'/'val_ds.pkl', 'rb'))
vocab = pickle.load(open(LM_PATH/'tmp'/'vocab.pkl', 'rb'))

Configure and build the data loaders

In [None]:
bptt = 70
bsz = 52
wd = 1e-7
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [None]:
trn_dl = LanguageModelLoader(trn_ds[0], bsz, bptt)
val_dl = LanguageModelLoader(val_ds[0], bsz, bptt)

md = LanguageModelData(PATH, 1, len(vocab.tokens), trn_dl, val_dl, bs=bsz, bptt=bptt)

In [None]:
len(md.trn_dl), md.n_tok, len(trn_ds), len(trn_ds.tokens)

In [None]:
batch = next(iter(md.trn_dl))
print(batch[0].size()), print(batch[1].size())

### Text Model

In [None]:
trn_df = pd.read_csv(CLS_PATH/'train.csv', chunksize=chunksize)
val_df = pd.read_csv(CLS_PATH/'test.csv', chunksize=chunksize)

In [None]:
trn_toks, trn_labels = process_examples(trn_df, col_names[1:], col_names[:1])
val_toks, val_labels = process_examples(val_df, col_names[1:], col_names[:1])

In [None]:
np.save(CLS_PATH/'tmp'/'trn_toks.npy', trn_toks)
np.save(CLS_PATH/'tmp'/'val_toks.npy', val_toks)

np.save(CLS_PATH/'tmp'/'trn_labels.npy', trn_labels)
np.save(CLS_PATH/'tmp'/'val_labels.npy', val_labels)

In [None]:
trn_toks = np.load(CLS_PATH/'tmp'/'trn_toks.npy')
val_toks = np.load(CLS_PATH/'tmp'/'val_toks.npy')

trn_labels = np.load(CLS_PATH/'tmp'/'trn_labels.npy')
val_labels = np.load(CLS_PATH/'tmp'/'val_labels.npy')

In [None]:
freq = Counter(p for o in trn_toks for p in o)
freq.most_common(10)

In [None]:
%time vocab = pickle.load(open(LM_PATH/'tmp'/'vocab.pkl', 'rb'))

In [None]:
trn_nums = np.array([[vocab.stoi[o] for o in p] for p in trn_toks])
val_nums = np.array([[vocab.stoi[o] for o in p] for p in val_toks])

In [None]:
print(trn_nums[0])

In [None]:
np.save(CLS_PATH/'tmp'/'trn_nums.npy', trn_nums)
np.save(CLS_PATH/'tmp'/'val_nums.npy', val_nums)

In [None]:
trn_nums = np.load(CLS_PATH/'tmp'/'trn_nums.npy')
val_nums = np.load(CLS_PATH/'tmp'/'val_nums.npy')

In [None]:
trn_labels = np.squeeze(np.load(CLS_PATH/'tmp'/'trn_labels.npy'))
val_labels = np.squeeze(np.load(CLS_PATH/'tmp'/'val_labels.npy'))

In [None]:
trn_labels.shape, val_labels.shape

Make sure labels are zero-indexed

In [None]:
min_lbl = trn_labels.min()
trn_labels -= min_lbl
val_labels -= min_lbl
c=int(trn_labels.max())+1

In [None]:
bptt, em_sz, nh, nl = 70,400,1150,3
bsz = 48
vs = len(vocab.tokens)
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [None]:
trn_ds = TextDataset(trn_nums, trn_labels)
val_ds = TextDataset(val_nums, val_labels)

trn_samp = SortishSampler(trn_nums, key=lambda x: len(trn_nums[x]), bs=bsz//2)
val_samp = SortSampler(val_nums, key=lambda x: len(val_nums[x]))

trn_dl = DataLoader(trn_ds, bsz//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bsz, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)

md = ModelData(PATH, trn_dl, val_dl)

In [None]:
len(trn_ds), trn_dl.batch_size, len(trn_dl), len(trn_dl.dataset), len(trn_ds[0][0]), trn_ds[0][1]

In [None]:
x, y = next(iter(trn_dl))

In [None]:
print(x.size(), x.type(), y.size(), y.type(), bsz)

In [None]:
# x, y

## TOXIC- Multi-label problem

In [10]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

In [11]:
PATH= Path('data/toxic-comment')

(PATH/'models').mkdir(parents=True, exist_ok=True)
(PATH/'tmp').mkdir(exist_ok=True)

# [child for child in PATH.iterdir()]

In [12]:
raw_train_df = pd.read_csv(PATH/'train.csv')
test_df = pd.read_csv(PATH/'test.csv')
sample_subm_df = pd.read_csv(PATH/'sample_submission.csv')

txt_col = 'comment_text'

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

model_cols = ['id', txt_col] + label_cols + ['none']

Define training and validation datasets

In [13]:
# split the training data into a train and validatin dataset
trn, val = train_test_split(raw_train_df, test_size=0.05, random_state=9)
print(len(trn), len(val), len(trn[trn.none != 1]), len(val[val.none != 1]))

# save train, val, and test datasets for torchtext
trn[model_cols].to_csv(PATH/'train_ds.csv', index=None)
val[model_cols].to_csv(PATH/'valid_ds.csv', index=None)

# save full cleaned datasets (train+valid and test) as well
raw_train_df[model_cols].to_csv(PATH/'full_train_ds.csv', index=None)
test_df[['id', txt_col]].to_csv(PATH/'test_ds.csv', index=None)

91058 4793 9288 502


In [14]:
display(pd.read_csv(PATH/"full_train_ds.csv").head(2))
display(pd.read_csv(PATH/"test_ds.csv").head(2))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,22256635,"Nonsense? kiss off, geek. what I said is true. I'll have your account terminated.",1,0,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you did with this edit to W. S. Merwin. If you continue to do so, you will be blocked from editing. """,0,0,0,0,0,0,1


Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. SS500 .jpg)==
1,6102620,"::Kentuckiana is colloquial. Even though the area is often referred to as this, it (in my opinion) has never held the encyclopedic precision of ""Louisville metropolitian area"", which has a specific U.S. Census definition. Also, apparently Kentuckiana often refers to the local television viewing area, which isn't nearly contiguous with the official metro area. As you indicate, Kentuckiana seems to be more of a slang or marketing phenomena than anything we could pin down in encyclopedic terms here. That's why we see Wikipedia language like ""the Louisville metropolitan area, sometimes referred to as Kentuckiana"". That's my take on it. — •"


Clean and tokenize documents

In [15]:
max_features = 100000 #30000
min_freq = 10 #0
max_len = 175 #100

In [16]:
chunksize = 24000

In [17]:
trn_df = pd.read_csv(PATH/'train_ds.csv', chunksize=chunksize)
val_df = pd.read_csv(PATH/'valid_ds.csv', chunksize=chunksize)
test_df = pd.read_csv(PATH/'test_ds.csv', chunksize=chunksize)

In [18]:
trn_toks, trn_labels = process_examples(trn_df, [txt_col], label_cols, lbl_dtype=np.float32)
val_toks, val_labels = process_examples(val_df, [txt_col], label_cols, lbl_dtype=np.float32)

0
1
2
3
0


In [19]:
len(trn_toks), len(val_toks), trn_labels[0].shape, val_labels[0].shape

(91058, 4793, (6,), (6,))

In [20]:
freq = Counter(p for o in trn_toks for p in o)
freq.most_common(10)

[('.', 297273),
 ('the', 283109),
 (',', 269182),
 ('"', 214611),
 ('to', 169607),
 ('\n', 140276),
 ('i', 137007),
 ('of', 128229),
 ('and', 127863),
 ('you', 125563)]

Build vocab, fix lengths of each document, and numericalize

In [21]:
all_toks = []
for toks in np.concatenate((trn_toks, val_toks)): all_toks += toks

In [22]:
vocab = Vocab(all_toks, min_freq, max_features)

In [23]:
trn_toks = [ d[:max_len] + ['_pad_']*(max_len-len(d)) for d in trn_toks ]
val_toks = [ d[:max_len] + ['_pad_']*(max_len-len(d)) for d in val_toks ]

In [24]:
# np.save(PATH/'tmp'/'trn_toks.npy', trn_toks)
# np.save(PATH/'tmp'/'val_toks.npy', val_toks)

np.save(PATH/'tmp'/'trn_labels.npy', trn_labels)
np.save(PATH/'tmp'/'val_labels.npy', val_labels)

In [25]:
# trn_toks = np.load(PATH/'tmp'/'trn_toks.npy')
# val_toks = np.load(PATH/'tmp'/'val_toks.npy')

trn_labels = np.load(PATH/'tmp'/'trn_labels.npy')
val_labels = np.load(PATH/'tmp'/'val_labels.npy')

In [26]:
trn_nums = np.array([ [vocab.stoi[o] for o in p] for p in trn_toks ])
val_nums = np.array([ [vocab.stoi[o] for o in p] for p in val_toks ])

In [27]:
np.save(PATH/'tmp'/'trn_nums.npy', trn_nums)
np.save(PATH/'tmp'/'val_nums.npy', val_nums)
pickle.dump(vocab, open(PATH/'tmp'/'vocab.pkl', 'wb'))

In [28]:
len(trn_toks), len(val_toks), len(trn_toks[0])

(91058, 4793, 175)

In [29]:
(np.squeeze(trn_labels)).shape, (np.squeeze(val_labels)).shape

((91058, 6), (4793, 6))

In [30]:
trn_nums = np.load(PATH/'tmp'/'trn_nums.npy')
val_nums = np.load(PATH/'tmp'/'val_nums.npy')
vocab = pickle.load(open(PATH/'tmp'/'vocab.pkl', 'rb'))

In [31]:
trn_labels = np.squeeze(np.load(PATH/'tmp'/'trn_labels.npy'))
val_labels = np.squeeze(np.load(PATH/'tmp'/'val_labels.npy'))

Build datasets and data loaders

In [32]:
bsz = 64
pretrained_vectors = None #'fasttext.en.300d'

In [33]:
trn_ds = TextDataset(trn_nums, trn_labels)
val_ds = TextDataset(val_nums, val_labels)

trn_samp = SortishSampler(trn_nums, key=lambda x: len(trn_nums[x]), bs=bsz//2)
val_samp = SortSampler(val_nums, key=lambda x: len(val_nums[x]))

trn_dl = DataLoader(trn_ds, bsz//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bsz, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)

md = ModelData(PATH, trn_dl, val_dl)

In [34]:
len(trn_ds), trn_dl.batch_size, len(trn_dl), len(trn_dl.dataset), len(trn_ds[0][0]), trn_ds[0][1]

(91058, 32, 2846, 91058, 175, array([0., 0., 0., 0., 0., 0.], dtype=float32))

In [35]:
x, y = next(iter(trn_dl))

In [36]:
print(x.size(), x.type(), y.size(), y.type(), bsz)

torch.Size([175, 32]) torch.LongTensor torch.Size([32, 6]) torch.FloatTensor 64


In [37]:
# x, y

Let's try this against a simple LSTM

In [38]:
class SimpleLstm(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 dropout_rnn=0.3, dropout_after_emb=0.4, dropout_emb=0.1, wdrop=0.05):
        
        super().__init__() 
        
        self.bsz = bsz
               
        # configure embeddings layer
        self.dropout_emb = dropout_emb
        self.dropout_after_emb = LockedDropout(dropout_after_emb)
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
#         self.emb.data = train_ds.fields[txt_col].vocab.vectors # to use the pretrained vectors
        self.emb_with_drop = EmbeddingDropout(self.emb)
        
        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.LSTM(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropout_rnn)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)
      
        self.outp = nn.Linear(n_rnn_hidden * 2 * self.n_dirs, out_sz)
        
        # initialize weights
        kaiming_normal(self.outp.weight.data)
        
        # init hidden
        self.init_hidden(self.bsz)
    
    def forward(self, seq):
        bsz = seq.size(1)
        if (self.hidden[0].size(1) != bsz): self.init_hidden(bsz)
        
        x = self.emb_with_drop(seq, dropout=self.dropout_emb if self.training else 0)
        x = self.dropout_after_emb(x)
        
        output, h = self.rnn(x, self.hidden)        
        self.hidden = repackage_var(h)
        
        sl, bs, _ = output.size()
  
#         pdb.set_trace()
        avg_pool = F.adaptive_avg_pool1d(output.permute(1,2,0), 1).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(output.permute(1,2,0), 1).view(bs,-1) 
        
        x = torch.cat([avg_pool, max_pool], dim=1)
        outp = F.sigmoid(self.outp(x))
        
        return outp

    def init_hidden(self, bsz):
        self.hidden = (V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)),
                       V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)))

In [39]:
vocab_sz = len(vocab.tokens)
emb_sz = 300
out_sz = 6

n_rnn_hidden = 128
n_rnn_layers = 1
bi_dir = True

model = SimpleLstm(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, True, out_sz, bsz=bsz)
model#.cuda()

SimpleLstm(
  (dropout_after_emb): LockedDropout(
  )
  (emb): Embedding(18777, 300)
  (emb_with_drop): EmbeddingDropout(
    (embed): Embedding(18777, 300)
  )
  (rnn): WeightDrop(
    (module): LSTM(300, 128, dropout=0.3, bidirectional=True)
  )
  (outp): Linear(in_features=512, out_features=6, bias=True)
)

In [40]:
lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)

In [None]:
fit(model, md, 1, lo.opt, F.binary_cross_entropy)

In [None]:
fit(model, md, 1, lo.opt, F.binary_cross_entropy)