In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *
import html

import pdb
from collections import Counter, defaultdict

import spacy
spacy_en = spacy.load('en')

# pandas and plotting config
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', -1)

In [3]:
BOS = '<bos>'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [4]:
re1 = re.compile(r'  +')

def clean_text(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [5]:
def process_example(row, txt_cols, lbl_cols=[], lbl_dtype=np.int64):
    n_txt_cols = len(txt_cols)
    n_label_cols = len(lbl_cols)
    
    labels = row[lbl_cols].values.astype(lbl_dtype) if (n_label_cols > 0) else []
    
    docs = f'\n{BOS} {FLD} 1 ' + row[txt_cols[0]].astype(str)
    for i, col in enumerate(lbl_cols[1:]):
        docs += f' {FLD} {i+ 1} ' + row[col].astype(str)

    docs = docs.apply(clean_text).values.astype(str)
    tok = Tokenizer().proc_all_mp(partition_by_cores(docs))
    
    return tok, list(labels)

In [6]:
def process_examples(df, txt_cols, lbl_cols=[], lbl_dtype=np.int64):
    tok, labels = [], []
    
    for i, r in enumerate(df):
        print(i)
        tok_, labels_ = process_example(r, txt_cols, lbl_cols, lbl_dtype)
        tok += tok_
        labels += labels_
        
    return tok, labels

In [7]:
class Vocab:
    def __init__(self, tokens, min_freq=1, max_size=None, 
                 specials=['<unk>', '<pad>', '<bos>', '<eos>'], unk_idx=0):
        
        self.min_freq = max(min_freq, 1)
        self.specials = specials
        self.unk_idx = unk_idx
        
        self.tokens = list(specials)
        self.max_size = None if max_size is None else max_size + len(self.tokens)
        
        self.token_freqs = Counter(tokens)
        for t in self.specials: del self.token_freqs[t]
            
        # sort by frequency, then alphabetically
        self.token_freqs = sorted(self.token_freqs.items(), key=lambda tf: tf[0])
        self.token_freqs.sort(key=lambda tf: tf[1], reverse=True)
        
        for token, freq in self.token_freqs:
            if freq < self.min_freq or len(self.tokens) == self.max_size:
                break
            self.tokens.append(token)
 
        #itos
        self.itos = self.tokens
        
        #stoi
        stoi = defaultdict(lambda x: self.unk_idx) # default is <unk>
        stoi.update({ tok: i for i, tok in enumerate(self.tokens) })
        self.stoi = defaultdict(self.unk_token_idx, stoi)
        
    def unk_token_idx(self):
        return self.unk_idx
    
    def token_freq(self, token):
        return self.token_freqs.get(token, 0)

In [8]:
class LanguageDataset(torch.utils.data.Dataset):
    def __init__(self, docs, vocab=None, min_freq=1, max_size=None):
        self.tokens = []
        for d in docs: self.tokens += d
        
        if (vocab):
            self.vocab = vocab
        else:
            self.vocab = Vocab(self.tokens, min_freq, max_size)
        
        self.data = np.array([[ self.vocab.stoi[t] for t in self.tokens ]])
        
    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)

## IMDB - Multi-classification problem

### Language Model

In [9]:
PATH= Path('data/aclImdb')
TRN_PATH = PATH/'train'
VAL_PATH = PATH/'test'

LM_PATH = PATH/'imdb_lm'
CLS_PATH = PATH/'imdb_class'

(LM_PATH/'models').mkdir(parents=True, exist_ok=True)
(LM_PATH/'tmp').mkdir(exist_ok=True)

(CLS_PATH/'models').mkdir(parents=True, exist_ok=True)
(CLS_PATH/'tmp').mkdir(exist_ok=True)

# [child for child in PATH.iterdir()]

In [10]:
%ls {str(PATH)}

README      imdbEr.txt  [34mimdb_lm[m[m/    [34mtrain[m[m/
imdb.vocab  [34mimdb_class[m[m/ [34mtest[m[m/


Get list of documents and labels

In [11]:
trn_docs, trn_labels = texts_labels_from_folders(TRN_PATH, ['neg', 'pos'])
val_docs, val_labels = texts_labels_from_folders(VAL_PATH, ['neg', 'pos'])

In [12]:
len(trn_docs), len(val_docs), len(trn_labels[trn_labels == 1]), len(trn_labels[trn_labels == 0])

(25000, 25000, 12500, 12500)

Put documents and labels into dataframes

In [13]:
col_names = ['review_text', 'label']

In [14]:
trn_df = pd.DataFrame({'review_text':trn_docs, 'label':trn_labels}, columns=col_names)
val_df = pd.DataFrame({'review_text':val_docs, 'label':val_labels}, columns=col_names)

In [15]:
display(trn_df.head(1))
display(trn_df.tail(1))

Unnamed: 0,review_text,label
0,"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",0


Unnamed: 0,review_text,label
24999,"Working-class romantic drama from director Martin Ritt is as unbelievable as they come, yet there are moments of pleasure due mostly to the charisma of stars Jane Fonda and Robert De Niro (both terrific). She's a widow who can't move on, he's illiterate and a closet-inventor--you can probably guess the rest. Adaptation of Pat Barker's novel ""Union Street"" (a better title!) is so laid-back it verges on bland, and the film's editing is a mess, but it's still pleasant; a rosy-hued blue-collar fantasy. There are no overtures to serious issues (even the illiteracy angle is just a plot-tool for the ensuing love story) and no real fireworks, though the characters are intentionally a bit colorless and the leads are toned down to an interesting degree. The finale is pure fluff--and cynics will find it difficult to swallow--though these two characters deserve a happy ending and the picture wouldn't really be satisfying any other way. *** from ****",1


In [16]:
trn_df.to_csv(CLS_PATH/'train.csv', index=False)
val_df.to_csv(CLS_PATH/'test.csv', index=False)

For a language model, we want to use the entire corpus.  In the case of IMDB, only 50k of the 100k documents are labeled and so we look to the `train/all` and `test/all` folders to graby all 100k

In [17]:
def get_all_docs(path):
    return [fname.open('r').read() for fname in (path/'all').glob('*.*')]

all_docs = get_all_docs(TRN_PATH)
all_docs += get_all_docs(VAL_PATH)

In [18]:
len(all_docs)

100000

In [19]:
trn_docs, val_docs = sklearn.model_selection.train_test_split(all_docs, test_size=0.1)

In [20]:
len(trn_docs), len(val_docs)

(90000, 10000)

In [21]:
trn_all_df = pd.DataFrame({'review_text':trn_docs}, columns=[col_names[0]])
val_all_df = pd.DataFrame({'review_text':val_docs}, columns=[col_names[0]])

trn_all_df.to_csv(LM_PATH/'train_all.csv', index=False)
val_all_df.to_csv(LM_PATH/'test_all.csv', index=False)

In [22]:
trn_all_df.head(2)

Unnamed: 0,review_text
0,"This is another film I had missed out on a number of times on Cable TV in the past. It's considered something of a censorship milestone with the treatment of taboo subjects such as prostitution, homosexuality and pornography  not to mention the proliferation of bad language throughout (unfortunately, the DVD is said to contain the slightly edited PG-rated version, which cuts some brief nudity involving female lead Barbra Streisand and her use of the f-word in one scene)! <br /><br />With this in mind, one has to consider the development which the comedy genre underwent during this time: from the mildly risqué sophisticated antics of the Doris Day/Rock Hudson films of the early 1960s to the cynical anxiety-ridden variety that started emanating towards the tail-end of the decade  with which the likes of Jack Lemmon, George Segal (the male lead of this film) and, in particular, Woody Allen (since he was his own writer and mostly directed himself as well) are forever associated.<br /><br />THE OWL AND THE PUSSYCAT is also notable for giving the current female singing sensation  Barbra Streisand  her first non-musical role; in fact, it led to other wacky comedy vehicles: foremost among them WHAT'S UP, DOC? (1972; Peter Bogdanovich's updating of the Howard Hawks classic BRINGING UP BABY [1938]) and FOR PETE'S SAKE (1974; whose trailer, included on the Columbia R2 DVD of the film under review, makes it seem like a good deal of fun). Thanks largely to his role in the film, Segal went on to do his fair share of sex comedies up till the early 1980s  with the most successful among them being A TOUCH OF CLASS (1973), which I should be acquiring shortly.<br /><br />Anyway, to get to the main item: the film can be seen as a modern variation on the perennial ""Pygmalion"" theme  with Segal as intellectual but, at the same time, neurotic and Streisand the uncouth yet liberated woman. There's no plot to speak of  instead, we follow the two stars on a logical pattern of location-hopping around New York throughout which their relationship blossoms: from his apartment when she's evicted because of his snitching (which leads to both of them being given the gate by the landlord), to them shacking up at the flat of Segal's pal (who drives them out because of their constant bickering), then going their separate ways till they meet again (after he has learned about her movie experience  a hilarious scene  and a 'colleague' of hers has gone to see him at his workplace) and go out together (where they're harassed by a band of thrill-seekers), after which they find themselves at the house of Segal's fiancée (a scene with an unexpectedly ironic punchline), to finally deciding to be completely honest with one another (beginning with their real names).<br /><br />In this respect, the film emerges to be overly talky (betraying its stage origins) but there is a reasonable amount of invention and wit in the undeniable comedy highlights: Segal dressing up as Death to scare the hiccupping Streisand; Segal using an aquarium as a TV set  with him delivering an impromptu news flash  to humor the insomniac Streisand (her addiction to TV is illustrated by a surprising reference to the Lionel Atwill/Lon Chaney Jr. horror pic MAN MADE MONSTER [1941]); the couple's argument over ""the sun spat morning"" line in the opening paragraph of a book by aspiring novelist Segal; Streisand's account of the sordid activities her clients invariably came up with (prompting Segal to describe her as ""a sexual Disneyland""), etc. The film's soundtrack is highlighted by several songs from jazz/rock band Blood, Sweat & Tears."
1,"'Nemesis' was the last book to feature Miss Marple written by Agatha Christie (the official final case 'Sleeping Murder' was written in the forties) and I've always had a very soft spot for it. I loved the characters and they are lovingly brought to life in this excellent BBC adaptation with Joan Hickson, terrific as ever, as Miss Marple.<br /><br />On the whole it is very faithful to the book. A few characters are dropped, the first (new) murder is slightly different and a couple of new characters are introduced. Personally I felt that the added character of Lionel Peel was unnecessary and rather irritating. Tour guide Madge was irritating in a different way but often quite amusing. It's largely because of Lionel that I don't award 10 out of 10! The other characters are beautifully done especially Helen Cherry as a dignified Miss Temple and all of the three weird sisters but particularly Margaret Tyzack who gives a towering performance as Clothilde. She threatens to go over the top towards the end but just avoids it. The female bodyguards are good value too and the episode contains one of my favourite Hickson lines...'An Archdeacon?!' <br /><br />This is another relatively early BBC Marple that looks wonderful and is has a gloriously nostalgic feel to it. Highly recommended."


Clean and tokenize the text data

In [23]:
chunksize=24000

In [24]:
trn_all_df = pd.read_csv(LM_PATH/'train_all.csv', chunksize=chunksize)
val_all_df = pd.read_csv(LM_PATH/'test_all.csv', chunksize=chunksize)

In [25]:
trn_toks, _ = process_examples(trn_all_df, [col_names[0]])
val_toks, _ = process_examples(val_all_df, [col_names[0]])

0
1
2
3
0


In [26]:
len(trn_toks), len(val_toks)

(90000, 10000)

In [27]:
print(trn_toks[0][:15])

['\n', '<bos>', 'xfld', '1', 'this', 'is', 'another', 'film', 'i', 'had', 'missed', 'out', 'on', 'a', 'number']


In [28]:
np.save(LM_PATH/'tmp'/'tok_trn.npy', trn_toks)
np.save(LM_PATH/'tmp'/'tok_val.npy', val_toks)

In [29]:
trn_toks = np.load(LM_PATH/'tmp'/'tok_trn.npy')
val_toks = np.load(LM_PATH/'tmp'/'tok_val.npy')

Define the vocab and build the datasets

In [30]:
max_vocab = 60000
min_freq = 2

In [31]:
%time trn_ds = LanguageDataset(np.concatenate((trn_toks, val_toks)), min_freq=min_freq, max_size=max_vocab)
%time val_ds = LanguageDataset(val_toks, vocab=trn_ds.vocab)

CPU times: user 9.74 s, sys: 392 ms, total: 10.1 s
Wall time: 10.1 s
CPU times: user 577 ms, sys: 32.6 ms, total: 609 ms
Wall time: 609 ms


In [32]:
print(len(trn_ds[0]), len(trn_ds.tokens), len(trn_ds.vocab.tokens), len(trn_ds))
print(len(val_ds[0]), len(val_ds.tokens), len(val_ds.vocab.tokens), len(val_ds))

27787632 27787632 60004 1
2795406 2795406 60004 1


In [33]:
print(trn_ds[0][:10])
print([ trn_ds.vocab.itos[idx] for idx in trn_ds[0][:10] ])

[ 42   2  43  41  15  11 175  27  14  85]
['\n', '<bos>', 'xfld', '1', 'this', 'is', 'another', 'film', 'i', 'had']


Configure and build the data loaders

In [34]:
bptt = 70
bsz = 52

In [35]:
trn_dl = LanguageModelLoader(trn_ds[0], bsz, bptt)
val_dl = LanguageModelLoader(val_ds[0], bsz, bptt)

md = LanguageModelData(PATH, 1, len(trn_ds.vocab.tokens), trn_dl, val_dl, bs=bsz, bptt=bptt)

In [36]:
len(md.trn_dl), md.nt, len(trn_ds), len(trn_ds.tokens)

(7632, 60004, 1, 27787632)

In [37]:
batch = next(iter(md.trn_dl))
print(batch[0].size()), print(batch[1].size())

torch.Size([69, 52])
torch.Size([3588])


(None, None)

### Text Model

In [38]:
trn_df = pd.read_csv(CLS_PATH/'train.csv', chunksize=chunksize)
val_df = pd.read_csv(CLS_PATH/'test.csv', chunksize=chunksize)

In [39]:
trn_toks, trn_labels = process_examples(trn_df, [col_names[0]], [col_names[1]])
val_toks, val_labels = process_examples(val_df, [col_names[0]], [col_names[1]])

0
1
0
1


In [40]:
np.save(CLS_PATH/'tmp'/'tok_trn.npy', trn_toks)
np.save(CLS_PATH/'tmp'/'tok_val.npy', val_toks)

np.save(CLS_PATH/'tmp'/'trn_labels.npy', trn_labels)
np.save(CLS_PATH/'tmp'/'val_labels.npy', val_labels)

In [41]:
trn_toks = np.load(CLS_PATH/'tmp'/'tok_trn.npy')
val_toks = np.load(CLS_PATH/'tmp'/'tok_val.npy')

trn_labels = np.load(CLS_PATH/'tmp'/'trn_labels.npy')
val_labels = np.load(CLS_PATH/'tmp'/'val_labels.npy')

In [42]:
freq = Counter(p for o in trn_toks for p in o)
freq.most_common(10)

[('the', 335844),
 ('.', 277583),
 (',', 275297),
 ('and', 163775),
 ('a', 162489),
 ('of', 145813),
 ('to', 135629),
 ('is', 110387),
 ('it', 95826),
 ('in', 93847)]

In [43]:
# itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))
# stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
# len(itos)

In [44]:
vocab = trn_ds.vocab

trn_clas = np.array([[vocab.stoi[o] for o in p] for p in trn_toks])
val_clas = np.array([[vocab.stoi[o] for o in p] for p in val_toks])

In [45]:
print(trn_clas[0])

[42, 2, 43, 41, 82, 9, 8, 145, 49, 60, 7108, 1428, 24, 8, 4388, 5, 529, 61, 23, 8, 651, 150, 16, 11, 8, 1359, 513, 9, 1904, 224, 5, 8, 11301, 7170, 329, 11, 663, 103, 48, 2028, 6, 1065, 2499, 47, 4, 955, 0, 9, 12, 18, 5991, 5, 496, 12, 2937, 1904, 4, 33, 240, 75, 23, 74, 774, 1411, 871, 255, 12, 56, 117, 141, 1509, 5, 76, 165, 51, 4, 970, 154, 39, 663, 141, 5, 4, 12894, 427, 73, 114, 2307, 328, 752, 10, 8, 845, 12432, 5, 30, 8, 1965, 648, 12, 18, 146, 93, 28, 253, 122, 23, 65, 67, 665, 47, 713, 102, 39512, 41578, 5, 713, 428, 3597, 22380, 7, 11229, 6384, 78, 39, 131, 3368, 5]


In [46]:
trn_labels = np.squeeze(trn_labels)
val_labels = np.squeeze(trn_labels)

In [47]:
trn_labels.shape, val_labels.shape

((25000,), (25000,))

In [48]:
trn_labels -= trn_labels.min()
val_labels -= val_labels.min()
c=int(trn_labels.max()) + 1

In [49]:
trn_ds = TextDataset(trn_clas, trn_labels)
val_ds = TextDataset(val_clas, val_labels)

trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bsz//2)
val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))

trn_dl = DataLoader(trn_ds, bsz//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bsz, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)

md = ModelData(PATH, trn_dl, val_dl)

In [50]:
len(trn_ds), trn_dl.batch_size, len(trn_dl), len(trn_dl.dataset), len(trn_ds[0][0]), trn_ds[0][1]

(25000, 26, 962, 25000, 128, 0)

In [51]:
x, y = next(iter(trn_dl))

In [52]:
print(x.size(), x.type(), y.size(), y.type(), bsz)

torch.Size([259, 26]) torch.LongTensor torch.Size([26]) torch.LongTensor 52


In [53]:
# x, y

## TOXIC- Multi-label problem

In [54]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

In [55]:
PATH= Path('data/toxic-comment')

(PATH/'models').mkdir(parents=True, exist_ok=True)
(PATH/'tmp').mkdir(exist_ok=True)

# [child for child in PATH.iterdir()]

In [56]:
raw_train_df = pd.read_csv(PATH/'train.csv')
test_df = pd.read_csv(PATH/'test.csv')
sample_subm_df = pd.read_csv(PATH/'sample_submission.csv')

txt_col = 'comment_text'

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

model_cols = ['id', txt_col] + label_cols + ['none']

Define training and validation datasets

In [57]:
# split the training data into a train and validatin dataset
trn, val = train_test_split(raw_train_df, test_size=0.05, random_state=9)
print(len(trn), len(val), len(trn[trn.none != 1]), len(val[val.none != 1]))

# save train, val, and test datasets for torchtext
trn[model_cols].to_csv(PATH/'train_ds.csv', index=None)
val[model_cols].to_csv(PATH/'valid_ds.csv', index=None)

# save full cleaned datasets (train+valid and test) as well
raw_train_df[model_cols].to_csv(PATH/'full_train_ds.csv', index=None)
test_df[['id', txt_col]].to_csv(PATH/'test_ds.csv', index=None)

91058 4793 9288 502


In [58]:
display(pd.read_csv(PATH/"full_train_ds.csv").head(2))
display(pd.read_csv(PATH/"test_ds.csv").head(2))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,22256635,"Nonsense? kiss off, geek. what I said is true. I'll have your account terminated.",1,0,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you did with this edit to W. S. Merwin. If you continue to do so, you will be blocked from editing. """,0,0,0,0,0,0,1


Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. SS500 .jpg)==
1,6102620,"::Kentuckiana is colloquial. Even though the area is often referred to as this, it (in my opinion) has never held the encyclopedic precision of ""Louisville metropolitian area"", which has a specific U.S. Census definition. Also, apparently Kentuckiana often refers to the local television viewing area, which isn't nearly contiguous with the official metro area. As you indicate, Kentuckiana seems to be more of a slang or marketing phenomena than anything we could pin down in encyclopedic terms here. That's why we see Wikipedia language like ""the Louisville metropolitan area, sometimes referred to as Kentuckiana"". That's my take on it. — •"


Clean and tokenize documents

In [59]:
max_features = 100000 #30000
min_freq = 10 #0
max_len = 175 #100

In [60]:
chunksize = 24000

In [61]:
trn_df = pd.read_csv(PATH/'train_ds.csv', chunksize=chunksize)
val_df = pd.read_csv(PATH/'valid_ds.csv', chunksize=chunksize)
test_df = pd.read_csv(PATH/'test_ds.csv', chunksize=chunksize)

In [62]:
trn_docs, trn_labels = process_examples(trn_df, [txt_col], label_cols, lbl_dtype=np.float32)
val_docs, val_labels = process_examples(val_df, [txt_col], label_cols, lbl_dtype=np.float32)

0
1
2
3
0


In [63]:
len(trn_docs), len(val_docs), trn_labels[0].shape, val_labels[0].shape

(91058, 4793, (6,), (6,))

In [64]:
freq = Counter(p for o in trn_docs for p in o)
freq.most_common(10)

[('xfld', 546348),
 ('0', 444487),
 ('.', 297272),
 ('the', 283109),
 (',', 269182),
 ('"', 214611),
 ('1', 196267),
 ('to', 169607),
 ('\n', 140276),
 ('i', 137007)]

Build vocab, fix lengths of each document, and numericalize

In [65]:
all_toks = []
for doc in np.concatenate((trn_docs, val_docs)): all_toks += doc

In [66]:
vocab = Vocab(all_toks, min_freq, max_features)

In [67]:
trn_docs = [ d[:max_len] + ['<pad>']*(max_len-len(d)) for d in trn_docs ]
val_docs = [ d[:max_len] + ['<pad>']*(max_len-len(d)) for d in val_docs ]

In [68]:
trn_toks = np.array([ [vocab.stoi[o] for o in p] for p in trn_docs ])
val_toks = np.array([ [vocab.stoi[o] for o in p] for p in val_docs ])

In [69]:
len(trn_toks), len(val_toks), len(trn_toks[0])

(91058, 4793, 175)

In [70]:
trn_labels = np.squeeze(trn_labels)
val_labels = np.squeeze(val_labels)

In [71]:
trn_labels.shape, val_labels.shape

((91058, 6), (4793, 6))

Build datasets and data loaders

In [72]:
bsz = 64
pretrained_vectors = None #'fasttext.en.300d'

In [73]:
trn_ds = TextDataset(trn_toks, trn_labels)
val_ds = TextDataset(val_toks, val_labels)

trn_samp = SortishSampler(trn_toks, key=lambda x: len(trn_toks[x]), bs=bsz//2)
val_samp = SortSampler(val_toks, key=lambda x: len(val_toks[x]))

trn_dl = DataLoader(trn_ds, bsz//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bsz, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)

md = ModelData(PATH, trn_dl, val_dl)

In [74]:
len(trn_ds), trn_dl.batch_size, len(trn_dl), len(trn_dl.dataset), len(trn_ds[0][0]), trn_ds[0][1]

(91058, 32, 2846, 91058, 175, array([0., 0., 0., 0., 0., 0.], dtype=float32))

In [75]:
x, y = next(iter(trn_dl))

In [76]:
print(x.size(), x.type(), y.size(), y.type(), bsz)

torch.Size([175, 32]) torch.LongTensor torch.Size([32, 6]) torch.FloatTensor 64


In [77]:
# x, y