In [1]:
#hide
%reload_ext autoreload
%autoreload 2

from fastai2.basics import *
from fastai2.text.all import *
from fastai2.callback.all import *
from fastai2.data.transforms import RandomSplitter

from nlp import load_dataset
from pprint import pprint
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast, RobertaTokenizer

# Fastai + DataFrame

## Grab Data

In [2]:
senti_train_dataset = load_dataset('sentiment140', split='train', download_mode='reuse_cache_if_exists')
df=senti_train_dataset.data.to_pandas()
df.to_csv('sentiment140.csv')
df.head()

Downloading and preparing dataset sentiment140/sentiment140 (download: 77.59 MiB, generated: 214.21 MiB, total: 291.81 MiB) to /home/morgan/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset sentiment140 downloaded and prepared to /home/morgan/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0. Subsequent calls will reuse this data.


Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY


In [2]:
df=pd.read_csv('sentiment140.csv')
print(len(df))
df.head()

Unnamed: 0.1,Unnamed: 0,text,date,user,sentiment,query
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY


Fast Tok

In [3]:
model_name='roberta-base'

In [5]:
tokenizer=RobertaTokenizerFast.from_pretrained(model_name); tokenizer
tokenizer_vocab=tokenizer.get_vocab() 
tokenizer_vocab_ls = [k for k, v in sorted(tokenizer_vocab.items(), key=lambda item: item[1])]

In [41]:
class TransformersTokenizer(Transform):
    def __init__(self, tokenizer): self.tokenizer = tokenizer
    def encodes(self, o): 
        return tensor(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(o[0])))
    def decodes(self, o): return TitledStr(self.tokenizer.decode(x.cpu().numpy()))
       
class myLabel(Transform):
    def __init__(self): pass
    def encodes(self, o): return o[1]

## Create datasets

In [42]:
#%%timeit -n 1 -r 1  # 1% : 81.7ms, 10% : 448ms, 100% : 

frac=0.1
n = int(len(df) * frac)
samp = df.sample(n)
print(n)

splits = [list(range(len(samp))),list(range(len(samp)))]

vcb=samp['sentiment'].unique()

tfms = [[TransformersTokenizer(tokenizer)],
        [myLabel(), Categorize(vocab=vcb)]]

dsets = Datasets(samp[['text', 'sentiment']].values, tfms, splits=splits, dl_type=SortedDL)

160000


## Create Dataloaders

`dsets.dataloaders` running on all CPUs as expected

In [43]:
#%%timeit -n 1 -r 1  # 1% : 11.7s, 10% : 128s, 100% : 
bs=64
dls = dsets.dataloaders(bs=bs, before_batch=pad_input,  device='cuda')

o=dls.one_batch()
o[0].size(), o[1].size(), dls.device

(torch.Size([64, 469]), torch.Size([64]), 'cuda')

In [153]:
%%timeit -n 7 -r 1   # 1% : 1.45s (3r), 10% : 9.39s (3r), 100% : 68s
for i,b in enumerate(dls.train):
    if i % 1000 ==0 : print(i)

0
0
0
0
0
0
0
1.45 s ± 0 ns per loop (mean ± std. dev. of 1 run, 7 loops each)


# PyArrow / nlp 

In [44]:
class HfTokenize(Transform):
    def __init__(self,hfdset, tokenizer):
        self.hfdset, self.tokenizer, self.max_len=hfdset,tokenizer,tokenizer.max_len_single_sentence
    def encodes(self, i): return TensorText(self.hfdset[i]['input_ids'][:self.max_len])
    def decode(self, o=None, split_idx=None): return TitledStr(self.tokenizer.decode(list(o)))

class HfLabel(Transform):
    def __init__(self,hfdset): self.hfdset=hfdset
    def encodes(self, i): return int(self.hfdset[i]['sentiment'])
    
def convert_to_features(example_batch):
    encodings = tokenizer.batch_encode_plus(example_batch['text'], pad_to_max_length=False)
    return encodings

## Get the data

In [45]:
senti_dataset = load_dataset('sentiment140', split='train[:10%]')#, download_mode='reuse_cache_if_exists')

In [46]:
senti_dataset = senti_dataset.map(convert_to_features, batched=True)

In [47]:
senti_dataset.set_format(type='torch', columns=['input_ids','sentiment'])

In [48]:
senti_dataset, senti_dataset[0]['input_ids'], senti_dataset[0]['sentiment']

(Dataset(schema: {'text': 'string', 'date': 'string', 'user': 'string', 'sentiment': 'int64', 'query': 'string', 'input_ids': 'list<item: int64>', 'attention_mask': 'list<item: int64>'}, num_rows: 160000),
 tensor([    0,  1039, 43067,  2917,  2054,   640, 17137,   405, 19017,     4,
           175,    73,   176,   219,   134, 30094,   111,    83,  1401,     6,
            14,    18,    10,   741, 22539,     4,  1437,   370,   197,   102,
           300,   871,  8902,     9,  7470,  1053,     7,   109,    24,     4,
         25606,   495,     2]),
 tensor(0))

## Create datasets

In [49]:
#splits = RandomSplitter()(range(len(senti_dataset)))
print(len(senti_dataset))
splits = [list(range(len(senti_dataset))),list(range(len(senti_dataset)))]

vcb=np.unique(senti_dataset['sentiment'])
print(vcb)

tfms = [[HfTokenize(senti_dataset, tokenizer)],
        [HfLabel(senti_dataset), Categorize(vocab=vcb)]]

dsets = Datasets(range(len(senti_dataset)), tfms, splits=splits, dl_type=SortedDL)

160000
[0]


## Create dataloaders

Only runs on 1 CPU...

In [50]:
#%%timeit -n 1 -r 1  # 1% : 12s, 10% : 129s, 100% : 2320s
bs = 64
dls = dsets.dataloaders(bs=bs, before_batch=pad_input, device='cuda')

o=dls.one_batch()
o[0].size(), o[1].size(), o[0].device

2min 7s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [146]:
%%timeit -n 7 -r 1  # 1%: 1.52s
for i,b in enumerate(dls.train):
    if i % 1000 ==0 : print(i)

0
0
0
0
0
0
0
1.52 s ± 0 ns per loop (mean ± std. dev. of 1 run, 7 loops each)
