In [1]:
from fastai.text import TextLMDataBunch as lmdb
from fastai.text.transform import Tokenizer
import pandas as pd
from pathlib import Path
from random import shuffle

### Get list of pre-processed data

In [3]:
p = Path('data/mdparsed')
files = p.ls()
shuffle(files)
files[:10]

[PosixPath('data/mdparsed/processed_part0057.csv'),
 PosixPath('data/mdparsed/processed_part0043.csv'),
 PosixPath('data/mdparsed/processed_part0024.csv'),
 PosixPath('data/mdparsed/processed_part0054.csv'),
 PosixPath('data/mdparsed/processed_part0021.csv'),
 PosixPath('data/mdparsed/processed_part0093.csv'),
 PosixPath('data/mdparsed/processed_part0001.csv'),
 PosixPath('data/mdparsed/processed_part0002.csv'),
 PosixPath('data/mdparsed/processed_part0096.csv'),
 PosixPath('data/mdparsed/processed_part0032.csv')]

### Read in Data

In [24]:
# valid_df = pd.concat(pd.read_csv(f) for f in files[:10])
# train_df = pd.concat(pd.read_csv(f) for f in files[10:])

In [25]:
# valid_df.to_pickle('valid_df.pkl')
# train_df.to_pickle('train_df.pkl')

In [None]:
valid_df = pd.read_pickle('valid_df.pkl').dropna().drop_duplicates()
train_df = pd.read_pickle('train_df.pkl').dropna().drop_duplicates()

In [None]:
print(f'rows in train_df:, {train_df.shape[0]:,}')
print(f'rows in valid_df:, {valid_df.shape[0]:,}')

In [None]:
train_df.head()

## Create Model

#### Instantiate Your Own Tokenizer

Because we already did lots of pre-procesing in `Test_Concurrency.ipynb`

In [None]:
def pass_through(x):
    return x

In [None]:
# only thing is we are changing pre_rules to be pass through since we have already done all of the pre-rules.  
# you don't want to accidentally apply pre-rules again otherwhise it will corrupt the data.
tokenizer = Tokenizer(pre_rules=[pass_through], n_cpus=30)

Specify path for saving language model artifacts

In [None]:
# !mkdir lang_model

In [None]:
path = Path('lang_model/')
path.absolute()

In [8]:
# Note you want your own tokenizer, without pre-rules
data_lm = lmdb.from_df(path=path,
                       train_df=train_df,
                       valid_df=valid_df,
                       text_cols='text',
                       tokenizer=tokenizer,
                       chunksize=6000000)

In [9]:
data_lm.save() # saves to self.path/data_save.pkl

From the docs for [load_data](https://docs.fast.ai/basic_data.html#load_data):

```
 Important: The arguments you passed when you created your first `DataBunch` aren't saved, so you should pass them here if you don't want the default.
```

In [13]:
!ls -lah lang_model/

total 27G
drwxr-xr-x 2 root root 6.0K May 14 03:59 .
drwxr-xr-x 6 root root 6.0K May 14 03:59 ..
-rw-r--r-- 1 root root  27G May 14 03:57 data_save.pkl


# Notes

If you don't have lots of data, you probably want to avoid multi-processing

In [69]:
%%time
tokenizer.process_all(['Hello world, I am here!! weee.'])

CPU times: user 24 ms, sys: 20.7 s, total: 20.7 s
Wall time: 22.3 s


[['xxmaj', 'hello', 'world', ',', 'i', 'am', 'here', '!', '!', 'weee', '.']]

In [40]:
%%time
tokenizer._process_all_1(['Hello world, I am here!! weee,'])

CPU times: user 56 ms, sys: 4 ms, total: 60 ms
Wall time: 60.9 ms


[['xxmaj', 'hello', 'world', ',', 'i', 'am', 'here', '!', '!', 'weee', ',']]

In [33]:
data_lm

TextLMDataBunch;

Train: LabelList (16385650 items)
x: LMTextList
xxbos xxxfldtitle xxunk throws exception when adding product via xxup api xxxfldbody xxmaj if xxmaj google xxmaj contents xxmaj experiments is enabled , adding a product using the xxxcdb xxup v1 / products / xxxcde endpoint causes xxunk xxxfilepath line 117 to throw xxxcdb invalidargumentexception xxxcde . xxmaj with contents experiments disabled the product add completes successfully . xxmaj example valid request json : 
  xxxcdb " product " : xxxjson 
  xxxcde,xxbos xxxfldtitle xxmaj grafana xxmaj kairosdb xxmaj top n rows xxxfldbody xxmaj when xxmaj grafana pulls the data it shows all the rows returned by the query , so can we please have a query option with something like xxunk ? xxmaj so we can only show limited number of rows on xxmaj grafana unlike now it just gets and shows everything . i have also requested a xxup ui option with xxmaj grafana guys where we can choose the number of rows we want to see under the g

In [25]:
next(iter(data_lm.train_dl))

[tensor([[  88,   14,   28,  ...,   56,  125, 1370],
         [   0,   33,   25,  ...,    5,   92,  242],
         [  48,   14,   76,  ..., 1700,   16, 4579],
         ...,
         [ 112,  194,    9,  ..., 8712,   36,    9],
         [  13,   68,   30,  ...,   14,   27,  229],
         [   6, 2286, 3982,  ..., 1405,   65,   25]]),
 tensor([[   14,    28,     2,  ...,   125,  1370,    15],
         [   33,    25,     6,  ...,    92,   242,    53],
         [   14,    76,     5,  ...,    16,  4579,    25],
         ...,
         [  194,     9,   659,  ...,    36,     9,   287],
         [   68,    30,    10,  ...,    27,   229, 12254],
         [ 2286,  3982,    23,  ...,    65,    25,    72]])]

In [26]:
tstbatch = _

In [29]:
#first x
tstbatch[0][0]

tensor([   88,    14,    28,     2,    22,     5,  3332,     5, 55055,     5,
          504,   678,  1535,    23,     5,    51,     5,  3332,  5974,     9,
          110,    30,   457,    83,     9,  1535,   825,    75,     9,   462,
           11,    89,    57,    64,   146,    56,    15,   462,   244,    40,
          209,    87,     0,    43,     5,    89,    64,    57,   105,   251,
         1993,   260,    26,  1535,    36,     5,  3332,  5092,   179,    30,
          120,   668,    24,   457,   601,    10,    19,    56,   125,  1370])

In [30]:
#second batch of x
tstbatch[0][1]

tensor([    0,    33,    25,     6,   187,    17,     5,  1424,    13,  9788,
          207,     5,  3234,   518, 10835,   124,    61,  3076,    94,    25,
         3126,    37, 10881,    31,  2831,   262,  2003,    17,     2,    22,
          450,   159,   462,   340,    13,   901,    25,    34,  1432,   427,
           13,   133,   371,    34,    17,    23,    76,     5,   165,     5,
          308,    14,     5,   146,  1272,   216,   827,   238,  1245,   115,
           77,    10,     5,   200,    94,    14,    71,     5,    92,   242])

In [32]:
# first target
tstbatch[1][0]

tensor([   14,    28,     2,    22,     5,  3332,     5, 55055,     5,   504,
          678,  1535,    23,     5,    51,     5,  3332,  5974,     9,   110,
           30,   457,    83,     9,  1535,   825,    75,     9,   462,    11,
           89,    57,    64,   146,    56,    15,   462,   244,    40,   209,
           87,     0,    43,     5,    89,    64,    57,   105,   251,  1993,
          260,    26,  1535,    36,     5,  3332,  5092,   179,    30,   120,
          668,    24,   457,   601,    10,    19,    56,   125,  1370,    15])

In [23]:
data_lm.train_ds[0]

(Text xxbos xxxfldtitle xxunk throws exception when adding product via xxup api xxxfldbody xxmaj if xxmaj google xxmaj contents xxmaj experiments is enabled , adding a product using the xxxcdb xxup v1 / products / xxxcde endpoint causes xxunk xxxfilepath line 117 to throw xxxcdb invalidargumentexception xxxcde . xxmaj with contents experiments disabled the product add completes successfully . xxmaj example valid request json : 
   xxxcdb " product " : xxxjson 
   xxxcde, EmptyLabel )