In [1]:
import shutil

from fastai.text import *
from fastai.lm_rnn import *
import sentencepiece as sp

In [2]:
# Input data directories (relative paths).
BTW17 = Path("../data/btw17")
germeval2017  = Path("../data/germeval2017")
# Directory holding the working artifacts produced by the notebooks.
WORK = Path("../work/")
# List the work directory and the GermEval 2017 directory as a quick sanity check.
!ls {WORK}
!ls {germeval2017}/

btw-nouniq30k  ge2017  shared  shared-wiki  wiki30k
dev_v1.4.tsv  test_TIMESTAMP1.tsv  test_TIMESTAMP2.tsv	train_v1.4.tsv
dev_v1.4.xml  test_TIMESTAMP1.xml  test_TIMESTAMP2.xml	train_v1.4.xml


In [3]:
# Wikipedia data directory (`de` — presumably the German dump; confirm).
wiki=Path("../data/wiki/de")
# Show which files are available there.
!ls {wiki}

train.csv  train.txt  val.csv


# Prepare Wiki LM text

In [37]:
#train_wiki = pd.read_csv(wiki/"train.csv", header=None)
#np.savetxt(wiki/'train.txt', train_wiki.values, fmt="%s")
#!cat {BTW17}/text.txt >> {wiki}/train.txt

# Prepare GermEval 2017 data using the wiki SentencePiece model

In [4]:
def read_germeval(csvfn):
    """Load one GermEval 2017 TSV file and derive two labelled frames from it.

    Args:
        csvfn: file name, resolved relative to the `germeval2017` directory.

    Returns:
        A `(sen, rel)` tuple of DataFrames. Both contain the `text` column;
        `sen` adds integer `sentiment` codes (neutral=0, negative=1,
        positive=2) and `rel` adds integer `relevance` codes (False=0,
        True=1). Values outside those categories receive the code -1.
    """
    column_names = ["id", "text", "relevance", "sentiment", "aspect:polarity"]
    raw = pd.read_csv(
        germeval2017/csvfn,
        delimiter="\t",
        header=None,
        names=column_names,
        index_col=False,
    )

    # Encode both label columns as categorical codes with a fixed category order.
    sentiment_codes = pd.Categorical(
        raw.sentiment, categories=['neutral', 'negative', 'positive']
    ).codes
    relevance_codes = pd.Categorical(raw.relevance, categories=[False, True]).codes

    sen = pd.DataFrame(raw.text)
    sen['sentiment'] = sentiment_codes

    rel = pd.DataFrame(raw.text)
    rel['relevance'] = relevance_codes

    return sen, rel

# sen_trn,_ = read_germeval('train_v1.4.tsv')
# sen_val,_ = read_germeval('dev_v1.4.tsv')
# sen_test1,_ = read_germeval('test_TIMESTAMP1.tsv')
# sen_test2,_ = read_germeval('test_TIMESTAMP2.tsv')


In [5]:
# Load the dev split; only the sentiment frame is kept, the relevance frame is discarded.
sen,_ = read_germeval('dev_v1.4.tsv')


In [6]:
import concurrent.futures
# Load the SentencePiece model stored under the wiki30k work directory.
spp = sp.SentencePieceProcessor()
p = WORK/'wiki30k'
spp.Load(str( p/ 'tmp' / 'sp.model'))
# Number of pieces in the loaded model (the vocabulary size).
vs = spp.GetPieceSize() #len(itos)
# Extra encode option "bos:eos" — presumably wraps every encoded text in
# begin/end-of-sentence ids; TODO confirm against the SentencePiece docs.
spp.SetEncodeExtraOptions("bos:eos")
def toIds_raw(l):
    """Encode a single entry into SentencePiece ids with the module-level `spp`.

    The entry is converted with `str()` first so that non-string cells, such
    as a NaN text value (the original note points at line 164), can still be
    passed to the encoder.
    """
    as_text = str(l)
    return spp.EncodeAsIds(as_text)

def tokenize(lines):
    """Encode every entry of `lines` into SentencePiece ids using worker processes.

    Args:
        lines: sized iterable of texts (for example a pandas Series); each
            entry is handed to `toIds_raw`.

    Returns:
        A 1-D numpy array of dtype object with one id array per input entry,
        in input order.
    """
    WORKERS=num_cpus()
    # Aim for roughly 100 chunks per worker, but never fewer than one item per chunk.
    chunksize=max(1,int(len(lines)/100/WORKERS))
    with concurrent.futures.ProcessPoolExecutor() as e:
        encoded = [np.array(ids) for ids in e.map(toIds_raw, lines, chunksize=chunksize)]

    # Build the container explicitly instead of calling np.array(encoded):
    # recent NumPy versions raise on ragged (variable-length) input, and when
    # every row happens to have the same length np.array would return a 2-D
    # array rather than one entry per text.
    results = np.empty(len(encoded), dtype=object)
    for i, ids in enumerate(encoded):
        results[i] = ids
    return results

In [7]:
# Tokenize the dev texts and display one encoded example as a spot check.
sen['tok'] = tokenize(sen['text'])
sen['tok'][2]

array([    2,    21,   184,   279,  2155,    21,  1968,  1791,   534,   450,    18, 15560,    16,   140,
          14,  2801,  6913,     9, 10759,  3519,    18,   822,     6,     5,  5136,     4,  3807,     4,
       19985,     4,   119, 18201,    83,    34,    47,    52,    17,     4,    50,    49,   226,  4504,
          87,   110,   165,    79,   203,   183,   187,     3])

In [39]:
# trn, val, test
germeval2017_tmp = WORK/'wikige2017'/'tmp'
germeval2017_tmp.mkdir(exist_ok=True, parents=True)
!cp {p/ 'tmp' / 'sp.model'} {germeval2017_tmp / 'sp.model'} 
!cp {p/ 'tmp' / 'sp.vocab'} {germeval2017_tmp / 'sp.vocab'} 

def convert_germeval(fn, set_name):
    """Tokenize one GermEval 2017 file and store its ids and sentiment labels.

    Args:
        fn: TSV file name passed on to `read_germeval`.
        set_name: split name used in the output file names
            (`{set_name}_ids.npy` and `lbl_{set_name}.npy`).

    Returns:
        The `tok` column: one token-id array per row of the file.

    Both arrays are written into `germeval2017_tmp`.
    """
    frame, _ = read_germeval(fn)
    frame['tok'] = tokenize(frame['text'])

    ids_path = germeval2017_tmp/f'{set_name}_ids.npy'
    labels_path = germeval2017_tmp/f'lbl_{set_name}.npy'
    np.save(ids_path, np.array(frame['tok']))
    np.save(labels_path, np.array(frame['sentiment']))

    return frame['tok']
# 'tmp/val_{IDS}_bwd.npy'
# 'tmp/trn_{IDS}.npy'
# 'tmp/val_{IDS}.npy'
# 'tmp/lbl_trn{train_file_id}.npy'
# 'tmp/lbl_val.npy'

In [None]:
# sen_trn,_ = read_germeval('train_v1.4.tsv')
# sen_val,_ = read_germeval('dev_v1.4.tsv')
# sen_test1,_ = read_germeval('test_TIMESTAMP1.tsv')
# sen_test2,_ = read_germeval('test_TIMESTAMP2.tsv')

In [10]:
# One slot per split (train, val, test1, test2). Each slot gets its own list:
# `[[]]*4` would make all four slots refer to the same list object.
all_text = [[] for _ in range(4)]

In [11]:
# Convert the training split (written as trn_ids.npy / lbl_trn.npy) and keep its token ids in slot 0.
all_text[0] = convert_germeval('train_v1.4.tsv', 'trn')

In [12]:
# Slot 1 stays empty on purpose: the validation set is converted and saved,
# but its token ids are kept out of the combined text.
all_text[1] =  [] # do not add val set to all text
val = convert_germeval('dev_v1.4.tsv', 'val')

In [13]:
# Ad-hoc load of the first test file.
# NOTE(review): unlike read_germeval this call omits index_col=False, and `df`
# is not referenced again in the visible cells — confirm whether this cell is still needed.
cols=["id", "text", "relevance", "sentiment", "aspect:polarity"]
df = pd.read_csv(germeval2017/'test_TIMESTAMP1.tsv' , delimiter="\t", header=None, names=cols)

In [14]:
# Convert the first test file (written as test1_ids.npy / lbl_test1.npy) and keep its token ids in slot 2.
all_text[2] = convert_germeval('test_TIMESTAMP1.tsv', 'test1')

In [15]:
# Convert the second test file (written as test2_ids.npy / lbl_test2.npy) and keep its token ids in slot 3.
all_text[3] = convert_germeval('test_TIMESTAMP2.tsv', 'test2')

In [16]:
# Number of examples held in each slot (train, empty val placeholder, test1, test2).
[np.array(a).shape for a in all_text]

[(20941,), (0,), (2566,), (1842,)]

In [17]:
# Join the token-id sequences of all slots into a single array.
all_text_np = np.concatenate(all_text)

In [18]:
# Persist the combined token ids next to the per-split files.
all_ids_path = germeval2017_tmp/'trn_ids_all.npy'
np.save(all_ids_path, all_text_np)

In [23]:
# Print line 165 of the dev file. Its text field is the literal `null` —
# presumably the NaN case that the comment in toIds_raw refers to.
! head -n 165 {germeval2017}/dev_v1.4.tsv | tail -n 1

https://plus.google.com/106967535142671617878/posts/5x5NBQhH2Pv	null	false	neutral


# SentencePiece token statistics

In [30]:
# Token counts per validation example, computed once and reused below.
val_lengths = [len(s) for s in val]
print("Maximal length of sentence in tokens", max(val_lengths))
print("AVG Length in tokens", sum(val_lengths)/len(val))
print("Length of valuation set", len(val))

Maximal length of sentence in tokens 6663
AVG Length in tokens 136.91369969040247
Length of valuation set 2584
