# Build dictionary on Phoenix dataset

## Util functions

In [2]:
import pandas as pd
import spacy
from collections import Counter
import time

def build_dictionary(file):
    start = time.time()
    lang_model = spacy.load('de_core_news_sm')
    end = time.time()
    print('load lang_model cost %.3f s'%(end-start))
    train = []
    # 合并annotation中的语料
    df = pd.read_csv(file,sep='|')
    for i in range(len(df)):
        train.append(df.loc[i]['annotation'])

    # Create a dictionary which maps tokens to indices (train contains all the training sentences)
    freq_list = Counter()
    punctuation = ['_','NULL','ON','OFF','EMOTION','LEFTHAND','IX','PU']
    for sentence in train:
        sentence = [tok.text for tok in lang_model.tokenizer(sentence) if not tok.text in punctuation]
        freq_list.update(sentence)

    # 按照词的出现频率建立词典，词频越高索引越靠前
    freq_list = sorted(freq_list.items(),key=lambda item:item[1],reverse=True)
    dictionary = {}
    dictionary['<pad>'] = 0
    dictionary['<bos>'] = 1
    dictionary['<eos>'] = 2
    dictionary['<unk>'] = 3
    count = 0
    for i,item in enumerate(freq_list):
        if item[1] >= 2:
            dictionary[item[0]] = count+4
            count += 1
        else:
            dictionary[item[0]] = 3
    print("Build dictionary successfully!")
    return dictionary

def process_sentence(sentence):
    punctuation = ['_','NULL','ON','OFF','EMOTION','LEFTHAND','IX','PU']
    sentence = [tok.text for tok in lang_model.tokenizer(sentence) 
        if not tok.text in punctuation]
    # sentence = ['<bos>'] +   sentence + ['<eos>']
    print(sentence)
    indices = [dictionary[word] for word in sentence 
        if word in dictionary.keys()]
    return indices

In [3]:
train_file = "corpus/train.corpus.csv"
dictionary = build_dictionary(train_file)

load lang_model cost 1.232 s


TypeError: 'tuple' object is not callable

## Find OOV words

In [2]:
train_file = '/Users/liweijie/SLR/scripts/corpus/train.corpus.csv'
train_dict = build_dictionary([train_file])
dev_file = '/Users/liweijie/SLR/scripts/corpus/dev.corpus.csv'
dev_dict = build_dictionary([dev_file])
test_file = '/Users/liweijie/SLR/scripts/corpus/test.corpus.csv'
test_dict = build_dictionary([test_file])
out_of_vocab = []
for k,v in dev_dict.items():
    if not k in train_dict.keys():
        out_of_vocab.append(k)
for k,v in test_dict.items():
    if not k in train_dict.keys() and not k in dev_dict.keys():
        out_of_vocab.append(k)
out_of_vocab

load lang_model cost 1.262 s
Build dictionary successfully!
load lang_model cost 0.955 s
Build dictionary successfully!
load lang_model cost 0.971 s
Build dictionary successfully!


[' ',
 'E+R+Z',
 'NICHT-WOLKE',
 'S+H',
 'C+M',
 'HARZ',
 'SECHZIG',
 'HINREICHEND',
 'MEIN',
 'SICHT',
 'RHEIN-PFALZ',
 'WOANDERS',
 'K+U+N+A',
 'MITBRINGEN',
 'ZWEIHUNDERT',
 'WITTERUNG',
 'ZEHN-STUNDEN',
 'RUHIGER',
 'NASE',
 'HEINS',
 'SECHSZEHNTE',
 'ORIENTIEREN',
 'EINHALB',
 'NICHT-MOEGEN',
 'STUFENWEISE',
 'WEISS',
 'GAENSEFUSS',
 'ZWEITAUSEND',
 'UEBERSPRINGEN',
 'SEIN',
 'MAL-SO',
 'MANNHEIM',
 'SLOWAKEI',
 'BEIDE',
 'MOECHTEN',
 'ENTTAEUSCHT',
 'KREISEN',
 'MENSCHEN',
 'FROH',
 'KNOSPE-ABFALLEN',
 'ERNTE',
 'O+P+H+E+L+I+A',
 'ABKUEHLEN',
 'POSITION',
 'BILD',
 'NICHT-WAHRSCHEINLICH',
 'ANKLICKEN',
 'AB-SO',
 'VERLAENGERN',
 'VESCHWINDEN',
 'VON-UNTEN',
 'NACH-NORD',
 'S+L+Y',
 'DAS-WARS',
 'TRINKEN',
 'SELBST',
 'ENTSCHEIDUNG',
 'NICHT-IN-KOMMEND',
 'SAEGE',
 'GARTEN',
 'AUTOBAHN',
 'ZWISCHEN-MITTE',
 'NEBEN',
 'BESTE',
 'SCHNEEVERWEHUNG']