# ULMFiT + Siamese Network for Sentence Vectors
## Part One: Tokenizing
This notebook will tokenize the sentences from the OFFICE FAQ for use in the training of the Language Model (LM) and the InferSent network.


In [1]:
# Needed to load fastai library
import sys
sys.path.append("/data/home/makayser/notebooks/fastai/") # go to parent dir

In [2]:
from fastai.text import *
import json
import html
import re
import pickle
import random
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import model_selection
from functools import partial
from collections import Counter, defaultdict

import numpy as np
import torch
import torch.nn as nn
import torch.utils 
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import dataset, dataloader
import torch.optim as optim
import torch.nn.functional as F

import time
import math
import sys
import data

In [3]:
data_dir = '/data/home/makayser/qa_local/'
token_files = data_dir + 'token/'

fp_de_cl = data_dir + "office_help_de_cl.txt"
fp_de_lm = data_dir + "office_help_de_lm.txt"
fp_de_clean = data_dir + "office_help_de_clean.txt"

In [4]:
# load and process the all the sentences, just to get the LM trained
df = pd.read_csv(fp_de_lm, sep='\t', lineterminator='\r', encoding='utf-8'); len(df)

58827

In [5]:
df.head()

Unnamed: 0,id,eid,text,apps,length
0,1,1,Ändern der Ansicht des Outlook-Kalenders,Outlook,40
1,1,2,Ändern der Ansicht des Outlook-Kalenders,Outlook,40
2,1,3,Ändern der Ansicht des Outlook-Kalenders,Outlook,40
3,1,4,Ändern der Ansicht des Outlook-Kalenders,Outlook,40
4,1,5,Ändern der Ansicht des Outlook-Kalenders,Outlook,40


In [6]:
df.apps.drop_duplicates()

0             Outlook
6              Office
7               Excel
25               Lync
42         PowerPoint
48             Access
51               Word
60            Project
127             Visio
226        SharePoint
493          Kalender
543         Publisher
685           OneNote
744          InfoPath
1465              NaN
1634            Skype
1642        Microsoft
1817         OneDrive
2044             Duet
2405               MS
3813             Sway
4471             Mail
13071    Communicator
Name: apps, dtype: object

In [7]:
raw_text = df.text.tolist()
print(len(raw_text))

58827


In [8]:
#split the language model data into train and validation sets
lm_train, lm_valid = sklearn.model_selection.train_test_split(raw_text, test_size=0.1)
df_trn = pd.DataFrame(lm_train)
df_val = pd.DataFrame(lm_valid)

In [9]:
BOS = 'x_bos'  # beginning-of-sentence tag

re1 = re.compile(r'  +')

def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    x = ' '.join([out for out in x.split(' ') if len(out)<30])
    return re1.sub(' ', html.unescape(x))

def get_texts(df):
    texts = f'{BOS} ' + df[0].astype(str)
    texts = list(texts.apply(fixup).values)     
    tok = Tokenizer(lang='de', n_cpus=6).process_all(texts)
    return tok

In [10]:
%%time

tok_trn = np.concatenate(get_texts(df_trn))
tok_val = np.concatenate(get_texts(df_val))

CPU times: user 1.9 s, sys: 491 ms, total: 2.39 s
Wall time: 8.97 s


In [11]:
tok_val[:100]

array(['x_bos', 'benötigen', 'sie', 'hilfe?|', ..., 'können', 'in', 'einer', 'sharepoint-dokumentbibliothek'],
      dtype='<U29')

In [12]:
len(tok_val)

279673

In [13]:
#save our work
np.save(f'{token_files}tok_trn.npy', tok_trn)
np.save(f'{token_files}tok_val.npy', tok_val)

In [14]:
tok_trn = np.load(f'{token_files}tok_trn.npy')
tok_val = np.load(f'{token_files}tok_val.npy')

In [15]:
freq = Counter(np.concatenate([tok_trn, tok_val]))
freq.most_common(25)

[(',', 141040),
 ('sie', 134041),
 ('.', 109489),
 ('die', 74430),
 ('x_bos', 58827),
 ('der', 58361),
 ('in', 54619),
 ('und', 53781),
 ('auf', 50037),
 ('oder', 37544),
 ('von', 35697),
 ('xxup', 28005),
 ('für', 26787),
 ('können', 26091),
 ('klicken', 24133),
 ('zu', 23892),
 ('eine', 23541),
 ('das', 22125),
 ('den', 21585),
 ('einer', 21361),
 ('mit', 21318),
 ('wenn', 20678),
 ('ein', 17561),
 ('"', 16752),
 ('werden', 16578)]

In [16]:
len(freq)

41010

In [17]:
max_vocab = 60000
min_freq = 1
itos = [o for o, c in freq.most_common(max_vocab) if c>=min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')
stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

In [18]:
len(stoi)

41012

# Save the language model training set

In [19]:
trn_lm = np.array([stoi[p] for p in tok_trn])
val_lm = np.array([stoi[p] for p in tok_val])

In [20]:
#save results
pickle.dump(itos, open(f'{token_files}itos.pkl', 'wb'))
np.save(f'{token_files}trn_lm.npy', trn_lm)
np.save(f'{token_files}val_lm.npy', val_lm)

In [21]:
#load the results so we can pick it up from here 
itos = pickle.load(open(f'{token_files}itos.pkl', 'rb'))
trn_lm = np.load(f'{token_files}trn_lm.npy')
val_lm = np.load(f'{token_files}val_lm.npy')

stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
vocab_size = len(itos)
vocab_size

41012

In [22]:
for word in val_lm[:100]:
    print(itos[word], end=" ")

x_bos benötigen sie hilfe?| sicherstellen , dass sie über administratorrechte auf ihrem computer verfügen x_bos offline arbeiten in outlook x_bos sie können mithilfe der datenüberprüfung den typ der daten oder die werte einschränken , die benutzer in eine zelle eingeben . einer der häufigsten einsatzzwecke für die datenüberprüfung ist das erstellen einer dropdownliste . x_bos open another person 's exchange contacts x_bos word| x_bos erstellen eines power view-berichts in sharepoint starten von power view zum erstellen eines power view-berichts aus einer datei datenmodell in sharepoint server 2010 oder 2013 . modelle oder verbindungen zu datenmodellen , können in einer sharepoint-dokumentbibliothek 

# Build the sentence similarity dataset

In [23]:
# load and process the all the sentences, just to get the LM trained
df_cl = pd.read_csv(fp_de_cl, sep='\t', lineterminator='\r', encoding='utf-8'); len(df_cl)

77690

In [24]:
df_cl.head()

Unnamed: 0,id,eid,apps,sent1,sent2,length1,length2
0,2,1,Office,Ändern der Anzeigesprache und Zeitzone in Offi...,Sie können die Anzeigesprache und Zeitzone für...,9,14
1,3,1,Excel,Ändern der Anzeige von einem 3D-Diagramm,"In einem 3D-Diagramm wie 3D-Säulen, 3D-Linie o...",6,110
2,3,2,Excel,Ändern der Anzeige von einem 3D-Diagramm,"In einem 3D-Diagramm wie 3D-Säulen, 3D-Linie o...",6,110
3,3,3,Excel,Ändern der Anzeige von einem 3D-Diagramm,"In einem 3D-Diagramm wie 3D-Säulen, 3D-Linie o...",6,110
4,3,4,Excel,Ändern der Anzeige von einem 3D-Diagramm,"In einem 3D-Diagramm wie 3D-Säulen, 3D-Linie o...",6,110


In [25]:
from enum import Enum

class Entail(Enum):
    Outlook = 0
    Office = 1
    Excel = 2
    Lync = 3
    PowerPoint = 4
    Access = 5
    Word = 6
    Project = 7
    Visio = 8
    SharePoint = 9
    Kalender = 10
    Publisher = 11
    OneNote = 12
    InfoPath = 13
    Communicator = 14
    Skype = 15
    Microsoft = 16
    OneDrive = 17
    Duet = 18
    MS = 19
    Sway = 20
    Mail = 21

    
##TODO: drop NaN from categories
def fixup_cl(data, col):
    texts = f'{BOS} ' + data[col].astype(str)
    texts = list(texts.apply(fixup).values)     
    return texts

def load_sentence_pairs(df):
    lbls = df_cl["apps"].values
    s0s = fixup_cl(df_cl, "sent1") #BOS+" "+fixup(item['sentence1'])
    s1s = fixup_cl(df_cl, "sent2") #BOS+" "+fixup(item['sentence2'])
    
    labels = []
    avg_len = []
    for l, s0, s1 in zip(lbls,s0s,s1s):
        average_len = (len(s0)+len(s1))/2
        try:
            labels.append(Entail[l].value)
            avg_len.append(average_len)
        except Execption as e: #KeyError
            print(str(e))
            pass
        
    s0s = Tokenizer(lang='de', n_cpus=6).process_all(s0s) #Tokenizer().proc_all_mp(partition_by_cores(s0s))
    s1s = Tokenizer(lang='de', n_cpus=6).process_all(s1s) #Tokenizer().proc_all_mp(partition_by_cores(s1s))
    return np.array((s0s, s1s, labels, avg_len)).transpose()    


In [26]:
#split the language model data into train and validation sets
cl_train, cl_dev = sklearn.model_selection.train_test_split(df_cl, test_size=0.1)
cl_train, cl_test = sklearn.model_selection.train_test_split(cl_train, test_size=0.1)
df_cl_train = pd.DataFrame(cl_train); print(len(df_cl_train))
df_cl_dev = pd.DataFrame(cl_dev); print(len(df_cl_dev))
df_cl_test = pd.DataFrame(cl_test); print(len(df_cl_test))

62928
7769
6993


In [27]:
sentence_pairs_train = load_sentence_pairs(df_cl_train)
print('done train')
sentence_pairs_dev = load_sentence_pairs(df_cl_dev)
print('done dev')
sentence_pairs_test = load_sentence_pairs(df_cl_test)

done train
done dev


In [28]:
np.save(f'{token_files}trn_office.npy', sentence_pairs_train)
np.save(f'{token_files}dev_office.npy', sentence_pairs_dev)
np.save(f'{token_files}test_offuce.npy', sentence_pairs_test)

In [29]:
def tokenize(sentence_pairs):
    for i in range(len(sentence_pairs)):
        item = sentence_pairs[i]
        tok0 = [stoi[p] for p in item[0]]
        tok1 =[stoi[p] for p in item[1]]
        sentence_pairs[i] = np.array([tok0, tok1, item[2], item[3]])

tokenize(sentence_pairs_train)
tokenize(sentence_pairs_dev)
tokenize(sentence_pairs_test)

In [30]:
np.save(f'{token_files}office_tok_train.npy', sentence_pairs_train)
np.save(f'{token_files}office_tok_dev.npy', sentence_pairs_dev)
np.save(f'{token_files}office_tok_test.npy', sentence_pairs_test)

# Test

In [31]:
itos = pickle.load(open(f'{token_files}itos.pkl', 'rb'))

dev = np.load(f'{token_files}office_tok_dev.npy')
train = np.load(f'{token_files}office_tok_train.npy')
test = np.load(f'{token_files}office_tok_test.npy')

def print_sentence(s):
    sentence = ""
    for tok in s:
        sentence += " "+itos[tok]
    print(sentence)

print_sentence(train[0][0])
print_sentence(train[0][1])

print_sentence(dev[0][0])
print_sentence(dev[0][1])

print_sentence(test[0][0])
print_sentence(test[0][1])


 x_bos ändern der anzeigesprache und zeitzone in office 365 business
 x_bos sie können die anzeigesprache und zeitzone für alle ihre office   365-apps und -dienste gleichzeitig ändern .
 x_bos ändern der anzeigesprache und zeitzone in office 365 business
 x_bos sie können die anzeigesprache und zeitzone für alle ihre office   365-apps und -dienste gleichzeitig ändern .
 x_bos ändern der anzeigesprache und zeitzone in office 365 business
 x_bos sie können die anzeigesprache und zeitzone für alle ihre office   365-apps und -dienste gleichzeitig ändern .
