In [None]:
import numpy as np
from nltk.tokenize import word_tokenize
import string
from collections import Counter
import regex as re

In [None]:
# Single place to change when the data directory moves; every path below is
# built from it and keeps exactly the value it had when spelled out in full.
data_dir = '../data/'

# NLI: raw train/dev inputs and the tokenized files NLI_preprocessing writes.
nli_file = data_dir + 'shuf_allnli.txt'
out_file = data_dir + 'nli_data_tokenized.txt'
nli_dev_file = data_dir + 'nli_dev.txt'
out_dev_file = data_dir + 'nli_dev_tokenized.txt'

# CP: the two raw inputs and the joined, tab-separated output.
cp_in_file = data_dir + 'en.txt'
cp_lab_file = data_dir + 'pt.txt'
cp_out_file = data_dir + 'tree.txt'

# Vocabulary outputs and the intermediate files written by CP_preprocessing.
vocab_file = data_dir + 'vocab.txt'
cp_vocab_file = data_dir + 'cp_label.txt'
cp_temp_file1 = data_dir + 'cp_temp1.txt'
cp_temp_file2 = data_dir + 'cp_temp2.txt'

# NMT: raw English/German inputs, their preprocessed versions, and the joined output.
nmt_english = data_dir + 'english.txt'
nmt_german = data_dir + 'german.txt'
nmt_temp1 = data_dir + 'preprocessed_english.txt'
nmt_temp2 = data_dir + 'preprocessed_german.txt'
nmt_out = data_dir + 'nmt_data.txt'



In [None]:
# Size passed to every create_vocab call below; create_vocab reserves three of
# these entries for the special tokens UNK, PAD and EOS.
vocab_size = 30000

In [None]:
def create_vocab(input_files, vocab_size):
    """Build a word -> index vocabulary from whitespace-tokenized text files.

    Every line of every file is split on whitespace (tabs included) and the
    resulting tokens are counted. The vocabulary starts with the special
    tokens ``UNK`` (0), ``PAD`` (1) and ``EOS`` (2); the most frequent corpus
    tokens follow with consecutive indices starting at 3, so each index equals
    the entry's position in the mapping's iteration order.

    Args:
        input_files: list of paths to already-tokenized text files.
        vocab_size: total number of entries wanted, including the three
            special tokens.

    Returns:
        A ``Counter`` mapping token -> index. It holds fewer than
        ``vocab_size`` entries when the corpus does not contain enough
        distinct tokens. Because it is a ``Counter``, looking up a missing
        token yields 0, which is the index of ``UNK``.
    """
    freq = Counter()
    for path in input_files:
        with open(path, 'r') as fr:
            for index, line in enumerate(fr):
                freq.update(line.split())
                if index % 100000 == 0:
                    print(index)

    print("total words: " + str(len(freq)))

    special_tokens = ('UNK', 'PAD', 'EOS')
    vocab = Counter()
    for special_index, token in enumerate(special_tokens):
        vocab[token] = special_index

    # Number of slots left for real words once the specials are accounted for.
    word_budget = max(vocab_size - len(special_tokens), 0)
    # Ask for a few extra candidates so that corpus tokens which happen to be
    # spelled like a special token can be skipped without coming up short.
    candidates = freq.most_common(word_budget + len(special_tokens))

    next_index = len(special_tokens)
    limit = len(special_tokens) + word_budget
    for word, _count in candidates:
        if next_index >= limit:
            break
        if word in vocab:
            # Never let a corpus token overwrite a reserved special index.
            continue
        # Start after the specials: the first real word gets 3, not EOS's 2.
        vocab[word] = next_index
        next_index += 1

    return vocab

In [None]:
def write_vocab_file(vocab_file, vocab):
    """Write every key of ``vocab`` to ``vocab_file``, one per line.

    Keys are emitted in the mapping's iteration order; the values stored in
    ``vocab`` are not written. An existing file is overwritten.
    """
    with open(vocab_file, 'w') as sink:
        sink.writelines(token + '\n' for token in vocab)

In [None]:
def NLI_preprocessing(nli_file, out_file):
    """Convert a tab-separated NLI file into tokenized ``sen1<TAB>sen2<TAB>label`` lines.

    The first line of ``nli_file`` is skipped. For every other line, columns
    5 and 6 are taken as the two sentences and column 0 as the label. Each
    sentence is stripped, lower-cased, cleared of ``string.punctuation``
    characters, tokenized with ``word_tokenize`` and re-joined with single
    spaces; the label is stripped and lower-cased.

    Args:
        nli_file: path of the tab-separated input file.
        out_file: path of the file to write (overwritten if it exists).

    Raises:
        IndexError: if a data line has fewer than seven tab-separated columns.
    """
    # NOTE(review): skipping line 0 assumes it is a header row — confirm this
    # still holds for inputs that have been shuffled.

    # The deletion table is the same for every sentence, so build it once
    # instead of twice per input line.
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean(field):
        # Normalise one sentence column to space-separated lower-case tokens.
        return ' '.join(word_tokenize(field.strip().lower().translate(strip_punct)))

    with open(nli_file, 'r') as fr, open(out_file, 'w') as fw:
        for index, line in enumerate(fr):
            if index == 0:
                continue
            components = line.split('\t')
            sen1 = _clean(components[5])
            sen2 = _clean(components[6])
            label = components[0].strip().lower()
            fw.write(sen1 + '\t' + sen2 + '\t' + label + '\n')
            if index % 100000 == 0:
                print(index)

In [None]:
def CP_preprocessing(sen_file, out_file):
    """Copy ``sen_file`` to ``out_file`` with punctuation padded by spaces.

    Every character matched by the pattern below is replaced by itself with
    one space added on each side; nothing else on the line is altered. The
    running line count is printed after every 10000 lines.
    """
    punct_pattern = '([.,!?()#$%&/:;<=>?@{|}~\'\"])'
    with open(sen_file, 'r') as source, open(out_file, 'w') as sink:
        for line_no, raw_line in enumerate(source, start=1):
            sink.write(re.sub(punct_pattern, r' \1 ', raw_line))
            if line_no % 10000 == 0:
                print(line_no)
        

In [None]:
def NMT_preprocessing(sen_file, out_file):
    """Write a lower-cased, punctuation-free, tokenized copy of ``sen_file``.

    Each input line is stripped, lower-cased, cleared of
    ``string.punctuation`` characters and tokenized with ``word_tokenize``;
    the tokens are written space-separated, one output line per input line
    (an input line that ends up with no tokens yields an empty output line).
    The line index is printed every 100000 lines.

    Args:
        sen_file: path of the input text file.
        out_file: path of the file to write (overwritten if it exists).
    """
    # Loop-invariant: the same deletion table serves every line.
    strip_punct = str.maketrans('', '', string.punctuation)

    with open(sen_file, 'r') as fr, open(out_file, 'w') as fw:
        for index, line in enumerate(fr):
            cleaned = line.strip().lower().translate(strip_punct)
            fw.write(' '.join(word_tokenize(cleaned)) + '\n')
            if index % 100000 == 0:
                print(index)

In [None]:
def join_files(f1, f2, of):
    """Merge two files line by line into tab-separated pairs.

    Lines are paired positionally and iteration stops at the end of the
    shorter file. A pair is written as ``left<TAB>right`` (both stripped)
    only when neither side is blank; other pairs are dropped. The pair count
    is printed for every written pair whose position is a multiple of 10000.
    """
    with open(f1, 'r') as left_fh, open(f2, 'r') as right_fh, open(of, 'w') as out_fh:
        for pair_no, (left, right) in enumerate(zip(left_fh, right_fh), start=1):
            left = left.strip()
            right = right.strip()
            if left and right:
                out_fh.write(left + '\t' + right + '\n')
                if pair_no % 10000 == 0:
                    print(pair_no)

In [None]:
# Tokenize the NLI training file and the NLI dev file into their output files.
NLI_preprocessing(nli_file, out_file)
NLI_preprocessing(nli_dev_file, out_dev_file)

In [None]:
# Space-pad punctuation in both CP inputs; each goes to its own temp file,
# which the join_files call in the next cell reads.
CP_preprocessing(cp_in_file, cp_temp_file1)
CP_preprocessing(cp_lab_file, cp_temp_file2)

In [None]:
# Pair the two preprocessed CP files line by line (tab-separated) into cp_out_file.
join_files(cp_temp_file1, cp_temp_file2, cp_out_file)

In [None]:
# Preprocess the English and German sides separately, then pair them line by
# line (tab-separated) into nmt_out.
NMT_preprocessing(nmt_english, nmt_temp1)
NMT_preprocessing(nmt_german, nmt_temp2)
join_files(nmt_temp1, nmt_temp2, nmt_out)

In [None]:
# Build the shared vocabulary from the tokenized NLI data, the joined CP file
# and the preprocessed English NMT file.
# NOTE(review): create_vocab splits on all whitespace, so the label column of
# out_file and the cp_lab_file half of cp_out_file are counted as words too —
# confirm that is intended.
vocab = create_vocab([out_file, cp_out_file, nmt_temp1], vocab_size) # list of input files, size of vocab

In [None]:
# NOTE(review): this repeats, with identical arguments, the join_files call
# that ends the NMT preprocessing cell above; it rewrites nmt_out from the
# same two inputs. Consider dropping this cell.
join_files(nmt_temp1, nmt_temp2, nmt_out)

In [None]:
# Persist the shared vocabulary, one token per line.
write_vocab_file(vocab_file,vocab)

In [None]:
# Separate vocabulary built only from the preprocessed German NMT file.
# NOTE(review): the output path is spelled inline here rather than defined
# alongside the other paths in the configuration cell.
vocab_nmt = create_vocab([nmt_temp2], vocab_size)
write_vocab_file('../data/german_vocab.txt', vocab_nmt)

In [None]:
# Vocabulary of the tokens in the space-padded copy of cp_lab_file.
vocab_cp = create_vocab([cp_temp_file2], vocab_size) # list of input files, size of vocab
write_vocab_file(cp_vocab_file,vocab_cp)