# Convert XML to parallel raw text files

In [None]:
import os
from tqdm import tqdm
import xml.etree.ElementTree as ET
from collections import defaultdict

# Parse the sentence-alignment index once. The export loop further down walks
# the children of this root, reading each child's 'fromDoc' / 'toDoc'
# attributes and the 'xtargets' attribute of each grandchild.
word_align = ET.parse('./xces/en-vi.xml')
word_align_root = word_align.getroot()

### Extract all gzip files
~~~ bash
# Decompress every .gz three directories deep, keeping the archives (-k).
# "$file" is quoted so that paths containing spaces are not word-split.
for file in ./*/*/*/*.gz; do gzip -dk "$file"; done
~~~

In [None]:
def load_words(doc_path):
    """Load one extracted corpus document as ``{sentence id: [lower-cased words]}``.

    Args:
        doc_path: Document path relative to ``./xces`` as written in the
            alignment index, i.e. still carrying the ``.gz`` suffix of the
            compressed file. The suffix is dropped to reach the extracted XML.

    Returns:
        A ``defaultdict(list)`` keyed by the ``id`` attribute of every direct
        ``<s>`` child of the document root. Looking up an unknown id yields an
        empty list instead of raising, which the export loop relies on.
    """
    # Only strip the suffix when it is really there instead of blindly
    # chopping the last three characters.
    rel_path = doc_path[:-3] if doc_path.endswith('.gz') else doc_path

    res = defaultdict(list)
    # ET.parse reads bytes and honours the XML encoding declaration, so the
    # result does not depend on the platform's default text encoding.
    from_doc = ET.parse(os.path.join('./xces', rel_path)).getroot()
    for sentence in from_doc.findall('s'):
        # An empty <w/> element has text None; skip it rather than crash.
        res[sentence.attrib['id']] = [
            word.text.lower() for word in sentence.findall('w') if word.text
        ]
    return res

In [None]:
# Write one aligned sentence pair per line: en.txt receives the 'fromDoc' side
# and vi.txt the 'toDoc' side, so line N of both files belongs together.
# The context manager flushes and closes both files even if a document fails
# to load half-way through; UTF-8 is explicit because the text is Vietnamese.
with open('./xces/en.txt', 'w', encoding='utf-8') as fout_en, \
        open('./xces/vi.txt', 'w', encoding='utf-8') as fout_vi:
    for film in tqdm(word_align_root):
        src_doc_sentences = load_words(film.attrib['fromDoc'])
        dst_doc_sentences = load_words(film.attrib['toDoc'])

        for pair in film:
            # 'xtargets' is split on ';' into a source and a target id list,
            # each of which is a whitespace-separated run of sentence ids.
            dw_id = pair.attrib['xtargets'].split(';')
            # Skip malformed entries and links where one side is empty.
            if len(dw_id) != 2 or dw_id[0] == '' or dw_id[1] == '':
                continue

            # Join the words of each referenced sentence, then the sentences.
            src_sen = ' '.join(
                ' '.join(src_doc_sentences[i]) for i in dw_id[0].split()
            ).strip()
            dst_sen = ' '.join(
                ' '.join(dst_doc_sentences[i]) for i in dw_id[1].split()
            ).strip()

            # Both sides must contain text, otherwise the files would drift
            # out of line-for-line sync.
            if not src_sen or not dst_sen:
                continue

            fout_en.write(src_sen + '\n')
            fout_vi.write(dst_sen + '\n')

### Use [vnTokenizer](http://mim.hus.vnu.edu.vn/phuonglh/tools/vn.hus.nlp.tokenizer-4.1.1-bin.tar.gz) to tokenize vi.txt to vi.token.txt
~~~ bash
# Run from the directory that holds both ./xces and the unpacked tokenizer.
cd ./vn.hus.nlp.tokenizer-4.1.1-bin
# Tokenizes ../xces/vi.txt and writes the result to ../xces/vi.token.txt
bash vnTokenizer.sh -i ../xces/vi.txt -o ../xces/vi.token.txt
~~~

### Merge the two raw text files into fast_align format (`english ||| vietnamese`, one pair per line)

In [None]:
# Build the fast_align input: one "<english> ||| <vietnamese>" pair per line.
# A single `with` block closes all three files even if the loop raises, and
# the explicit UTF-8 keeps the Vietnamese text independent of the locale.
# NOTE(review): this reads the untokenized vi.txt and writes en-vi-2.txt, while
# the fast_align command reads en-vi.txt and the lookup cells load
# vi.token.txt - confirm which files the alignment should be built from.
with open('./xces/en.txt', 'r', encoding='utf-8') as fin_en, \
        open('./xces/vi.txt', 'r', encoding='utf-8') as fin_vi, \
        open('./xces/en-vi-2.txt', 'w', encoding='utf-8') as fout:
    # zip stops at the shorter input, so a line-count mismatch between the
    # two files is truncated silently.
    for (line_en, line_vi) in zip(fin_en, fin_vi):
        fout.write(line_en.strip() + ' ||| ' + line_vi.strip() + '\n')

# Run [fast_align](https://github.com/clab/fast_align) for word alignment
~~~ bash
git clone git@github.com:clab/fast_align.git
# Libraries installed before building fast_align
sudo apt-get install libgoogle-perftools-dev libsparsehash-dev
cd fast_align
mkdir build
cd build
cmake ..
make
# NOTE(review): the merge cell writes xces/en-vi-2.txt, but this command reads
# xces/en-vi.txt - confirm the input file name.
# The alignments go to xces/en-vi.align, which the lookup cells load.
./fast_align -i ../../xces/en-vi.txt -d -o -v > ../../xces/en-vi.align
~~~

~~~ raw
  log_e likelihood: -7.82932e+07
  log_2 likelihood: -1.12953e+08
     cross entropy: 6.2075
        perplexity: 73.9001
      posterior p0: 0
 posterior al-feat: 0
       size counts: 3945
~~~

# Main
This is the main part: look up a word in one language and inspect the words it was aligned to in the other.

In [1]:
from tqdm import tqdm
from IPython.display import Markdown, display
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
import sys

In [2]:
# lines[0] / lines[1]: the English / tokenized Vietnamese corpus, one list of
# whitespace-separated tokens per input line. Files are opened as UTF-8
# explicitly so the Vietnamese text decodes the same on every platform.
lines = [None, None]

with open('./xces/en.txt', 'r', encoding='utf-8') as f:
    lines[0] = [line.split() for line in f]

with open('./xces/vi.token.txt', 'r', encoding='utf-8') as f:
    lines[1] = [line.split() for line in f]

# align[n]: the alignment of line n as [index into lines[0][n], index into
# lines[1][n]] pairs, parsed from whitespace-separated "i-j" tokens.
with open('./xces/en-vi.align', 'r', encoding='utf-8') as f:
    align = [
        [[int(part) for part in pair.split('-')] for pair in line.split()]
        for line in f
    ]

In [3]:
# Single shared lemmatizer instance; word_matched() calls its lemmatize()
# method on every token when searching language 0.
lemmatizer = WordNetLemmatizer()

In [4]:
def word_matched(src_word, target_word, lang, pos):
    """Tell whether a corpus token counts as an occurrence of the query word.

    For language 0 the token is lemmatized with part of speech `pos` before
    the comparison; for any other language the token is compared verbatim.
    """
    candidate = lemmatizer.lemmatize(target_word, pos) if lang == 0 else target_word
    return src_word == candidate

def words_in_line(word, line, lang, pos):
    """Return the positions in `line` whose token matches `word`.

    Matching is delegated to word_matched(); the result is a list of indices
    in ascending order, empty when nothing matches.
    """
    hits = []
    for position, token in enumerate(line):
        if word_matched(word, token, lang, pos):
            hits.append(position)
    return hits

def find(word, lang, pos='n', min_occurrences=2):
    """Find occurrences of `word` in the corpus together with their alignments.

    Args:
        word: Query word; spaces are replaced by underscores before matching.
        lang: Index into `lines` of the language to search (0 or 1). The
            aligned words are taken from the other language, ``1 - lang``.
        pos: Part-of-speech tag forwarded to the lemmatizer (only used by
            word_matched() when ``lang == 0``).
        min_occurrences: A line is only considered when the word occurs at
            least this many times in it. The default of 2 keeps the original
            behaviour of looking only at lines with repeated occurrences;
            pass 1 to consider every line containing the word.

    Returns:
        A list of ``(line index, word index, aligned word indices)`` tuples,
        one per occurrence that has at least one alignment link.
    """
    word = word.replace(' ', '_')
    res = []

    # Wrapping the list itself (not the enumerate object) lets tqdm see its
    # length and show a total / ETA.
    for idx, line in enumerate(tqdm(lines[lang])):
        word_ids = words_in_line(word, line, lang, pos)
        if len(word_ids) < min_occurrences:
            continue

        for word_id in word_ids:
            # Positions in the other language linked to this occurrence.
            aligned = [pair[1 - lang] for pair in align[idx] if pair[lang] == word_id]
            if aligned:
                res.append((idx, word_id, aligned))

    return res

In [5]:
def print_lines(res_indexes, lang, max_lines = -1):
    """Render each hit as two Markdown lines with the matched words in bold.

    Args:
        res_indexes: Hits as produced by find(): sequences whose items 0, 1
            and 2 are the line index, the matched word index in language
            `lang`, and the aligned word indices in language ``1 - lang``.
        lang: Index into `lines` of the language that was searched.
        max_lines: Maximum number of hits to display; any value <= 0 (the
            default) displays all of them.
    """
    display(Markdown('---'))

    # Counting from 1 so `shown` is the number of hits displayed so far. The
    # previous counter was never incremented, which made every positive
    # max_lines stop after the first hit.
    for shown, res_id in enumerate(res_indexes, start=1):
        line_idx, word_idx, aligned_ids = res_id[0], res_id[1], res_id[2]

        # Source sentence: bold only the matched occurrence.
        src_sen = ''.join(
            (' **' + w + '** ') if idx == word_idx else (' ' + w + ' ')
            for idx, w in enumerate(lines[lang][line_idx])
        )
        # Target sentence: bold every word aligned to that occurrence.
        dst_sen = ''.join(
            (' **' + w + '** ') if idx in aligned_ids else (' ' + w + ' ')
            for idx, w in enumerate(lines[1-lang][line_idx])
        )

        display(Markdown(src_sen))
        display(Markdown(dst_sen))
        display(Markdown('---'))

        if max_lines > 0 and shown >= max_lines:
            break

def print_freq(res_indexes, lang):
    """Print how often each aligned phrase occurs, most frequent first.

    Args:
        res_indexes: Hits as produced by find(); item 0 is the line index and
            item 2 the aligned word indices in language ``1 - lang``.
        lang: Index into `lines` of the language that was searched.

    The aligned words of a hit are joined with spaces to form the phrase.
    Hits that cannot be resolved (for example an alignment index outside the
    target line) are reported and skipped so the summary is still printed.
    """
    res = defaultdict(int)
    for res_id in res_indexes:
        try:
            tmp_word = ' '.join([lines[1-lang][res_id[0]][idx] for idx in res_id[2]])
            res[tmp_word] += 1
        except Exception:
            # Best effort per hit, but unlike a bare `except:` this lets
            # KeyboardInterrupt / SystemExit stop a long run.
            print("Unexpected error:", sys.exc_info()[0])
    print(sorted(res.items(), key=lambda x: x[1], reverse=True))

In [8]:
# Search language 0 (en.txt) for 'bake', lemmatizing tokens with pos 'v'.
# With its default settings find() only keeps lines in which the word occurs
# more than once.
t = find('bake', 0, 'v')

2253696it [01:26, 26066.87it/s]


In [9]:
# First the frequency table of the aligned phrases, then every hit rendered
# with the matched / aligned words in bold.
print_freq(t, 0)
print_lines(t, 0)

[('nướng', 3), ('đậu_trắng sốt cà', 1), ('đậu_trắng sốt', 1), ('cà', 1)]


---

 -  um  ,  baking-  -  baking  soda  .  not  **baking**  powder  . 

 sô-đa  chứ  không  phải  là  **nướng**  bột  . 

---

 **baked**  potato  with  cheese  and  baked  beans  . 

 khoai_tây  **nướng**  phô_mai  và  đậu  . 

---

 your  choices  are  **baked**  potato  ,  baked  beans  ... 

 anh  chọn  khoai_tây  **nướng**  ,  đậu_trắng  sốt  cà  ... 

---

 your  choices  are  baked  potato  ,  **baked**  beans  ... 

 anh  chọn  khoai_tây  nướng  ,  **đậu_trắng**  **sốt**  **cà**  ... 

---

 -  **baked**  beans  .  the  baked  beans  . 

 **đậu_trắng**  **sốt**  cà  . 

---

 -  baked  beans  .  the  **baked**  beans  . 

 đậu_trắng  sốt  **cà**  . 

---

In [None]:
# Raw hits: (line index, word index, aligned word indices) tuples
print(t)