Цель исследования - попробовать написать модель машинного перевода, познакомиться с инструментами машинного перевода

In [1]:
# импорты
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.translate import AlignedSent, Alignment
from nltk.translate.phrase_based import phrase_extraction
import spacy

In [None]:
# Download the NLTK data packages 'punkt' and 'wordnet'
# (presumably the resources behind word_tokenize and WordNetLemmatizer,
# which are imported above -- confirm against the NLTK docs).
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Создание своей модели

In [30]:
# Load the first dictionary: a CSV file with 'eng' and 'ru' columns is turned
# into a plain {english_word: russian_word} mapping.
# The encoding is given explicitly because the 'ru' column holds Cyrillic text
# and the platform default encoding is not guaranteed to be UTF-8
# (assumes minidict.csv is saved as UTF-8).
with open('minidict.csv', newline='', encoding='utf-8') as csvfile:
    words = {row['eng']: row['ru'] for row in csv.DictReader(csvfile)}

Сначала попробуем создать модель, которая переводит слова

In [None]:
# word-level translation
class wordmodel:
    """Dictionary-based translator for a single word.

    The model is a plain lookup table: ``fit`` stores the mapping and
    ``predict`` looks a word up in it, falling back to a substring search
    over the dictionary keys.
    """

    def __init__(self):
        # Start with an empty dictionary so predict() is usable before fit().
        self.w = {}
        print("I am word translator")

    def fit(self, words):
        """Remember the word correspondences.

        Args:
            words: dict mapping a source word to its translation.
        """
        self.w = words

    def predict(self, word):
        """Translate one word.

        Args:
            word: source-language word.

        Returns:
            The stored translation when ``word`` is a dictionary key;
            otherwise the translation of the first key that contains
            ``word`` as a substring, suffixed with ``'-some_gloss'``;
            otherwise ``word`` itself (a notice is printed in that case).
        """
        # Exact dictionary hit.
        if word in self.w:
            return self.w[word]

        # Possibly a related word: look for a key containing the string.
        # An empty string is a substring of every key, so it is not searched.
        if word:
            for entry, translation in self.w.items():
                if word in entry:
                    return translation + '-some_gloss'

        # Nothing matched: report it, then hand the word back untranslated.
        print('Непереводимое слово')
        return word

In [None]:
# Try the word model: fit it on the loaded dictionary, then look up an exact
# key ('cat') and a partial key ('ca'). Only the value of the last expression
# is displayed as the cell output.
w1 = wordmodel()
w1.fit(words)
w1.predict('cat')
w1.predict('ca')

I am word translator


'кошка-some_gloss'

Следующий шаг - модель, которая может работать с несколькими словами

In [None]:
# model with tokenization and lemmatization
class sentmodel:
    """Dictionary-based translator for a string of several words.

    The input is tokenized, every alphabetic token is lower-cased and
    lemmatized, and each lemma is translated independently through the
    dictionary given to ``fit``.
    """

    def __init__(self):
        # Start with an empty dictionary so predict() is usable before fit().
        self.w = {}
        print("I am machine translator")

    def fit(self, words):
        """Remember the word correspondences.

        Args:
            words: dict mapping a source word to its translation.
        """
        self.w = words

    def _translate_lemma(self, lemma):
        """Translate one lemma: exact key, else first key containing it, else '???'."""
        if lemma in self.w:
            return self.w[lemma]
        # Possibly a related word: take the first key that contains the lemma.
        # Returning here is what stops the search; without it every partial
        # match AND the '???' placeholder would be emitted for the same token.
        for entry, translation in self.w.items():
            if lemma in entry:
                return translation + '-some_gloss'
        return '???'

    def predict(self, string):
        """Translate ``string`` word by word.

        Args:
            string: source-language text.

        Returns:
            The translations of the alphabetic tokens, in order, each followed
            by a single space (so a non-empty result ends with a space).
            Tokens with no dictionary match are rendered as ``'???'``.
        """
        # Tokenize, dropping anything that is not purely alphabetic.
        tokens = [w.lower() for w in word_tokenize(string) if w.isalpha()]
        wnl = WordNetLemmatizer()
        # Lemmatize each token, then translate the lemma.
        trans = [self._translate_lemma(wnl.lemmatize(tok)) for tok in tokens]
        return ''.join(piece + ' ' for piece in trans)

In [None]:
# Try the sentence model on a three-word input using the same dictionary.
w2 = sentmodel()
w2.fit(words)
w2.predict('cat eats dog')

I am machine translator


'кошка ??? собака '

Проблемы данного подхода:
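1. Разбор слова - лемматизатор WordNet по умолчанию считает слово существительным, поэтому без указания части речи не находит лемму глагола (в примере выше eats не сводится к eat)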
2. Обработка перевода - необходимо как-то учесть морфосинтаксис целевого языка
3. Ограниченность словарем

In [31]:
class syntmodel:
    """Dictionary-based translator that analyses the input with spaCy.

    Each alphabetic token of the parsed input is translated through the
    dictionary given to ``fit`` using its spaCy lemma. The dependency label
    of every translated token is recorded alongside the translation.
    """

    def __init__(self):
        # Start with an empty dictionary so predict() is usable before fit().
        self.w = {}
        # spaCy pipeline; loaded on the first predict() call and then reused,
        # because loading the model is far more expensive than running it.
        self._nlp = None
        # Dependency labels from the most recent predict() call.
        self.dependencies = []
        print("I am a smarter machine translator")

    def fit(self, words):
        """Remember the word correspondences.

        Args:
            words: dict mapping a source word to its translation.
        """
        self.w = words

    def _translate_lemma(self, lemma):
        """Return (translation, found) for one lemma.

        Exact key first, then the first key containing the lemma (suffixed
        with '-some_gloss'); ('???', False) when nothing matches.
        """
        if lemma in self.w:
            return self.w[lemma], True
        # Possibly a related word: take the first key that contains the lemma.
        for entry, translation in self.w.items():
            if lemma in entry:
                return translation + '-some_gloss', True
        return '???', False

    def predict(self, string):
        """Translate ``string`` token by token.

        Args:
            string: source-language (English) text.

        Returns:
            The translations of the alphabetic tokens, in order, each followed
            by a single space. Tokens with no dictionary match are rendered
            as ``'???'``. ``self.dependencies`` holds, in the same order, the
            spaCy dependency label of each matched token and ``'???'`` for
            each unmatched one.
        """
        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            nlp = self._nlp = spacy.load("en_core_web_sm")

        # Both lists are built once for the whole input and the result is
        # returned only after every token has been processed.
        trans = []
        dependencies = []
        for token in nlp(string):
            # Skip punctuation, numbers, etc. and compare lower-cased lemmas,
            # mirroring the filtering done by sentmodel.
            if not token.is_alpha:
                continue
            translation, found = self._translate_lemma(token.lemma_.lower())
            trans.append(translation)
            dependencies.append(token.dep_ if found else '???')

        self.dependencies = dependencies
        return ''.join(piece + ' ' for piece in trans)

Проблемы данного подхода:
1. Обработка перевода - необходимо как-то учесть морфосинтаксис целевого языка; для синтаксического парсинга необходимо точно знать направление ветвления языка
2. Ограниченность словарем

Как решаются эти проблемы?
1. Alignment
2. Обучение на корпусных данных вместо использования словарей

# Обучение существующей модели
на материале Библии

In [2]:
# Fetch the fast_align sources into ./fast_align.
! git clone https://github.com/clab/fast_align.git

Cloning into 'fast_align'...
remote: Enumerating objects: 213, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 213 (delta 2), reused 4 (delta 2), pack-reused 204[K
Receiving objects: 100% (213/213), 70.68 KiB | 5.89 MiB/s, done.
Resolving deltas: 100% (110/110), done.


In [3]:
! mkdir /content/fast_align/build 
# create a directory for the build
%cd fast_align/build 
# change into that directory
# ! pwd
! cmake .. # generate the build files
! make # compile
%cd /content/ 
# go back to the home directory
# ! pwd

/content/fast_align/build
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Could NOT find SparseHash (missing: SPARSEHASH_INCLUDE_DIR) 
-- Configuring done
-- Generating done
-- Build files have been written to: /content/fast_align/build
[35m[1mScanning dependencies of target atools[0m
[ 16%] [32mBuilding CXX object CMakeFiles/atools.dir/src/alignment_io.cc.o[0m
[ 33%] [32mBuilding CXX object CMakeFiles/atools.dir/src/atools.cc.o[0m
[ 50%] 

In [6]:
# Download the English-Russian bible-uedin corpus (Moses-format zip) from OPUS.
# wget names the saved file after the URL, query string included.
! wget https://opus.nlpl.eu/download.php?f=bible-uedin/v1/moses/en-ru.txt.zip

--2022-03-30 20:26:59--  https://opus.nlpl.eu/download.php?f=bible-uedin/v1/moses/en-ru.txt.zip
Resolving opus.nlpl.eu (opus.nlpl.eu)... 193.166.25.9
Connecting to opus.nlpl.eu (opus.nlpl.eu)|193.166.25.9|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/en-ru.txt.zip [following]
--2022-03-30 20:27:00--  https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/en-ru.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5809916 (5.5M) [application/zip]
Saving to: ‘download.php?f=bible-uedin%2Fv1%2Fmoses%2Fen-ru.txt.zip.1’


2022-03-30 20:27:02 (5.19 MB/s) - ‘download.php?f=bible-uedin%2Fv1%2Fmoses%2Fen-ru.txt.zip.1’ saved [5809916/5809916]



In [10]:
# Unpack the downloaded archive; it contains the parallel files
# bible-uedin.en-ru.en and bible-uedin.en-ru.ru.
! unzip download.php?f=bible-uedin%2Fv1%2Fmoses%2Fen-ru.txt.zip

Archive:  download.php?f=bible-uedin%2Fv1%2Fmoses%2Fen-ru.txt.zip
  inflating: README                  
  inflating: LICENSE                 
  inflating: bible-uedin.en-ru.en    
  inflating: bible-uedin.en-ru.ru    
  inflating: bible-uedin.en-ru.xml   


In [12]:
# Join the two files line by line (tab-separated), then replace the first tab
# on each line with ' ||| ', giving one "english ||| russian" pair per line.
# This file is what fast_align is given as input (-i) below.
! paste bible-uedin.en-ru.en bible-uedin.en-ru.ru  -d "\t" | sed 's/\t/ ||| /' > bible.en-ru

In [13]:
# Inspect the first lines of the combined parallel file.
! head bible.en-ru

In the beginning God created the heavens and the earth. ||| В начале сотворил Бог небо и землю.
Now the earth was formless and empty. Darkness was on the surface of the deep. God's Spirit was hovering over the surface of the waters. ||| Земля же была безвидна и пуста, и тьма над бездною, и Дух Божий носился над водою.
God said, "Let there be light," and there was light. ||| И сказал Бог: да будет свет. И сталсвет.
God saw the light, and saw that it was good. God divided the light from the darkness. ||| И увидел Бог свет, что он хорош, и отделил Бог свет от тьмы.
God called the light "day," and the darkness he called "night." There was evening and there was morning, one day. ||| И назвал Бог свет днем, а тьму ночью. И был вечер, и было утро: день один.
God said, "Let there be an expanse in the middle of the waters, and let it divide the waters from the waters." ||| И сказал Бог: да будет твердь посреди воды, и да отделяет она воду от воды.
God made the expanse, and divided the waters wh

In [14]:
# Rename the cloned repository so that the name ./fast_align is free for the
# binary that is copied next to the notebook afterwards.
! mv ./fast_align ./fast_align_git

In [15]:
# Copy the two built executables (fast_align and atools) into the working directory.
! cp ./fast_align_git/build/fast_align ./fast_align
! cp ./fast_align_git/build/atools ./atools

In [25]:
# Train the forward alignment on the parallel file and write it to forward.align.
# NOTE(review): -d -o -v appear to be the options suggested in the fast_align
# README -- see it for their exact meaning.
! ./fast_align -i bible.en-ru -d -o -v > forward.align

ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
.................................................. [50000]
............
expected target length = source length * 0.742372
ITERATION 1
.................................................. [50000]
............
  log_e likelihood: -2.34116e+07
  log_2 likelihood: -3.37758e+07
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.16928
       size counts: 1424
ITERATION 2
.................................................. [50000]
............
  log_e likelihood: -8.04576e+06
  log_2 likelihood: -1.16076e+07
     cross entropy: 10.2747
        perplexity: 1238.78
      posterior p0: 0.044045
 posterior al-feat: -0.115135
       size counts: 1424
  1  model al-feat: -0.237202 (tension=4)
  2  model al-feat: -0.171761 (tension=6.44134)
  3  model al-feat: -0.151355 (tension=7.57387)
  4  model al-feat: -0.140536 (tension=8.29828)
  5  model al-feat: -0.13379 (tension=8.8063)
  6  model al-feat: -0.129221 (tens

In [26]:
# Same run with the extra -r flag; the output is stored as reverse.align
# (the alignment in the opposite direction, to be combined with forward.align).
! ./fast_align -i bible.en-ru -d -o -v -r > reverse.align

ARG=i
ARG=d
ARG=o
ARG=v
ARG=r
INITIAL PASS 
.................................................. [50000]
............
expected target length = source length * 1.39587
ITERATION 1
.................................................. [50000]
............
  log_e likelihood: -3.213e+07
  log_2 likelihood: -4.63538e+07
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.167123
       size counts: 1424
ITERATION 2
.................................................. [50000]
............
  log_e likelihood: -8.57717e+06
  log_2 likelihood: -1.23742e+07
     cross entropy: 7.98116
        perplexity: 252.679
      posterior p0: 0.054358
 posterior al-feat: -0.117062
       size counts: 1424
  1  model al-feat: -0.121194 (tension=4)
  2  model al-feat: -0.119894 (tension=4.08264)
  3  model al-feat: -0.119014 (tension=4.13928)
  4  model al-feat: -0.118412 (tension=4.17831)
  5  model al-feat: -0.117998 (tension=4.2053)
  6  model al-feat: -0.117712 

In [27]:
# Combine the forward and reverse alignments into one with atools,
# using the grow-diag-final-and method, and write it to result.align.
! ./atools -i forward.align -j reverse.align -c grow-diag-final-and > result.align

In [28]:
# Inspect the first lines of the combined alignment: one line per sentence
# pair, each item a pair of word positions "i-j".
! head result.align

0-0 1-1 2-1 3-3 4-2 5-1 6-4 7-5 8-6 9-6
0-1 1-1 2-0 3-2 4-3 5-4 6-3 7-3 8-5 9-8 10-6 13-9 14-9 15-12 16-11 18-13 19-14 24-15
0-2 1-0 1-1 2-2 4-3 5-4 5-5 7-7 8-6 9-7
0-2 1-0 1-1 3-3 4-7 6-4 7-5 7-6 8-6 9-6 10-9 11-8 12-10 13-10 14-11 15-11 16-12
0-2 1-0 1-1 2-3 3-3 3-4 4-3 5-5 6-5 7-6 8-5 10-7 11-8 12-9 13-10 14-11 15-12 16-12 17-13 19-14
0-2 1-1 2-0 2-3 3-4 4-4 6-5 7-6 9-6 10-6 11-7 12-7 13-8 14-9 15-11 16-10 18-12 19-13 20-13 21-14
0-2 1-0 1-1 3-3 4-4 5-3 5-5 6-4 7-6 8-7 9-8 10-8 11-7 12-8 13-9 14-9 15-8 15-10 16-11 18-12 19-12 20-13 22-16 23-15 24-14 24-16
0-2 1-0 1-1 3-3 4-3 4-4 5-5 6-6 7-7 8-8 9-9 10-9 11-10 12-10 13-12 14-11 14-12
0-2 1-1 2-0 2-3 4-5 5-7 6-6 7-7 8-3 9-4 10-8 11-9 12-10 13-11 14-12 15-13 17-15 18-15 19-14 19-15 21-18 22-17 23-16 23-18
0-2 1-0 1-1 3-3 4-4 5-3 6-5 7-5 8-6 12-7 13-8 14-8 14-10 15-9 16-12 17-11 18-13 19-14 20-15 21-15


In [22]:
# результат - соответствие позиций слов двух языков для каждого предложения

# Некоторые идеи дальнейшего развития

In [None]:
import sqlite3

In [None]:
# For word-by-word translation: create a dictionary database.
# Opens the SQLite file 'wordbase.db' and gets a cursor; both names are
# reused by the cells below.
conn = sqlite3.connect('wordbase.db')
cur = conn.cursor()

In [None]:
# Main dictionary table: one row per English/Russian word pair together with
# the part of speech on each side.
# IF NOT EXISTS keeps the cell re-runnable: without it a second execution
# against the same wordbase.db fails because the table already exists.
cur.execute("""
CREATE TABLE IF NOT EXISTS words (
    word_id INT,
    eng_word TEXT,
    rus_word TEXT,
    eng_pos TEXT,
    rus_pos TEXT,
    PRIMARY KEY (word_id)) """)
conn.commit()

In [None]:
# Extension principle: every new language gets its own table holding the word
# and its part of speech, keyed by the same word_id.
# word_id needs its own type and a trailing comma; without them SQLite reads
# "fr_word TEXT" as the type name of word_id and no fr_word column is created.
# IF NOT EXISTS keeps the cell re-runnable. A words_fr table already created
# without the fr_word column is left untouched by this statement and must be
# dropped first.
cur.execute("""
CREATE TABLE IF NOT EXISTS words_fr (
    word_id INT,
    fr_word TEXT,
    pos TEXT,
    PRIMARY KEY (word_id)) """)
conn.commit()

In [None]:
# Insert one sample row. Values follow the column order of the words table:
# word_id, eng_word, rus_word, eng_pos, rus_pos.
# word_id is the primary key, so inserting the same id a second time fails.
wds = ('1', 'cat', 'кошка', 'n', 'n',)
cur.execute("INSERT INTO words VALUES (?, ?, ?, ?, ?)", wds)
conn.commit()

In [None]:
# Instead of syntactic parsing, use an alignment tool (for example, the one
# from nltk) when training the model on corpus data.

# A hand-aligned sentence pair: word lists for both sides plus the alignment.
algnsent = AlignedSent(
    ['klein', 'ist', 'das', 'Haus'],
    ['the', 'house', 'is', 'small'],
    Alignment.fromstring('0-3 1-2 2-0 3-1'),
)

# Phrase extraction from a second, manually aligned sentence pair.
srctext = "michael assumes that he will stay in the house"
trgtext = "michael geht davon aus , dass er im haus bleibt"
# (source position, target position) pairs
alignment = [
    (0, 0),
    (1, 1), (1, 2), (1, 3),
    (2, 5),
    (3, 6),
    (4, 9), (5, 9),
    (6, 7), (7, 7),
    (8, 8),
]
phrases = phrase_extraction(srctext, trgtext, alignment)

# Print the extracted phrase pairs in sorted order, one per line.
for phrase in sorted(phrases):
    print(phrase)

((0, 1), (0, 1), 'michael', 'michael')
((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus ,')
((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
((1, 2), (1, 4), 'assumes', 'geht davon aus')
((1, 2), (1, 4), 'assumes', 'geht davon aus ,')
((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
((2, 3), (5, 6), 'that', ', dass')
((2, 3), (5, 6), 'that', 'dass')
((2, 4), (5, 7), 'that he', ', dass er')
((2, 4), (5, 7), 'that he', 'dass er')
((2, 9), (5, 10), 'that he will stay in the house', ', dass er im haus bleibt')
((2, 9), (5, 10