In [2]:
import json

# Load the source records (later cells read each item's 'text' field).
# The context manager closes the file even if JSON parsing fails, and the
# explicit UTF-8 encoding keeps the read independent of the platform default.
with open('rpd_data.json', encoding='utf-8') as f:
    rpd_data = json.load(f)

In [18]:
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np

# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct keyphrases of a text.

    The constructor takes a model identifier and loads both the
    token-classification model and its tokenizer from that identifier.
    """

    def __init__(self, model, *args, **kwargs):
        """Load model and tokenizer for ``model``; forward everything else to the base class."""
        classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(*args, model=classifier, tokenizer=tokenizer, **kwargs)

    def postprocess(self, model_outputs):
        """Aggregate token predictions into phrases and return them de-duplicated.

        The base class is asked to aggregate with the SIMPLE strategy; each
        resulting phrase is stripped of surrounding whitespace and passed
        through ``np.unique``, so the result is a sorted NumPy array of
        distinct strings.
        """
        entities = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        phrases = [entity.get("word").strip() for entity in entities]
        return np.unique(phrases)


In [19]:
# Instantiate the keyphrase extractor from the KBIR Inspec checkpoint.
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model_name)


# Argos Translate

## https://github.com/argosopentech/argos-translate

In [2]:
#!pip install argostranslate

import argostranslate.package
import argostranslate.translate

# Translation direction used by this section: Russian -> English.
from_code = "ru"
to_code = "en"

# Refresh the Argos package index and look up the package for this language pair.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
package_to_install = next(
    (
        pkg
        for pkg in available_packages
        if pkg.from_code == from_code and pkg.to_code == to_code
    ),
    None,
)
if package_to_install is None:
    # Without the default, a missing pair would surface as a bare StopIteration.
    raise LookupError(
        f"No Argos Translate package available for {from_code} -> {to_code}"
    )
# Download the package archive and install it from the downloaded path.
argostranslate.package.install_from_path(package_to_install.download())



In [3]:
# Start from an empty list of translated texts.
translated_texts = list()

In [4]:
#for i in rpd_data[:20]:
#    translated_texts.append(argostranslate.translate.translate(i['text'], from_code, to_code))


In [5]:
#import json

#with open('translate20.json', 'w') as outfile:
#    outfile.write(json.dumps(translated_texts, ensure_ascii=False, indent=4))

In [6]:
import json

# Reload the cached translations written by the (commented-out) cells above.
# The context manager guarantees the file is closed even if parsing fails;
# UTF-8 is stated explicitly so the read does not depend on the platform default.
with open('translate20.json', encoding='utf-8') as f:
    translated_texts = json.load(f)

In [7]:
# Spot-check one translated text (index 10).
translated_texts[10]

"The course is devoted to the automatic processing and analysis of video and images in intellectual video computer medical systems. The focus is on the digitalization techniques and algorithms of medical images and video data, video analysis techniques, machine training and data analysis to address detection, segmentation, classification of objects of interest. In-depth learning methods, as well as smart-technologies for visualization and video imagery are being studied (storing panorama images, visualizing images in narrow spectral ranges, synthesis of images, taking into account the personal characteristics of users, etc.). The course examines the structure and principles of the actual medical video computer systems, as well as the features of video-data processing. The purpose of the discipline is to examine the principles, methods of processing and analysis of medical images and to develop the skills and capabilities of the medical video analysis. A full range of advanced computer-

### kbir-inspec

In [10]:
# Start from an empty list of English keyphrase results.
keyphrases_en = list()

In [11]:
# Run the extractor on every translated text, one result per text.
# The list is rebuilt instead of appended to, so re-running this cell
# no longer duplicates entries in keyphrases_en.
keyphrases_en = [extractor(text) for text in translated_texts]
    

#keyphrases = extractor(translatedText)
#keyphrases

In [12]:
# Display the extracted English keyphrases for every text.
keyphrases_en

[array(['Automated Design Systems', 'Moodle monitoring', 'Ownership',
        'SARP', 'SCADA', 'automated design systems', 'complex',
        'electrical policy framework', 'software tools'], dtype='<U27'),
 array(['ARR', 'AutoCAD', 'OLE technology',
        'SAP Electrotechnical Opportunity', 'SAPR AutoCAD',
        'SAPRO AutoCAD', 'SARP', 'actuator',
        'automated geometric modelling', 'complex mechanized modules',
        'composite documents', 'differential subtraction', 'documentation',
        'electrical appliances', 'electrical engineering',
        'element management system', 'geometric modelling technology',
        'individual household tasks', 'mechantronized modules',
        'product life cycle', 'project documentation',
        'reference dimensions', 'robots', 'textual documentation'],
       dtype='<U32'),
 array(['OMTU course', 'Silvester criterion', 'asymptotic sustainability',
        'automatic management', 'dynamic systems', 'harthon methods',
        'line

In [13]:
# Start from an empty list of Russian keyphrase results.
keyphrases_ru = list()

In [14]:
import argostranslate.package
import argostranslate.translate

# Switch the translation direction to English -> Russian.
# NOTE: this rebinds from_code/to_code, so any later cell using them
# translates in this direction.
from_code = "en"
to_code = "ru"

# Refresh the Argos package index and look up the package for this language pair.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
package_to_install = next(
    (
        pkg
        for pkg in available_packages
        if pkg.from_code == from_code and pkg.to_code == to_code
    ),
    None,
)
if package_to_install is None:
    # Without the default, a missing pair would surface as a bare StopIteration.
    raise LookupError(
        f"No Argos Translate package available for {from_code} -> {to_code}"
    )
# Download the package archive and install it from the downloaded path.
argostranslate.package.install_from_path(package_to_install.download())



In [15]:
# Translate every extracted keyphrase with the current from_code/to_code pair,
# keeping one list of translations per source text.
# The list is rebuilt instead of appended to, so re-running this cell
# no longer duplicates entries in keyphrases_ru.
keyphrases_ru = [
    [argostranslate.translate.translate(phrase, from_code, to_code) for phrase in phrases]
    for phrases in keyphrases_en
]


In [16]:
# Display the keyphrases after translation by Argos.
keyphrases_ru

[['Автоматизированные системы проектирования',
  'Moodle monitoring',
  'Собственность',
  'SARP',
  'SCADA',
  'автоматизированные системы проектирования',
  'комплекс',
  'электрическая политика',
  'программные инструменты'],
 ['ARR',
  'AutoCAD',
  'Технология OLE',
  'SAP Электротехнические возможности',
  'SAPR AutoCAD',
  'SAPRO AutoCAD',
  'SARP',
  'actuator',
  'автоматизированное геометрическое моделирование',
  'сложные механизированные модули',
  'композитные документы',
  'дифференциальное вычитание',
  'документация',
  'электрические приборы',
  'электротехника',
  'система управления элементами',
  'технология геометрического моделирования',
  'индивидуальные домашние задания',
  'механизированные модули',
  'жизненный цикл продукта',
  'проектная документация',
  'габариты',
  'роботы',
  'текстовая документация'],
 ['Курс ОМТУ',
  'Критерий молчания',
  'асимптотическая устойчивость',
  'автоматическое управление',
  'динамические системы',
  'Методы гартона',
  'лин

In [17]:
import json

# Persist the translated keyphrases. ensure_ascii=False writes Cyrillic
# characters verbatim, so the file is opened as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open('keyph20ru.json', 'w', encoding='utf-8') as outfile:
    json.dump(keyphrases_ru, outfile, ensure_ascii=False, indent=4)

## Вывод: при переводе en-ru, кажется, потерялся смысл 
('uncertain'->'4.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1'?!?!)

# Marian Machine Translation

### https://skeptric.com/python-offline-translation/


In [1]:
from transformers import MarianMTModel, MarianTokenizer
from typing import Sequence


2023-01-24 16:23:14.032998: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-24 16:23:14.397024: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-24 16:23:14.504624: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-24 16:23:14.504656: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [3]:
import stanza
 
# First you will need to download the model
#stanza.download('ru')
# Tokenize-only Russian pipeline, used here to show how the first record's
# text is split into sentences.
nlp = stanza.Pipeline('ru', processors='tokenize')

first_record_text = rpd_data[0]['text']
for sent in nlp.process(first_record_text).sentences:
    print(sent.text)
    

2023-01-24 16:23:32 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

2023-01-24 16:23:32 INFO: Use device: cpu
2023-01-24 16:23:32 INFO: Loading: tokenize
2023-01-24 16:23:32 INFO: Done loading processors!


Дисциплина посвящена изучению вопросов, связанных с целостным представлением о системах автоматизированного проектирования: видах их обеспечения и развитии.
Последовательно излагаются вопросы: сущность процесса проектирования, методология системного подхода к проектированию, необходимость перехода к автоматизированному проектированию.
Этапы процесса проектирования представлены с точки зрения системного подхода к проектированию сложных систем.
Рассматриваются основные принципы построения и функции систем автоматизированного проектирования (САПР).
. Изучение основ САПР и их использования для проектирования средств и систем управления техническими и технологическими процессами.
Овладение: знаниями о методах и технологиях автоматизированного проектирования типовых систем автоматического управления техническими процессами; умениями формулировать цели и задачи проектирования и обоснованно выбирать программно-технические средства автоматизированного проектирования; навыками практического прим

In [4]:
from dataclasses import dataclass

@dataclass(frozen=True)
class SentenceBoundary:
    """A sentence together with the raw text that precedes it."""
    text: str    # the sentence itself
    prefix: str  # raw text placed in front of the sentence when rendering

    def __str__(self):
        """Return the prefix immediately followed by the sentence text."""
        return ''.join((self.prefix, self.text))

In [5]:
from __future__ import annotations # For Python 3.7
from typing import Dict, List

@dataclass(frozen=True)
class SentenceBoundaries:
    """Sentence-level view of a document that can be re-joined into the full text.

    Each entry pairs a sentence with the raw text found before it, so that
    ``str()`` of an instance built by ``from_doc`` reproduces ``doc.text``.
    """
    sentence_boundaries: List[SentenceBoundary]

    @classmethod
    def from_doc(cls, doc: stanza.Document) -> SentenceBoundaries:
        """Build the boundaries from a tokenized document.

        Reads ``doc.text`` and ``doc.sentences``; for every sentence the
        prefix is the slice of ``doc.text`` between the end of the previous
        sentence's last token and the start of this sentence's first token.
        """
        sentence_boundaries = []
        start_idx = 0
        for sent in doc.sentences:
            sentence_boundaries.append(SentenceBoundary(text=sent.text, prefix=doc.text[start_idx:sent.tokens[0].start_char]))
            start_idx = sent.tokens[-1].end_char
        # Trailing entry with empty text carries whatever follows the last sentence.
        sentence_boundaries.append(SentenceBoundary(text='', prefix=doc.text[start_idx:]))
        return cls(sentence_boundaries)

    @property
    def nonempty_sentences(self) -> List[str]:
        """Sentence texts in order, skipping entries whose text is empty."""
        return [item.text for item in self.sentence_boundaries if item.text]

    def map(self, d: Dict[str, str]) -> SentenceBoundaries:
        """Return a copy whose sentence texts are replaced through ``d``.

        Sentences missing from ``d`` are kept unchanged; prefixes are always
        preserved.
        """
        return SentenceBoundaries([SentenceBoundary(text=d.get(sb.text, sb.text),
                                                    prefix=sb.prefix) for sb in self.sentence_boundaries])

    def __str__(self) -> str:
        """Concatenate every entry (prefix + text) back into one string."""
        # `map` here is the builtin: method bodies do not see class-level names.
        return ''.join(map(str, self.sentence_boundaries))

In [8]:
def minibatch(seq, size):
    """Yield consecutive lists of up to ``size`` items taken from ``seq``.

    A batch is emitted as soon as it holds at least ``size`` items; a final,
    shorter batch is emitted for any leftover items. Nothing is yielded for
    an empty ``seq``.
    """
    batch = []
    for element in seq:
        batch.append(element)
        if len(batch) < size:
            continue
        yield batch
        batch = []
    if len(batch) > 0:
        yield batch

class Translator:
    """Sentence-by-sentence MarianMT translator that preserves inter-sentence text."""

    def __init__(self, source_lang: str, dest_lang: str, use_gpu: bool=False) -> None:
        """Load the ``Helsinki-NLP/opus-mt-{source_lang}-{dest_lang}`` model and tokenizer,
        plus a tokenize-only stanza pipeline for ``source_lang`` used to split sentences.
        """
        self.use_gpu = use_gpu
        self.model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{dest_lang}'
        loaded_model = MarianMTModel.from_pretrained(self.model_name)
        self.model = loaded_model.cuda() if use_gpu else loaded_model
        self.tokenizer = MarianTokenizer.from_pretrained(self.model_name)
        self.sentencizer = stanza.Pipeline(source_lang, processors='tokenize', verbose=False, use_gpu=use_gpu)

    def sentencize(self, texts: Sequence[str]) -> List[SentenceBoundaries]:
        """Split every text into sentences, one ``SentenceBoundaries`` per text."""
        boundaries = []
        for text in texts:
            boundaries.append(SentenceBoundaries.from_doc(self.sentencizer.process(text)))
        return boundaries

    def translate(self, texts: Sequence[str], batch_size:int=10, truncation=True) -> Sequence[str]:
        """Translate each text and return the translations in the same order.

        Raises ``ValueError`` when a single string is passed instead of a
        sequence. Distinct sentences are translated once, in batches of
        ``batch_size`` ordered longest first, then substituted back into
        each text with the original inter-sentence text kept.
        """
        if isinstance(texts, str):
            raise ValueError('Expected a sequence of texts')
        documents = self.sentencize(texts)
        # Dict keys de-duplicate sentences that occur in more than one text.
        translations = dict.fromkeys(
            sentence for document in documents for sentence in document.nonempty_sentences
        )

        longest_first = sorted(translations, key=len, reverse=True)
        for batch in minibatch(longest_first, batch_size):
            encoded = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=truncation)
            if self.use_gpu:
                encoded = {name: tensor.cuda() for name, tensor in encoded.items()}
            generated = self.model.generate(**encoded)
            decoded = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]
            translations.update(zip(batch, decoded))

        return [str(document.map(translations)) for document in documents]

In [36]:
# Build the Russian -> English Marian translator.
marian_ru_en = Translator('ru', 'en')

In [14]:
# Translate the 'text' field of the first ten records with the ru -> en translator.
first_ten_texts = [record['text'] for record in rpd_data[:10]]
translated_texts = marian_ru_en.translate(first_ten_texts)


In [35]:
import json

# Persist the Marian translations. ensure_ascii=False writes non-ASCII
# characters verbatim, so the file is opened as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open('marian_translate10.json', 'w', encoding='utf-8') as outfile:
    json.dump(translated_texts, outfile, ensure_ascii=False, indent=4)

In [21]:
# Extract keyphrases from every translated text; the bare name at the end displays them.
keyphrases_en = []
keyphrases_en.extend(extractor(text) for text in translated_texts)

keyphrases_en

[array(['Automated Design Systems', 'Moodle Reference', 'Ownership',
        'SCADA', 'automated design systems', 'computer execution',
        'industrial automation', 'printed circuit boards', 'software'],
       dtype='<U24'),
 array(['AutoCAD', 'AutoCAD Electric system',
        'automated geometric modelling', 'complex mechanized modules',
        'composite objects', 'electrical engineering',
        'geometric modelling technology', 'household tasks',
        'mechanical modules', 'modules', 'product life cycle', 'project',
        'robots', 'text documentation'], dtype='<U30'),
 array(['Harriton techniques', 'asymptotic stability', 'automatic control',
        'dynamic object management laws', 'dynamic systems', 'governance',
        'linear systems', 'methods', 'multiple variables', 'platitude',
        'roughness', 'speed gradient algorithm', 'stability',
        'uncertain systems', 'ustain', 'variable state'], dtype='<U30'),
 array(['MATLAB applications packages', 'best man

In [32]:
keyphrases_ru = []
#stanza.download('en')
# English -> Russian translator for the extracted keyphrases.
marian_en_ru = Translator('en', 'ru')

# One translated list per source text; list() converts each extractor result first.
keyphrases_ru.extend(marian_en_ru.translate(list(phrases)) for phrases in keyphrases_en)


In [33]:
# Display the keyphrases after translation by Marian.
keyphrases_ru

[['Автоматизированные проектные системы',
  'Ссылка на мудель',
  'Собственность',
  'СПАДА',
  'автоматизированные системы проектирования',
  'компьютерное исполнение',
  'Промышленная автоматизация',
  'печатные платки',
  'программное обеспечение'],
 ['АвтоCAD',
  'АвтоCAD Электрическая система',
  'автоматизированное геометрическое моделирование',
  'Комплексные механизированные модули',
  'составные объекты',
  'электротехника',
  'Технология геометрического моделирования',
  'Домашние работы',
  'механические модули',
  'модули',
  'жизненного цикла продукта',
  'проект',
  'роботы',
  'текстовая документация'],
 ['Методы Харритона',
  'асимптотическая стабильность',
  'автоматический контроль',
  'Законы об управлении динамичными объектами',
  'динамических систем',
  'управление',
  'линейные системы',
  'методы',
  'множество переменных',
  'банальность',
  'грубость',
  'алгоритм градиента скорости',
  'стабильность',
  'неопределенные системы',
  'УСТРОЙСТВА',
  'переменное 

In [34]:
import json

# Persist the Marian-translated keyphrases. ensure_ascii=False writes Cyrillic
# characters verbatim, so the file is opened as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open('marian10ru.json', 'w', encoding='utf-8') as outfile:
    json.dump(keyphrases_ru, outfile, ensure_ascii=False, indent=4)

## Вывод: местами переводит то, что не нужно. 

СПАДА
АвтоCAD


#### Модель выделяет некоторые ненужные вещи
'банальность',
  'грубость',

# KeyPhraseTransformer

https://github.com/Shivanandroy/KeyPhraseTransformer

In [38]:
from keyphrasetransformer import KeyPhraseTransformer

# Instantiate KeyPhraseTransformer with no arguments.
kp = KeyPhraseTransformer()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting keyphrasetransformer
  Downloading keyphrasetransformer-0.0.2.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: keyphrasetransformer
  Building wheel for keyphrasetransformer (setup.py) ... [?25ldone
[?25h  Created wheel for keyphrasetransformer: filename=keyphrasetransformer-0.0.2-py3-none-any.whl size=4179 sha256=af4ee08f85394d1ae9274869d942f2c6fb850f7d9b2aee3b85e8cc7b9596e22e
  Stored in directory: /home/mary/.cache/pip/wheels/60/d7/a2/784cf59963f1753418512c27e88c6c00bb0a692a8d1dcd56a6
Successfully built keyphrasetransformer
Installing collected packages: keyphrasetransformer
Successfully installed keyphrasetransformer-0.0.2


[nltk_data] Downloading package punkt to /home/mary/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /home/mary/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

In [39]:
# Key phrases for the first record's original (untranslated) text.
kp.get_key_phrases(rpd_data[0]['text'])

['маркера', 'autocad electrical']

In [41]:
# Key phrases for the first entry of translated_texts.
kp.get_key_phrases(translated_texts[0])

['design systems',
 'support',
 'development',
 'automated design',
 'automated design systems',
 'apr',
 'system design',
 'design methods',
 'automated process management systems',
 'software tools',
 'design',
 'automation',
 'systems',
 'skills',
 'knowledge',
 'aprs',
 'design process',
 'scada systems',
 'software engineering',
 'industrial automation systems',
 'control devices',
 'control systems',
 'moodle',
 'reference work',
 'printed circuit boards',
 'design of printed boards',
 'markup language',
 'design object',
 'op',
 'automatic design system',
 'project objectives',
 'project procedures',
 'sapr',
 'spr',
 'cad/cam/cae',
 'technical support',
 'management tools and systems',
 'data-processing software',
 'project decisions archive',
 'technical equipment packages',
 'mathematical models',
 'hierarchical hierarchy',
 'design objects',
 'functional models',
 'project decision-making',
 'synthetic algorithms',
 'machine graphics',
 'software packages',
 'collective use'

In [42]:
import json

# Load the translations cached in translate20.json under a separate name.
# The context manager guarantees the file is closed even if parsing fails;
# UTF-8 is stated explicitly so the read does not depend on the platform default.
with open('translate20.json', encoding='utf-8') as f:
    translated_texts_1 = json.load(f)

In [43]:
# Key phrases for the first text loaded from translate20.json.
kp.get_key_phrases(translated_texts_1[0])

['automated design systems',
 'support and development',
 'holistic understanding',
 'technology management tools',
 'software engineering',
 'automated design',
 'project management',
 'sarp',
 'knowledge acquisition',
 'skills',
 'design',
 'software',
 'systems',
 'automation',
 'technology management',
 'control design',
 'industrial automation systems',
 'system design',
 'control systems',
 'electrical policy framework',
 'moodle',
 'monitoring',
 'design of printers',
 'marker',
 'sapr',
 'electrical equipment',
 'project challenges',
 'project procedures',
 'autocad',
 'cad',
 'cae',
 'sap subsystems',
 'technical support',
 'design (engineering)',
 'mathematical security',
 'data-processing software',
 'mathematical modeling',
 'hierarchical models',
 'management tools and systems',
 'design facilities',
 'project synthesis algorithms',
 'software systems',
 'general-purpose packages',
 'sapr application packages',
 'collective use dialogue systems',
 'computer graphics',
 'in

## Итог: Слишком много

# Keyphrase Extraction Model: distilbert-inspec

https://huggingface.co/ml6team/keyphrase-extraction-distilbert-inspec


In [44]:
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np

# Define keyphrase extraction pipeline
# NOTE: this rebinds the name KeyphraseExtractionPipeline; it differs from the
# earlier definition in this notebook only by using the FIRST aggregation strategy.
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct keyphrases of a text.

    The constructor takes a model identifier and loads both the
    token-classification model and its tokenizer from that identifier.
    """

    def __init__(self, model, *args, **kwargs):
        """Load model and tokenizer for ``model``; forward everything else to the base class."""
        classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(*args, model=classifier, tokenizer=tokenizer, **kwargs)

    def postprocess(self, model_outputs):
        """Aggregate token predictions into phrases and return them de-duplicated.

        The base class is asked to aggregate with the FIRST strategy; each
        resulting phrase is stripped of surrounding whitespace and passed
        through ``np.unique``, so the result is a sorted NumPy array of
        distinct strings.
        """
        entities = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.FIRST,
        )
        phrases = [entity.get("word").strip() for entity in entities]
        return np.unique(phrases)


In [45]:
# Instantiate the extractor from the DistilBERT Inspec checkpoint.
# This rebinds `extractor`, replacing the KBIR-based one created earlier.
model_name = "ml6team/keyphrase-extraction-distilbert-inspec"
extractor = KeyphraseExtractionPipeline(model_name)


Downloading:   0%|          | 0.00/697 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/266M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [46]:
# Extract keyphrases from every translated text with the current extractor;
# the bare name at the end displays them.
keyphrases_en1 = []
keyphrases_en1.extend(extractor(text) for text in translated_texts)

keyphrases_en1

[array(['automated design systems', 'computer', 'design goals',
        'holistic vision', 'management', 'mathematical techniques',
        'printed circuit boards', 'printed circuit drawing', 'software',
        'software tools'], dtype='<U24'),
 array(['autocad electric', 'autocad sapr', 'automated design',
        'automated design systems', 'automated geometric modelling',
        'blocks', 'complex mechanized modules', 'composite objects',
        'connections', 'constructions', 'cools', 'couples )',
        'dedicated saps', 'design documentation', 'diagrams', 'editing',
        'electric control system', 'electrical assembly panels',
        'electrical design tools', 'electrical devices',
        'electrical engineering project', 'electrical spr functionality',
        'geometric objects', 'household tasks', 'lines',
        'mechanized modules', 'mechatron modules', 'primitives',
        'principle', 'product life cycle', 'project documents',
        'reference dimensions', 'r

In [51]:
# Print each text's index followed by its extracted keyphrases.
for idx, phrases in enumerate(keyphrases_en1):
    print(idx, "\n", phrases, '\n')


0 
 ['automated design systems' 'computer' 'design goals' 'holistic vision'
 'management' 'mathematical techniques' 'printed circuit boards'
 'printed circuit drawing' 'software' 'software tools'] 

1 
 ['autocad electric' 'autocad sapr' 'automated design'
 'automated design systems' 'automated geometric modelling' 'blocks'
 'complex mechanized modules' 'composite objects' 'connections'
 'constructions' 'cools' 'couples )' 'dedicated saps'
 'design documentation' 'diagrams' 'editing' 'electric control system'
 'electrical assembly panels' 'electrical design tools'
 'electrical devices' 'electrical engineering project'
 'electrical spr functionality' 'geometric objects' 'household tasks'
 'lines' 'mechanized modules' 'mechatron modules' 'primitives' 'principle'
 'product life cycle' 'project documents' 'reference dimensions' 'robots'
 'sapr' 'saprs' 'schematic part' 'sloppy architecture'
 'systemic properties' 'systems' 'text documentation' 'volume models'] 

2 
 ['asymptotic' 'automati

In [50]:
# Print each text's index followed by its extracted keyphrases.
for idx, phrases in enumerate(keyphrases_en):
    print(idx, "\n", phrases, '\n')


0 
 ['Automated Design Systems' 'Moodle Reference' 'Ownership' 'SCADA'
 'automated design systems' 'computer execution' 'industrial automation'
 'printed circuit boards' 'software'] 

1 
 ['AutoCAD' 'AutoCAD Electric system' 'automated geometric modelling'
 'complex mechanized modules' 'composite objects' 'electrical engineering'
 'geometric modelling technology' 'household tasks' 'mechanical modules'
 'modules' 'product life cycle' 'project' 'robots' 'text documentation'] 

2 
 ['Harriton techniques' 'asymptotic stability' 'automatic control'
 'dynamic object management laws' 'dynamic systems' 'governance'
 'linear systems' 'methods' 'multiple variables' 'platitude' 'roughness'
 'speed gradient algorithm' 'stability' 'uncertain systems' 'ustain'
 'variable state'] 

3 
 ['MATLAB applications packages' 'best management theory'
 'consistent symlex method' 'methodological tools'
 'non-linear automatic systems' 'optimization' 'skills'] 

4 
 ['Training' 'executive system' 'feedback' 'forc

## Все еще много лишнего