Skip to content

Commit

Permalink
Merge pull request #14 from luismond/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
luismond authored Jan 25, 2022
2 parents 0eda05e + db9e002 commit 0ad5dc2
Show file tree
Hide file tree
Showing 17 changed files with 995 additions and 650 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# TM2TB

**tm2tb** is a term / keyword / n-gram extraction module with a focus on bilingual data. It leverages spaCy's part-of-speech tags and LaBSE's sentence embeddings to extract and align terms from pairs of sentences and bilingual documents such as translation files.
**tm2tb** is a term / keyword / n-gram extraction module with a focus on bilingual data. It leverages spaCy's part-of-speech tags and multilingual sentence transformer models to extract and align terms from pairs of sentences and bilingual documents such as translation files.

## Approach

Expand Down
157 changes: 157 additions & 0 deletions tests/panda_text_english.txt

Large diffs are not rendered by default.

185 changes: 185 additions & 0 deletions tests/panda_text_spanish.txt

Large diffs are not rendered by default.

54 changes: 35 additions & 19 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
tm2tb test examples
"""
from tm2tb import Tm2Tb
from pprint import pprint
term_model = Tm2Tb()

tt = Tm2Tb()

# Extracting terms from a sentence in English
src_sentence = """
#%% Extract terms from a sentence in English
en_sentence = """
The giant panda, also known as the panda bear (or simply the panda),
is a bear native to South Central China. It is characterised
by its bold black-and-white coat and rotund body. The name "giant panda"
Expand All @@ -18,10 +16,11 @@
or even meat in the form of birds, rodents, or carrion.
In captivity, they may receive honey, eggs, fish, shrub leaves, oranges, or bananas.
"""
pprint(tt.get_ngrams(src_sentence))
en_sentence_terms = term_model.get_terms_from_sentence(en_sentence)
print(en_sentence_terms[:10])

# Extracting terms from a sentence in Spanish
trg_sentence = """
# Extract terms from a sentence in Spanish
es_sentence = """
El panda gigante, también conocido como oso panda (o simplemente panda),
es un oso originario del centro-sur de China. Se caracteriza por su llamativo
pelaje blanco y negro, y su cuerpo rotundo. El nombre de "panda gigante"
Expand All @@ -33,17 +32,34 @@
En cautividad, pueden alimentarse de miel, huevos, pescado, hojas de arbustos,
naranjas o plátanos.
"""
pprint(tt.get_ngrams(trg_sentence))

# Extracting and matching terms from both sentences
pprint(tt.get_ngrams((src_sentence, trg_sentence)))
es_sentence_terms = term_model.get_terms_from_sentence(es_sentence)
print(es_sentence_terms[:10])


# Extract and align terms from both sentences
bilingual_terms = term_model.get_terms_from_bisentence((en_sentence, es_sentence))
bilingual_terms = bilingual_terms.drop(columns=['src_ngram_rank',
'src_ngram_tags',
'trg_ngram_rank',
'trg_ngram_tags',
'bi_ngram_rank'])
print(bilingual_terms[:10])


# Extracting terms from a bilingual document
file_path = 'tests/panda_bear_english_spanish.csv'
bitext = tt.read_bitext(file_path)
pprint(tt.get_ngrams(bitext))

# Using arguments
pprint(tt.get_ngrams(src_sentence, diversity=.1))
pprint(tt.get_ngrams((src_sentence, trg_sentence), include_pos=['ADJ']))
pprint(tt.get_ngrams(bitext, ngrams_min=2, ngrams_max=4))
bitext_path = 'tests/panda_bear_english_spanish.csv'
bitext_terms = term_model.get_terms_from_bitext(bitext_path)
bitext_terms = bitext_terms.drop(columns=['src_ngram_tags',
'trg_ngram_tags',])
print(bitext_terms[:10])

# Extract terms from an English text file
en_text_path = 'tests/panda_text_english.txt'
en_text_terms = term_model.get_terms_from_text(en_text_path)
print(en_text_terms[:10])

# Extract terms from a Spanish text file
es_text_path = 'tests/panda_text_spanish.txt'
es_text_terms = term_model.get_terms_from_text(es_text_path)
print(es_text_terms[:10])
13 changes: 11 additions & 2 deletions tm2tb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
"""
TM2TB initialization
"""
__author__ = "Luis Mondragon (luismond@gmail.com)"
__version__ = '1.0.3'
__version__ = '1.4.0'

from tm2tb.transformer_model import TransformerModel
trf_model = TransformerModel().load()
from tm2tb.bitext_reader import BitextReader
from tm2tb.tm2tb import Tm2Tb, Sentence
from tm2tb.sentence import Sentence
from tm2tb.bisentence import BiSentence
from tm2tb.text import Text
from tm2tb.bitext import BiText
from tm2tb.tm2tb import Tm2Tb
87 changes: 87 additions & 0 deletions tm2tb/align_ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
align ngrams
"""
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
pd.options.mode.chained_assignment = None

def get_seq_similarities(src_embs, trg_embs):
    """
    Compute the pairwise cosine similarities between two sets of embeddings.

    Parameters
    ----------
    src_embs : sequence of source embedding vectors.
    trg_embs : sequence of target embedding vectors.

    Returns
    -------
    The matrix produced by scikit-learn's ``cosine_similarity``: one row per
    source embedding and one column per target embedding.
    """
    return cosine_similarity(src_embs, trg_embs)

def get_aligned_ngrams(src_ngrams_df, trg_ngrams_df, **kwargs):
    """
    Align n-grams in both directions using embedding similarity.

    Every source n-gram is paired with its most similar target n-gram, and
    every target n-gram is paired with its most similar source n-gram.

    Parameters
    ----------
    src_ngrams_df, trg_ngrams_df : pandas.DataFrame
        Must provide the columns 'joined_ngrams', 'tags', 'rank' and
        'embedding'.
    **kwargs
        Accepted so callers can forward their options unchanged; not used
        by this function.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(src_aligned_ngrams, trg_aligned_ngrams)``. Both frames have seven
        positional columns: source n-gram, source tags, source rank,
        target n-gram, target tags, target rank, similarity. The first has
        one row per source n-gram, the second one row per target n-gram.
    """
    src_ngrams = src_ngrams_df['joined_ngrams'].tolist()
    src_tags = src_ngrams_df['tags'].tolist()
    src_ranks = src_ngrams_df['rank'].tolist()
    src_embeddings = src_ngrams_df['embedding'].tolist()

    trg_ngrams = trg_ngrams_df['joined_ngrams'].tolist()
    trg_tags = trg_ngrams_df['tags'].tolist()
    trg_ranks = trg_ngrams_df['rank'].tolist()
    trg_embeddings = trg_ngrams_df['embedding'].tolist()

    # Rows correspond to source n-grams, columns to target n-grams.
    seq_similarities = get_seq_similarities(src_embeddings, trg_embeddings)

    # Reducing over the columns (axis=1) yields, for each SOURCE n-gram,
    # the index and similarity of its most similar TARGET n-gram.
    best_trg_idx = np.argmax(seq_similarities, axis=1)
    best_trg_sim = np.max(seq_similarities, axis=1)

    # Reducing over the rows (axis=0) yields, for each TARGET n-gram,
    # the index and similarity of its most similar SOURCE n-gram.
    best_src_idx = np.argmax(seq_similarities, axis=0)
    best_src_sim = np.max(seq_similarities, axis=0)

    src_aligned_ngrams = pd.DataFrame([
        (src_ngrams[i], src_tags[i], src_ranks[i],
         trg_ngrams[j], trg_tags[j], trg_ranks[j],
         float(sim))
        for i, (j, sim) in enumerate(zip(best_trg_idx, best_trg_sim))])

    trg_aligned_ngrams = pd.DataFrame([
        (src_ngrams[i], src_tags[i], src_ranks[i],
         trg_ngrams[j], trg_tags[j], trg_ranks[j],
         float(sim))
        for j, (i, sim) in enumerate(zip(best_src_idx, best_src_sim))])

    return src_aligned_ngrams, trg_aligned_ngrams

def _keep_best_per_group(bi_ngrams, key):
    """Keep, for each distinct value of column `key`, the row with the
    highest 'bi_ngram_similarity' (rows come back ordered by `key`)."""
    best_labels = bi_ngrams.groupby(key)['bi_ngram_similarity'].idxmax()
    return bi_ngrams.loc[best_labels.tolist()]


def get_top_ngrams(src_ngrams_df,
                   trg_ngrams_df,
                   min_similarity=.8,
                   **kwargs):
    """
    Build a ranked table of aligned source/target n-gram pairs.

    Parameters
    ----------
    src_ngrams_df, trg_ngrams_df : pandas.DataFrame
        N-gram tables, forwarded to ``get_aligned_ngrams``.
    min_similarity : float, default .8
        Pairs whose similarity is below this value are discarded.
    **kwargs
        Forwarded to ``get_aligned_ngrams``.

    Returns
    -------
    pandas.DataFrame
        Columns 'src_ngram', 'src_ngram_tags', 'src_ngram_rank',
        'trg_ngram', 'trg_ngram_tags', 'trg_ngram_rank',
        'bi_ngram_similarity' and 'bi_ngram_rank'; sorted by
        'bi_ngram_rank' in descending order, numeric values rounded to
        4 decimals. Each source n-gram and each target n-gram appears in
        at most one row.

    Raises
    ------
    ValueError
        If no pair reaches `min_similarity`, or if none is left once
        one-character n-grams have been removed.
    """
    # Concatenate source & target ngram alignments under a fresh unique index
    src_aligned_ngrams, trg_aligned_ngrams = get_aligned_ngrams(src_ngrams_df,
                                                                trg_ngrams_df,
                                                                **kwargs)
    bi_ngrams = pd.concat([src_aligned_ngrams, trg_aligned_ngrams],
                          ignore_index=True)
    bi_ngrams.columns = ['src_ngram',
                         'src_ngram_tags',
                         'src_ngram_rank',
                         'trg_ngram',
                         'trg_ngram_tags',
                         'trg_ngram_rank',
                         'bi_ngram_similarity']

    # Keep n-grams above min_similarity
    bi_ngrams = bi_ngrams[bi_ngrams['bi_ngram_similarity'] >= min_similarity]
    if bi_ngrams.empty:
        raise ValueError('No ngram pairs above minimum similarity!')

    # For one-word terms, keep those longer than 1 character
    long_enough = ((bi_ngrams['src_ngram'].str.len() > 1)
                   & (bi_ngrams['trg_ngram'].str.len() > 1))
    bi_ngrams = bi_ngrams[long_enough]
    if bi_ngrams.empty:
        # Without this guard the grouping below used to fail with an
        # unrelated KeyError on an empty, column-less frame.
        raise ValueError('No ngram pairs longer than one character!')

    # Group by source, get the most similar target n-gram
    bi_ngrams = _keep_best_per_group(bi_ngrams, 'src_ngram')
    # Group by target, get the most similar source n-gram
    bi_ngrams = _keep_best_per_group(bi_ngrams, 'trg_ngram')

    # Get bi n-gram rank
    bi_ngrams['bi_ngram_rank'] = (bi_ngrams['bi_ngram_similarity']
                                  * bi_ngrams['src_ngram_rank']
                                  * bi_ngrams['trg_ngram_rank'])
    bi_ngrams = bi_ngrams.sort_values(by='bi_ngram_rank', ascending=False)
    return bi_ngrams.round(4)
25 changes: 25 additions & 0 deletions tm2tb/bisentence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
BiSentence class.
"""
from tm2tb import Sentence
from tm2tb.align_ngrams import get_top_ngrams

class BiSentence:
    """
    A source/target sentence pair from which aligned n-grams are extracted.

    Parameters
    ----------
    sentence_tuple : sequence
        Element 0 is the source sentence and element 1 the target sentence;
        each is wrapped in a ``Sentence``.
    """

    def __init__(self, sentence_tuple):
        self.src_sentence = Sentence(sentence_tuple[0])
        self.trg_sentence = Sentence(sentence_tuple[1])

    def get_ngrams_dfs(self, **kwargs):
        """
        Get the top n-grams of the source and of the target sentence.

        ``return_embs=True`` is always requested from
        ``Sentence.get_top_ngrams``; all other keyword arguments are
        forwarded to it unchanged.

        Returns
        -------
        tuple
            ``(src_ngrams_df, trg_ngrams_df)``.
        """
        src_ngrams_df = self.src_sentence.get_top_ngrams(return_embs=True,
                                                         **kwargs)
        trg_ngrams_df = self.trg_sentence.get_top_ngrams(return_embs=True,
                                                         **kwargs)
        return src_ngrams_df, trg_ngrams_df

    def get_top_ngrams(self, **kwargs):
        """
        Extract n-grams from both sentences and align them.

        Parameters
        ----------
        min_similarity : float, optional
            Minimum similarity for an aligned pair to be kept (default .8).
        **kwargs
            Remaining options, forwarded to the n-gram extraction and to
            the alignment function.

        Returns
        -------
        The result of ``tm2tb.align_ngrams.get_top_ngrams``.
        """
        # Take the threshold out of kwargs first: it used to be hard-coded
        # next to **kwargs, so passing it raised "multiple values for
        # keyword argument 'min_similarity'".
        min_similarity = kwargs.pop('min_similarity', .8)
        src_ngrams_df, trg_ngrams_df = self.get_ngrams_dfs(**kwargs)
        top_ngrams = get_top_ngrams(src_ngrams_df,
                                    trg_ngrams_df,
                                    min_similarity=min_similarity,
                                    **kwargs)
        return top_ngrams
25 changes: 25 additions & 0 deletions tm2tb/bitext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
BiText class
"""
from tm2tb import Text
from tm2tb.align_ngrams import get_top_ngrams

class BiText:
    """
    A bilingual text from which aligned n-grams are extracted.

    Parameters
    ----------
    bitext
        Object whose 'src' and 'trg' entries expose ``.tolist()``
        (presumably a pandas DataFrame with one sentence pair per row;
        confirm against the caller). Each list is wrapped in a ``Text``.
    """

    def __init__(self, bitext):
        self.src_text = Text(bitext['src'].tolist())
        self.trg_text = Text(bitext['trg'].tolist())

    def get_ngrams_dfs(self, **kwargs):
        """
        Get the top n-grams of the source and of the target text.

        ``return_embs=True`` is always requested from
        ``Text.get_top_ngrams``; all other keyword arguments are forwarded
        to it unchanged.

        Returns
        -------
        tuple
            ``(src_ngrams_df, trg_ngrams_df)``.
        """
        src_ngrams_df = self.src_text.get_top_ngrams(return_embs=True,
                                                     **kwargs)
        trg_ngrams_df = self.trg_text.get_top_ngrams(return_embs=True,
                                                     **kwargs)
        return src_ngrams_df, trg_ngrams_df

    def get_top_ngrams(self, **kwargs):
        """
        Extract n-grams from both texts and align them.

        Parameters
        ----------
        min_similarity : float, optional
            Minimum similarity for an aligned pair to be kept (default .8).
        **kwargs
            Remaining options, forwarded to the n-gram extraction and to
            the alignment function.

        Returns
        -------
        The result of ``tm2tb.align_ngrams.get_top_ngrams``.
        """
        # Take the threshold out of kwargs first: it used to be hard-coded
        # next to **kwargs, so passing it raised "multiple values for
        # keyword argument 'min_similarity'".
        min_similarity = kwargs.pop('min_similarity', .8)
        src_ngrams_df, trg_ngrams_df = self.get_ngrams_dfs(**kwargs)
        top_ngrams = get_top_ngrams(src_ngrams_df,
                                    trg_ngrams_df,
                                    min_similarity=min_similarity,
                                    **kwargs)
        return top_ngrams
Loading

0 comments on commit 0ad5dc2

Please sign in to comment.