<h2>TODO</h2>

In [1]:
import pandas as pd

In [262]:
import re
import random
# itertools — Functions creating iterators for efficient looping
# The module standardizes a core set of fast, memory efficient tools that are useful by themselves or in combination. 
from itertools import product,combinations

class Directionality_Augmentation():
    """[Directionality augmentation of a phrase for the following pattern 
    [agent -> verb -> receiver] and [agent -> pronoun -> verb]]
    """    
    
    # The name of this pipeline element.
    name = 'directionality'
    
    _agents_dict = {
        '1S': ['EU'],
        '2S': ['TU'], 
        '3S': ['ELE', 'ELA'],
        '1P': ['NÓS'],
        '2P': ['VÓS'],
        '3P': ['ELES', 'ELAS']
    }
    _pronouns_dict = {
        '1S': ['ME'],
        '2S': ['TE','LHE'],
        '1P': ['NOS'],
        '2P': ['VOS'],
        '3P': ['LHES']
    }
    _valid_chars ='A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇa-záéíóúàâêôãõüç_'
    _high_valid_chars = '['+_valid_chars+']+'
    
    #GI pattern filter 
    _gi_verb_pattern = '[1-3][SP]_('+_high_valid_chars+')_[1-3][SP]'
    _gi_pattern = '[1-3][SP]_'+_high_valid_chars+'_[1-3][SP]'

    # pronome reto
    _agent_pattern = r'\b(EU|TU|ELE|ELA|NÓS|VÓS|ELES|ELAS)\b'

    # verbs
    _verb_pattern = r'\b({0}R|DO_QUE)\b'.format(_high_valid_chars)
    
    # pronome obliquo atono
    _pronoun_pattern = r'\b(ME|TE|LHE|VOS|NOS|LHES)\b'

    # both patterns -> [pronome reto, verbo, pronome reto] 
    # e [pronome reto, pronome obliquo atono, verbo]
    _pattern_agent_verb = "%s %s %s" %(_agent_pattern, _verb_pattern, _agent_pattern)
    _pattern_pronoun_verb = "%s %s %s" %(_agent_pattern, _pronoun_pattern, _verb_pattern)
    
    def __init__(self, *args, **kwargs):
		#super.__init__(*args, **kwargs)
        pass
     
    #! Métodos necessários para o step de augmentation
    def find_pattern(self,phrase_gr_gi):
        """[Find pattern in phrase, return 2 lists with each pattern]
        
        Arguments:
            phrase_gr {string} -- [gr type phrase]
        
        Returns:
            [type] -- [2 lists with patterns found or 2 empty lists if 
                       no pattern is encountered]
        """ 
        gr = phrase_gr_gi[0]
        gi = phrase_gr_gi[1]
        search_pattern_agent_verb, search_pattern_gi = None,None
        try:     
            # regex result for [pronome reto, verbo, pronome reto]
            # regex result for [pronome reto, pronome obliquo atono, verbo]
            search_pattern_agent_verb = re.findall(self._pattern_agent_verb+'|'+self._pattern_pronoun_verb, gr)
            search_pattern_gi = re.findall(self._gi_pattern,gi)
                
        except Exception as e:
            print('find_pattern:',e)
            print(phrase_gr_gi)
        finally:
            return search_pattern_agent_verb, search_pattern_gi
            
    
    def new_patterns(self, pattern_gr, verb_gi, **kwargs):
        '''creates new phrases from the following patterns 
       [pronome reto, verbo, pronome reto] ou [pronome reto, pronome obliquo atono, verbo]'''
        patterns_gr = []
        patterns_gi = []
        
        # pattern_gr[-1]  == '' -> [pronome reto, verbo, pronome reto] 
        # or pattern_gr[0] == '' -> [pronome reto, pronome obliquo atono, verbo]
               
        # [pronome reto, verbo, pronome reto] -> 1
        if pattern_gr[-1] == '':
            
            verb = pattern_gr[1]
            
            # Combinations involving all possible dictionary keys of agensts_dict
            combinations = product(self._agents_dict.keys(), self._agents_dict.keys(), repeat=1)


            for combination in combinations:
                # creating phrases of type: [EU, TU ...]
                # The other loops are necessary in these following cases: [ELE, ELA, ELES ELAS]
                for index_1, directional_1  in enumerate(self._agents_dict[combination[0]]):
                    
                    for index_2, directional_2 in enumerate(self._agents_dict[combination[1]]):
                        
                        gr = f'{directional_1} {verb} {directional_2}'
                        gi = f'{combination[0]}_{verb_gi}_{combination[1]}'
                        patterns_gr.append(gr)
                        patterns_gi.append(gi)
        
        # [pronome reto, pronome obliquo atono, verbo] -> 2
        elif pattern_gr[0] == '':
            
            verb = pattern_gr[-1]
            # Combinations involving all possible dictionary keys of pronouns_dict
            combinations = product(self._agents_dict.keys(), self._pronouns_dict.keys(), repeat=1)
            
            for combination in combinations:
                # creating phrases of type: [EU, TU ...]
                # The other loops are necessary in these following cases: [ELE, ELA, ELES ELAS]
                for index_1, directional_1  in enumerate(self._agents_dict[combination[0]]):
                    for index_2, directional_2 in enumerate(self._pronouns_dict[combination[1]]):

                        gr = f'{directional_1} {directional_2} {verb}'
                        gi = f'{combination[0]}_{verb_gi}_{combination[1]}'
                        patterns_gr.append(gr)
                        patterns_gi.append(gi)
        
        return patterns_gr,patterns_gi
    
    
    def for_string(*args,**kwargs):
        '''Transforms a pattern ([pronome reto -> verbo -> pronome reto] 
            or [pronome reto -> pronome obliquo -> verbo])
            in a string to search for what comes before and after '''
        
        if kwargs.get('search_pattern'):
            search_pattern = kwargs.get('search_pattern')
            
        list_of_str = []
        
        # Transforms the pattern in a string to check
        # for what comes before and after it in the phrase
        
        for tuple_pattern  in search_pattern:
            pattern_str = ''

            for index, str_pattern in enumerate(tuple_pattern):

                if str_pattern != '':
                    pattern_str += str_pattern + ' '

            list_of_str.append(pattern_str[:-1])
                                
        return list_of_str
    
    def assembly_phrase(self,phrase,*args,**kwargs):
        '''Creates new phrases with => before_part + pattern + after_part 
        for each combination in new_patterns method [pronome reto -> verbo -> pronome reto] 
        or [pronome reto -> pronome obliquo -> verb]'''
 
        new_phrase_gr = phrase[0]
        new_phrase_gi = phrase[1]
        combination_gr = kwargs.get('combination_gr')
        combination_gi = kwargs.get('combination_gi')
        
        if kwargs.get('search_pattern'): 
            search_pattern = kwargs.get('search_pattern')  
            # Transforms search_pattern in a string  
            search_pattern_to_string = self.for_string(search_pattern = search_pattern)
            
        try:
            # Gets all patterns in GI
            gi_verbs = re.findall(self._gi_pattern, new_phrase_gi)
            
            # Creates new phrases with new directionalities elements for both gr and gi
            for i, part_string in enumerate(search_pattern_to_string):
                
                new_phrase_gr = re.sub(part_string,combination_gr[i], new_phrase_gr)
                new_phrase_gi = re.sub(gi_verbs[i], combination_gi[i], new_phrase_gi)
        
        except Exception as e:
            print(f'Error assembly_phrase regex.\n',e)
            
        return (new_phrase_gr,new_phrase_gi)
    
    def augmentation(self, gr_gi_tuple, max_new_sentences=50):
        """[Does the augmentation for both gr, gi]
        
        Arguments:
            gr_gi_tuple {(string, string)} -- [tuple containing (gr_phrase, gi_phrase)]
        
        Keyword Arguments:
            max_new_sentences {int} -- [max_new_sentencess the amount of new phrases generated for
                            each (gr, gi)] (default: {50})
        
        Returns:
            [list] -- [New phrases generated]
        """        
        new_patterns_gr = []
        new_patterns_gi =[]
        new_phrases = []
        phrase_gr = gr_gi_tuple[0]

        # gets both [pronome reto -> verbo -> pronome reto]  
        # and [pronome reto -> pronome obliquo -> verbo]
        search_pattern_agent_verb , search_pattern_gi = self.find_pattern(gr_gi_tuple)   
        
        #* [pronome reto -> verbo -> pronome reto]
        # [pronome reto -> pronome obliquo -> verbo]
        if search_pattern_agent_verb and search_pattern_gi:
            
            # find all verbs in GI 
            gi_verbs = re.findall(self._gi_verb_pattern, gr_gi_tuple[1])
            # Creates new phrases for each pattern
            # [pronome reto -> pronome obliquo -> verbo]
            for i, patterns_in_phrase in enumerate(search_pattern_agent_verb):
                gr,gi = self.new_patterns(patterns_in_phrase, 
                                          gi_verbs[i])
                new_patterns_gr.append(gr)
                new_patterns_gi.append(gi)            


            # Combinating patterns of the newly created phrases
            combination_patterns_gr = product(*new_patterns_gr)
            combination_patterns_gi = product(*new_patterns_gi)

            for combination_gr in combination_patterns_gr:
                # New Patterns for GI 
                combination_gi = next(combination_patterns_gi)

                # Generates new phrases [befores_string + pattern + after_string] 
                # if the following pattern is encountered 
                # [pronome reto -> pronome obliquo -> verbo]
                # Generates new phrases [befores_string + pattern + after_string] 
                # if the following pattern is encountered 
                # [pronome reto -> verbo -> pronome reto]
                new_phrases.append(self.assembly_phrase(gr_gi_tuple,
                                                        search_pattern = search_pattern_agent_verb,
                                                        combination_gr = combination_gr,
                                                        combination_gi = combination_gi))
            try:
                new_phrases.remove(gr_gi_tuple)
            except Exception as e:
                #print(e)
                pass
            finally:
                # Shuffle to return {max_new_sentences ( default_val = 50 )} random phrases
                random.shuffle(new_phrases)
                new_phrases = new_phrases[:max_new_sentences]
                new_phrases.insert(0,gr_gi_tuple)
                return new_phrases
        else:
            new_phrases.insert(0,gr_gi_tuple)
            return new_phrases
                    
               
    
    def process(self,data, max_new_sentences = 50):
        # Used to do the augmentation in phrases
        new_phrases = []
        for phrase in data:
            new_phrases.extend(self.augmentation(phrase, max_new_sentences = max_new_sentences))
        return new_phrases


# Main

Já é possível criar as frases GR para os dois padrões descritos como: [agente -> verbo -> receptor] e [agente -> pronome -> verbo]

## Testando com dados reais

In [248]:
_valid_chars ='A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇa-záéíóúàâêôãõüç_'
_high_valid_chars = '['+_valid_chars+']+'

#GI pattern filter 
_gi_verb_pattern = '[1-3][SP]_('+_high_valid_chars+')_[1-3][SP]'
_gi_pattern = '[1-3][SP]_'+_high_valid_chars+'_[1-3][SP]'

# pronome reto
_agent_pattern = r'\b(EU|TU|ELE|ELA|NÓS|VÓS|ELES|ELAS)\b'

# verbs
_verb_pattern = r'\b({0}R|DO_QUE)\b'.format(_high_valid_chars)

# pronome obliquo atono
_pronoun_pattern = r'\b(ME|TE|LHE|VOS|NOS|LHES)\b'

# both patterns -> [pronome reto, verbo, pronome reto] 
# e [pronome reto, pronome obliquo atono, verbo]
_pattern_agent_verb = "%s %s %s" %(_agent_pattern, _verb_pattern, _agent_pattern)
_pattern_pronoun_verb = "%s %s %s" %(_agent_pattern, _pronoun_pattern, _verb_pattern)


### Dados equipe traducão vlibras 2019
 ``datasets/CorpusCondensadoGeral/v1``

In [249]:
test_data = pd.read_csv('test_data/gr_gi.csv')
test_data.columns = ['gr','gi']

In [250]:
# Pegando as frases que possuem a necessidade de augmentation
# [agente -> verbo -> receptor] and [agente -> pronome -> verbo]
cont = 0
phrases_augmentation = []
for i in range (len(test_data)):
    try:
        phrase = (test_data['gr'][i],test_data['gi'][i])
        
        search_pattern_gr, search_pattern_gi = da.find_pattern(phrase)
        
        if search_pattern_gr and search_pattern_gi:
            phrases_augmentation.append(phrase)
    except Exception as e:
        print(e)

find_pattern: expected string or bytes-like object
(nan, 'DESTACAR TAMBÉM PLANURA MONTE ARTIFICIAL NÃO ABAULAR PERFEITO NIVELADO LEMBRAR TERRENO PÁTEO  AMPLO SEM TERRA COBRIR_LOCAL [PONTO]')


## phrases_augmentation

In [251]:
with open ('phrases_augmentation.txt','w') as file:
    
    for phrase in phrases_augmentation:
        file.write(f'{phrase}\n')
phrases_augmentation

[('EU TE CONHECER [PONTO]', '1S_CONHECER_2S [PONTO]'),
 ('EU LHE DAR BONECA [PONTO]', '1S_DAR_2S BONECA [PONTO]'),
 ('EU TE DAR CADERNO [PONTO]', '1S_DAR_2S CADERNO [PONTO]'),
 ('EU LHE DAR CADERNO [PONTO]', '1S_DAR_2S CADERNO [PONTO]'),
 ('EU LHE DAR CANETA [PONTO]', '1S_DAR_2S CANETA [PONTO]'),
 ('EU LHE DAR LIVRO [PONTO]', '1S_DAR_2S LIVRO [PONTO]'),
 ('EU TE MOSTRAR CIDADE [PONTO]', '1S_MOSTRAR_2S CIDADE [PONTO]'),
 ('EU LHE PEDIR QUE RECONSIDERAR [PONTO]', '1S_PEDIR_2S RECONSIDERAR [PONTO]'),
 ('EU PEDIR ELES PARA CONSERTAR MEU CARRO [PONTO]',
  '1S_PEDIR_3S CONSERTAR MEU CARRO [PONTO]'),
 ('ELA LHE PERGUNTAR ONDE ELE MORAR [PONTO]',
  '1S_PERGUNTAR_2S MORAR  [PONTO]'),
 ('EU ME PERGUNTAR PORQUÊ ÔNIBUS ESTAR ATRASADO [PONTO]',
  '1S_PERGUNTAR_2S PORQUE ÔNIBUS ATRASAR [PONTO]'),
 ('EU TE SEGUIR [PONTO]', '1S_SEGUIR_2S [PONTO]'),
 ('EU TE VER MEUS SONHO [PONTO]', '1S_VER_2S MEU SONHO [PONTO]'),
 ('EU VER ELA OUTRO DIA [PONTO]', '1S_VER_3S DIA ANTES_PASSADO [PONTO]'),
 ('EU VER ELA V

In [254]:
# Total de frases [agente -> pronome -> verbo] [agente -> verbo -> receptor]
len(phrases_augmentation)

51

In [255]:
# Frases [agente -> pronome -> verbo] [agente -> verbo -> receptor]
phrases_augmentation[random.randint(0,len(phrases_augmentation))]

('EU ME PERGUNTAR PORQUÊ ÔNIBUS ESTAR ATRASADO [PONTO]',
 '1S_PERGUNTAR_2S PORQUE ÔNIBUS ATRASAR [PONTO]')

## Teste com frases para augmentation encontradas

In [274]:
da = Directionality_Augmentation()
new_phrases = da.process(phrases_augmentation,max_new_sentences = 555555)
len(new_phrases)

2592