## Terminology - Project
Authors: Cécile MACAIRE & Ludivine ROBERT 

## Libraries

In [1]:
import pandas as pd
import spacy
spacy_nlp = spacy.load('en_core_web_sm')

## Code 

In [2]:
# Read data from lexicon
def read_data(file):
    """Load the tab-separated lexicon into a pandas DataFrame.

    `file` is handed unchanged to `pandas.read_csv`, so it may be a path or
    any file-like object that `read_csv` accepts.
    """
    lexicon = pd.read_csv(file, sep='\t')
    return lexicon

def select_data(dataframe):
    """Lemmatize the lexicon terms with spaCy.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Lexicon table; its 'pattern' and 'pilot' columns are read.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns 'pattern', 'pilot' and 'lemma'. Each 'lemma'
        value is the space-joined lemmas of the lower-cased 'pilot' term, with
        the ' - ' produced by tokenising a hyphen collapsed back to '-'.
    """
    lemma = []
    for el in dataframe['pilot']:
        doc = spacy_nlp(el.lower())
        joined = ' '.join(token.lemma_ for token in doc)
        # Normalise the entry *before* storing it. Re-normalising the list
        # ahead of each append left the last term with ' - ' and rebuilt the
        # whole list on every iteration.
        lemma.append(joined.replace(' - ', '-'))
    df = pd.DataFrame({'pattern': dataframe['pattern'], 'pilot': dataframe['pilot'], 'lemma': lemma})
    return df

In [3]:
# Extract text
def read_file(file, encoding='utf-8'):
    """Return the whole content of a text file as one string.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so that the
        result does not depend on the platform's default encoding (the
        articles contain non-ASCII characters such as the typographic
        apostrophe).

    Returns
    -------
    str
        The decoded file content.
    """
    with open(file, 'r', encoding=encoding) as f:
        return f.read()
        
def lemma_posttag(file):
    """Tokenise a text file and map spaCy POS tags to the lexicon's pattern letters.

    The text is run through spaCy twice: once as-is (to keep the original
    token spelling) and once lower-cased (for lemmas and POS tags).

    Returns a DataFrame with one row per token and the columns:
    'tokens' (original spelling), 'tokens_lower', 'lemma', 'pos' (spaCy
    coarse tag) and 'pattern' ('N', 'V', 'A', 'C', 'P' or '' when the tag has
    no lexicon equivalent).
    """
    # spaCy coarse POS tag -> single-letter pattern used by the lexicon.
    pos_to_pattern = {
        'NOUN': 'N',
        'PROPN': 'N',
        'VERB': 'V',
        'ADJ': 'A',
        'CCONJ': 'C',
        'SCONJ': 'C',
        'PART': 'P',
        'ADP': 'P',
    }

    text = read_file(file)
    cased_doc = spacy_nlp(text)
    lowered_doc = spacy_nlp(text.lower())

    original = [token.text for token in cased_doc]
    lowered_tokens = []
    lemmas = []
    pos_tags = []
    patterns = []
    for token in lowered_doc:
        lowered_tokens.append(token.text)
        lemmas.append(token.lemma_)
        pos_tags.append(token.pos_)
        patterns.append(pos_to_pattern.get(token.pos_, ''))

    # NOTE(review): this relies on both spaCy passes yielding the same number
    # of tokens; pandas raises if the column lengths differ.
    frame = pd.DataFrame({
        'tokens': original,
        'tokens_lower': lowered_tokens,
        'lemma': lemmas,
        'pos': pos_tags,
        'pattern': patterns,
    })
    return frame

In [4]:
def rules(terms_dataframe, text_dataframe):
    """Bracket hyphenated terms and a few head-noun compounds in the text.

    Parameters
    ----------
    terms_dataframe : pandas.DataFrame
        Lexicon; only its 'lemma' column is read.
    text_dataframe : pandas.DataFrame
        One row per token with the columns 'tokens', 'lemma' and 'pattern'.
        Rows are addressed by position.

    Returns
    -------
    None
        The 'tokens' column of `text_dataframe` is replaced in place: the
        first token of a detected term gets a leading '[' and the last one a
        trailing ']'. The same span may be bracketed several times; nested
        brackets are expected to be collapsed later.

    Notes
    -----
    Look-ahead positions beyond the end of the text are treated as "no match"
    instead of raising, and the column is written back in one assignment
    rather than through chained indexing (`df['tokens'][i] = ...`), which is
    not guaranteed to modify the frame.
    """
    # Nouns that form a term together with a neighbouring noun/adjective.
    head_nouns = {'data', 'datum', 'speaker', 'dataset', 'database',
                  'feature', 'corpus', 'language', 'voice'}

    # Give the terms the same shape as the text: hyphens are separate tokens
    # there, so split every term on both '-' and ' '.
    new_terms = [term.replace('-', ' ').split(' ') for term in terms_dataframe['lemma']]

    lemmas = text_dataframe['lemma'].tolist()
    patterns = text_dataframe['pattern'].tolist()
    tokens = text_dataframe['tokens'].tolist()
    size = len(lemmas)

    def lemma_at(k):
        """Lemma at position k, or None when k is outside the text."""
        return lemmas[k] if 0 <= k < size else None

    def pattern_at(k):
        """Pattern letter at position k, or None when k is outside the text."""
        return patterns[k] if 0 <= k < size else None

    def mark(start, end):
        """Open a bracket on token `start` and close it on token `end`."""
        tokens[start] = '[' + tokens[start]
        tokens[end] = tokens[end] + ']'

    for i, token in enumerate(lemmas):
        for t in new_terms:
            # Case 1: three-part hyphenated term (ex: text-to-speech),
            # i.e. tokens i .. i+4 are  t[0] - <to|of|noun> - t[2].
            if len(t) == 3 and size >= i + 5:
                if (token == t[0]
                        and lemma_at(i + 1) == '-'
                        and (lemma_at(i + 2) in ('to', 'of') or pattern_at(i + 2) == 'N')
                        and lemma_at(i + 3) == '-'
                        and lemma_at(i + 4) == t[2]):
                    if pattern_at(i + 4) in ('N', 'A') and pattern_at(i + 5) == 'N':
                        # the token after the term is a noun: include it
                        mark(i, i + 5)
                    elif pattern_at(i + 4) == 'N':
                        # the term alone
                        mark(i, i + 4)
                    else:
                        mark(i, i)
            # Case 2: two-part hyphenated term (ex: encoder-decoder) followed
            # by 0, 1, 2 or 3 nouns.
            elif len(t) >= 2 and size >= i + 3 and i != 0:
                # "<noun> front-end" is handled regardless of the lexicon.
                # It is evaluated once per lexicon term, as before, so the
                # span can receive several nested brackets.
                if token == 'front' and lemma_at(i + 1) == '-' and lemma_at(i + 2) == 'end':
                    if pattern_at(i - 1) == 'N':
                        mark(i - 1, i + 2)
                if token == t[0] and lemma_at(i + 1) == '-' and lemma_at(i + 2) == t[1]:
                    if (pattern_at(i + 3) == 'N' and pattern_at(i + 4) == 'N'
                            and pattern_at(i + 5) == 'N'):
                        # followed by 3 nouns (ex: HMM-based generation synthesis approach)
                        mark(i, i + 5)
                    elif pattern_at(i + 3) in ('N', 'A') and pattern_at(i + 4) == 'N':
                        # followed by 2 nouns (ex: HMM-based generation synthesis)
                        mark(i, i + 4)
                    elif pattern_at(i + 3) == 'N':
                        # followed by 1 noun (ex: cross-lingual adaptation)
                        mark(i, i + 3)
                    else:
                        # followed by nothing (ex: mel-spectrogram)
                        mark(i, i + 2)
        # Head nouns: attach the preceding noun/adjective, otherwise the
        # following noun.
        if token in head_nouns and i != 0:
            if pattern_at(i - 1) in ('N', 'A'):
                mark(i - 1, i)
            elif pattern_at(i + 1) == 'N':
                mark(i, i + 1)

    # Single column assignment: updates the caller's frame reliably.
    text_dataframe['tokens'] = tokens

In [5]:
# Generic head nouns: when one of them directly follows a lexicon term, it is
# included in the annotated span (ex: "speech synthesis" + "system").
rule_4 = ['system', 'model', 'synthesis', 'translation', 'recognition', 'signal', 'research',
          'processing', 'conversion', 'technique', 'accuracy', 'synthesizer', 'architecture',
          'form', 'transcription', 'alignment', 'optimization', 'task', 'function', 'token',
          'activation', 'layer', 'experiment', 'output', 'representation', 'setting', 'control',
          'network', 'quality', 'file', 'learning', 'framework', 'transform', 'sequence', 'length',
          'tool', 'process', 'method']


def annotate(terms_dataframe, text_dataframe):
    """Annotate the terms of the text using the lexicon plus the extra rules.

    Parameters
    ----------
    terms_dataframe : pandas.DataFrame
        Lexicon; only its 'lemma' column is read (space-separated lemmas).
    text_dataframe : pandas.DataFrame
        One row per token with at least the columns 'tokens' and 'lemma'
        (`rules` additionally reads 'pattern'). Rows are addressed by
        position.

    Returns
    -------
    pandas.DataFrame
        `text_dataframe` itself, whose 'tokens' column now carries '[' on the
        first token and ']' on the last token of every detected term. Spans
        may be bracketed more than once.

    Notes
    -----
    A term only has to fit inside the text; the optional follower from
    `rule_4` is looked up safely, so a term ending the text is annotated
    instead of being skipped or raising. The column is written back in one
    assignment rather than through chained indexing.
    """
    rules(terms_dataframe, text_dataframe)  # apply rules first (updates 'tokens')

    split_terms = [term.split(' ') for term in terms_dataframe['lemma']]
    followers = set(rule_4)
    lemmas = text_dataframe['lemma'].tolist()
    tokens = text_dataframe['tokens'].tolist()
    size = len(lemmas)

    def lemma_at(k):
        """Lemma at position k, or None when k is outside the text."""
        return lemmas[k] if 0 <= k < size else None

    def mark(start, end):
        """Open a bracket on token `start` and close it on token `end`."""
        tokens[start] = '[' + tokens[start]
        tokens[end] = tokens[end] + ']'

    for i, token in enumerate(lemmas):
        for term in split_terms:
            n = len(term)
            # Cases 1-3: terms of 4, 3 or 2 words must match word for word;
            # a following rule_4 noun extends the span by one token.
            if 2 <= n <= 4:
                if lemmas[i:i + n] == term:
                    if lemma_at(i + n) in followers:
                        mark(i, i + n)
                    else:
                        mark(i, i + n - 1)
            # Case 4: single-word terms (longer terms only reach the first test)
            elif token == term[0] and i > 1 and lemmas[i - 1] == 'of' and lemmas[i - 2] == 'sequence':
                # "sequence of <term>"
                mark(i - 2, i)
            elif token == term[0] and n == 1 and lemma_at(i + 1) == ')':
                # term directly followed by ')'
                if lemma_at(i + 2) in followers:
                    # start one token earlier (clamped to the first token)
                    mark(max(i - 1, 0), i + 2)
                else:
                    mark(i, i)
            elif token == term[0] and n == 1:
                if lemma_at(i + 1) in followers:
                    mark(i, i + 1)
                else:
                    mark(i, i)

    # Single column assignment: updates the caller's frame reliably.
    text_dataframe['tokens'] = tokens
    return text_dataframe

In [6]:
def construct_annotated_text(text_dataframe):
    """Rebuild the annotated text from the 'tokens' column.

    Tokens are joined with single spaces, nested brackets are collapsed so
    that only the outermost '[' and its matching ']' remain, and the spacing
    around punctuation, hyphens and parentheses is tidied up.
    """
    joined = ' '.join(text_dataframe['tokens'].to_list())

    opened = 0   # '[' seen since the last emitted ']'
    closed = 0   # ']' swallowed since the last emitted ']'
    pieces = []
    for char in joined:
        if char == '[':
            # only the first opening bracket of a nest is kept
            if opened == 0:
                pieces.append(char)
            opened += 1
        elif char == ']':
            if closed == opened - 1:
                # this closes the outermost bracket: keep it and start over
                pieces.append(char)
                opened = 0
                closed = 0
            else:
                closed += 1
        else:
            pieces.append(char)
    result = ''.join(pieces)

    # Applied strictly in this order.
    clean_up = (
        ('] [', ' '),    # merge adjacent terms
        (' .', '.'),
        (' ’', '’'),
        (' ,', ','),
        (' - ', '-'),
        ('( ', '('),
        (' )', ')'),
        (']-[', '-'),    # merge terms joined by a hyphen
        ('.]', '].'),    # keep the final period outside the bracket
    )
    for old, new in clean_up:
        result = result.replace(old, new)
    return result

In [17]:
if __name__ == "__main__":
    # Main: runs the whole pipeline on one article.
    # TODO: gather all the steps in a single function.
    lexicon_path = 'tts-lexicon4.tsv'
    article_path = 'tts-articles/txt/2.txt'

    # 1. Load the lexicon and lemmatize its terms.
    init_data = read_data(lexicon_path)
    data = select_data(init_data)

    # 2. Tokenise / tag the article.
    text_dataframe = lemma_posttag(article_path)

    # 3. Mark the terms in text_dataframe, then print the rebuilt text.
    annotate(data, text_dataframe)
    print(construct_annotated_text(text_dataframe))

Current trends in [multilingual speech processing] 

 Abstract. 
 In this paper, we describe recent work at Idiap Research Institute in the domain of [multilingual speech processing] and provide some insights into emerging challenges for the [research community]. 
 [Multilingual speech processing] has been a topic of ongoing interest to the [research community] for many years and the field is now receiving renewed interest owing to two strong driving forces. 
 Firstly, technical advances in [speech recognition] and synthesis are posing new challenges and opportunities to researchers. 
 For example, [discriminative features] are seeing wide application by the [speech recognition] community, but additional issues arise when using [such features] in a [multilingual setting]. 
 Another example is the apparent convergence of [speech recognition] and [speech synthesis] technologies in the form of statistical parametric methodologies. 
 This convergence enables the investigation of new approa