## Terminology - Project
Authors: Cécile MACAIRE & Ludivine ROBERT 

## Libraries

In [1]:
import pandas as pd
import spacy
spacy_nlp = spacy.load('en_core_web_sm')

## Code 

In [2]:
# Read data from lexicon
def read_data(file):
    """Load the tab-separated lexicon ``file`` into a pandas DataFrame.

    ``file`` is handed straight to ``pandas.read_csv`` with a tab separator;
    the first row is used as the header (pandas default).
    """
    lexicon = pd.read_csv(file, sep='\t')
    return lexicon

def lemma_lexicon(dataframe):
    """Lemmatize the lexicon terms with spaCy.

    Every entry of the ``pilot`` column is lower-cased, parsed with the
    module-level ``spacy_nlp`` pipeline, and the lemmas of its tokens are
    joined with single spaces.  The (same) ``dataframe`` object is returned.

    NOTE(review): the value returned by ``Series.replace`` below is discarded,
    so the computed lemmas do not appear to be written back and the ``lemma``
    column keeps whatever the lexicon file contained.  Before turning this
    into an assignment, confirm the intended format: the computed strings put
    spaces around hyphens (one spaCy token per word), whereas ``rules`` in
    this file normalises lexicon lemmas by splitting on '-' and ' '.
    """
    # Requires string entries in 'pilot' (``.lower()`` is called on each one).
    terms = dataframe['pilot']
    lemma = []
    for el in terms:
        doc = spacy_nlp(el.lower())
        tmp = [token.lemma_ for token in doc]
        lemma.append(' '.join(tmp))
    # Result not assigned; also requires an existing 'lemma' column.
    dataframe['lemma'].replace(lemma)
    return dataframe
  
def select_data(dataframe):
    """Return only the ``pattern``, ``pilot`` and ``lemma`` columns, in that order."""
    kept_columns = ['pattern', 'pilot', 'lemma']
    return dataframe[kept_columns]

In [40]:
# Extract text
def read_file(file):
    """Return the full content of the text file at path ``file``.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so articles containing non-ASCII characters
    (for example the typographic apostrophe) are read the same way on every
    machine.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(file, 'r', encoding='utf-8') as f:
        return f.read()
        
def lemma_posttag(file):
    """Parse a text file with spaCy and map its POS tags to lexicon patterns.

    The text is parsed twice with the module-level ``spacy_nlp`` pipeline:
    once as written (to keep the original token spelling) and once
    lower-cased (for lemmas and POS tags).

    Returns a DataFrame with one row per token and the columns ``tokens``
    (original spelling), ``tokens_lower``, ``lemma``, ``pos`` (spaCy coarse
    tag) and ``pattern`` (N, V, A, C, P, or '' for any other tag).

    NOTE(review): the columns come from two separate parses, so building the
    DataFrame presumably requires both parses to yield the same number of
    tokens - confirm this holds for the corpus.
    """
    # spaCy coarse POS tag -> one-letter pattern used by the lexicon.
    pos_to_pattern = {
        'NOUN': 'N',
        'PROPN': 'N',
        'VERB': 'V',
        'ADJ': 'A',
        'CCONJ': 'C',
        'SCONJ': 'C',
        'PART': 'P',
        'ADP': 'P',
    }
    text = read_file(file)
    cased_doc = spacy_nlp(text)
    lowered_doc = spacy_nlp(text.lower())
    return pd.DataFrame({
        'tokens': [tok.text for tok in cased_doc],
        'tokens_lower': [tok.text for tok in lowered_doc],
        'lemma': [tok.lemma_ for tok in lowered_doc],
        'pos': [tok.pos_ for tok in lowered_doc],
        'pattern': [pos_to_pattern.get(tok.pos_, '') for tok in lowered_doc],
    })

In [41]:
def rules(terms_dataframe, text_dataframe):
    """Bracket hyphenated lexicon terms and generic head nouns in the text.

    Annotation is done by prefixing '[' to the first token of a match and
    appending ']' to its last token in the ``tokens`` column of
    ``text_dataframe``, which is updated in place.  Nothing is returned.

    Args:
        terms_dataframe: lexicon with a ``lemma`` column of strings; each
            lemma is split on '-' and ' ' into words.
        text_dataframe: one row per text token, with the columns ``lemma``,
            ``pattern`` and ``tokens`` (rows are addressed by position).

    Rules applied at every token position ``i``:
        * Case 1 - 3-word term written ``w0-to-w2`` or ``w0-of-w2``
          (ex: text-to-speech); a noun directly after it is included.
        * Case 2 - term whose first two words are written ``w0-w1``
          (ex: encoder-decoder) and that is followed by as many nouns as the
          term has remaining words (0, 2 or 3); plus the fixed rule
          "noun + front-end".
        * Generic heads - data, datum, speaker, dataset, database, feature,
          corpus, language are merged with a preceding noun/adjective, or
          else with a following noun.

    All look-ahead/look-behind reads are bounds-checked, so matches close to
    the start or the end of the text can no longer raise ``KeyError``.
    """
    # Give lexicon terms the same one-word-per-item layout as the text rows.
    term_words = [' '.join(term.split('-')).split(' ')
                  for term in terms_dataframe['lemma']]
    lemmas = text_dataframe['lemma'].tolist()
    patterns = text_dataframe['pattern'].tolist()
    # Work on a plain list and write the column back once at the end:
    # chained assignment (df['tokens'][i] = ...) is not guaranteed by pandas
    # to modify the dataframe.
    tokens = text_dataframe['tokens'].tolist()
    size = len(lemmas)
    generic_heads = {'data', 'datum', 'speaker', 'dataset', 'database',
                     'feature', 'corpus', 'language'}

    def lemma_at(pos):
        """Lemma at ``pos``, or None when ``pos`` is outside the text."""
        return lemmas[pos] if 0 <= pos < size else None

    def is_noun(pos):
        """True when ``pos`` is inside the text and tagged 'N'."""
        return 0 <= pos < size and patterns[pos] == 'N'

    def bracket(first, last):
        """Open a bracket on token ``first`` and close it on token ``last``."""
        tokens[first] = '[' + tokens[first]
        tokens[last] = tokens[last] + ']'

    for i, token in enumerate(lemmas):
        # Every hyphen rule needs a '-' right after the current token.
        if lemma_at(i + 1) == '-':
            # "<noun> front-end" does not depend on the term being examined;
            # it is still applied once per qualifying term, as before (the
            # duplicated brackets are collapsed when the text is rebuilt).
            front_end_here = (token == 'front'
                              and lemma_at(i + 2) == 'end'
                              and is_noun(i - 1))
            for t in term_words:
                if len(t) == 3:
                    # Case 1: w0 - (to|of) - w2.  The connector test is
                    # grouped explicitly; without parentheses 'and' binds
                    # tighter than 'or' and half of the checks were skipped.
                    if (token == t[0]
                            and lemma_at(i + 2) in ('to', 'of')
                            and lemma_at(i + 3) == '-'
                            and lemma_at(i + 4) == t[2]):
                        # Include the next token when it is a noun
                        # (ex: text-to-speech synthesis), else the term alone.
                        bracket(i, i + 5 if is_noun(i + 5) else i + 4)
                    # NOTE: 3-word terms never reach Case 2, so a hyphenated
                    # pair followed by one noun (ex: cross-lingual adaptation)
                    # is not bracketed by this function.
                elif len(t) >= 2:
                    # Case 2
                    if front_end_here:
                        bracket(i - 1, i + 2)
                    if token == t[0] and lemma_at(i + 2) == t[1]:
                        # Words of the term beyond the hyphenated pair must
                        # each be matched by a noun in the text:
                        #   5 words -> 3 nouns (ex: HMM-based generation synthesis approach)
                        #   4 words -> 2 nouns (ex: HMM-based generation synthesis)
                        #   2 words -> nothing (ex: mel-spectrogram)
                        trailing = len(t) - 2
                        if trailing <= 3 and all(is_noun(i + 2 + k)
                                                 for k in range(1, trailing + 1)):
                            bracket(i, i + 2 + trailing)
        if token in generic_heads:
            if i > 0 and patterns[i - 1] in ('N', 'A'):
                # Preceding noun or adjective (ex: training data).
                bracket(i - 1, i)
            elif is_noun(i + 1):
                # Otherwise a following noun (ex: speaker adaptation).
                bracket(i, i + 1)

    text_dataframe['tokens'] = tokens

In [115]:
# Head nouns that are merged with a preceding single-word term (Case 4 below).
rule_4 = ['system', 'model', 'synthesis', 'translation', 'recognition', 'signal', 'research', 'processing', 'conversion', 'technique', 'accuracy', 'synthesizer']
def annotate(terms_dataframe, text_dataframe):
    """Annotate the lexicon terms found in the text with brackets.

    ``rules`` is applied first, then every lexicon lemma (split on spaces) is
    compared with the text lemmas at each position.  A match is marked by
    prefixing '[' to its first token and appending ']' to its last token in
    the ``tokens`` column.

    Args:
        terms_dataframe: lexicon with a ``lemma`` column of strings.
        text_dataframe: one row per token with ``lemma`` and ``tokens``
            columns (plus whatever ``rules`` needs); rows are addressed by
            position.

    Returns:
        ``text_dataframe`` itself, with its ``tokens`` column updated.
    """
    rules(terms_dataframe, text_dataframe)  # apply rules
    split_terms = [term.split(' ') for term in terms_dataframe['lemma']]
    lemmas = text_dataframe['lemma'].tolist()
    # Edit a plain list and write it back once; chained assignment on the
    # dataframe is not guaranteed by pandas to take effect.
    tokens = text_dataframe['tokens'].tolist()
    size = len(lemmas)

    def bracket(first, last):
        """Open a bracket on token ``first`` and close it on token ``last``."""
        tokens[first] = '[' + tokens[first]
        tokens[last] = tokens[last] + ']'

    for i, token in enumerate(lemmas):
        for term in split_terms:
            if token != term[0]:
                continue
            length = len(term)
            if 2 <= length <= 4:
                # Cases 1-3: every word of the term must match the text in
                # sequence.  The slice is shorter than the term near the end
                # of the text, so no explicit bounds check is needed and a
                # term ending on the very last token is still found.
                # The position is deliberately NOT advanced after a match:
                # doing so only shifted the index used for the remaining
                # terms while ``token`` stayed the same.
                if lemmas[i:i + length] == term:
                    bracket(i, i + length - 1)
            # Case 4: "sequence of <term>" is annotated as a whole
            elif i > 1 and lemmas[i - 1] == 'of' and lemmas[i - 2] == 'sequence':
                bracket(i - 2, i)
            elif length == 1:
                # Single-word term: include the next word when it is one of
                # the rule_4 head nouns, otherwise annotate the word alone.
                if i + 1 < size and lemmas[i + 1] in rule_4:
                    bracket(i, i + 1)
                else:
                    bracket(i, i)

    text_dataframe['tokens'] = tokens
    return text_dataframe

In [116]:
def construct_annotated_text(text_dataframe):
    """Rebuild the text from the ``tokens`` column, keeping only outer brackets.

    Tokens are joined with single spaces.  Within a run of nested brackets
    only the first '[' and the ']' that balances it are kept.  Afterwards
    adjacent annotations ('] [') are merged and the spaces introduced around
    punctuation, hyphens and parentheses are removed.
    """
    joined = ' '.join(text_dataframe['tokens'].to_list())
    opened = 0    # '[' seen since the last emitted ']'
    dropped = 0   # ']' skipped since the last emitted ']'
    kept = []
    for char in joined:
        if char == '[':
            opened += 1
            if opened == 1:
                kept.append(char)
        elif char == ']':
            if opened - 1 == dropped:
                kept.append(char)
                opened = 0
                dropped = 0
            else:
                dropped += 1
        else:
            kept.append(char)
    text = ''.join(kept)
    # Order matters: each replacement works on the result of the previous one.
    cleanups = (
        ('] [', ' '),
        (' .', '.'),
        (' ’', '’'),
        (' ,', ','),
        (' - ', '-'),
        ('( ', '('),
        (' )', ')'),
    )
    for old, new in cleanups:
        text = text.replace(old, new)
    return text

In [118]:
# Script entry point: runs the whole annotation pipeline on one article and
# prints the annotated text.  Both input paths are hard-coded.
if __name__== "__main__":
    """Main -> to modify by putting all steps in one fonction"""
    # 1. Load the lexicon, run the lemmatization step and keep the
    #    pattern / pilot / lemma columns.
    init_data = read_data('tts-lexicon4.tsv')
    change_lemma = lemma_lexicon(init_data)
    data = select_data(change_lemma)
    # 2. Parse the article into a one-row-per-token dataframe.
    text_dataframe = lemma_posttag('/home/macaire/Bureau/M2_NLP/Terminology/terminology_project/tts-articles/txt/11.txt')
#     text_dataframe = lemma_posttag('test2.txt')
    # 3. Bracket the terms; text_dataframe is updated in place, so the
    #    return value is not needed here.
    annotate(data, text_dataframe)
    # Debugging helpers kept from development:
#     text_dataframe.to_csv(r'pandatext.txt', header=None, index=None, sep=' ', mode='a')
#     print(text_dataframe.tail(50))
#     print(data.head(50))
#     print(' '.join(text_dataframe['tokens'].to_list()))
    # 4. Rebuild and print the annotated text.
    print(construct_annotated_text(text_dataframe))

[Grapheme-to-Phoneme Conversion] with [Convolutional Neural Networks] 
          


 Abstract : [Grapheme-to-phoneme] ([G2P]) conversion is the process of generating pronunciation for words based on their written form. 
 It has a highly essential role for [natural language processing], [text-to-speech synthesis] and [automatic speech recognition] systems. 
 In this paper, we investigate [convolutional neural networks] (CNN) for [G2P conversion]. 
 We propose a novel CNN-based [sequence-to-sequence] ([seq2seq]) architecture for [G2P conversion]. 
 Our approach includes an [end-to-end CNN G2P conversion] with [residual connections] and, furthermore, a model that utilizes a [convolutional neural network] (with and without [residual connections]) as encoder and [Bi-LSTM] as a decoder. 
 We compare our approach with state-of-the-art methods, including [Encoder-Decoder LSTM] and [Encoder-Decoder Bi-LSTM]. 
 Training and inference times, [phoneme] and [word error rates] were evaluated on the 