In [1]:
# Keep this
import os
import pickle 
import numpy as np
import argparse
import  nltk 
import time
from nltk import Tree, word_tokenize
import numpy as np
from nltk.grammar import Production
import re
from PYEVALB import parser as evalbparser
# Here compute all util functions
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm
import random
import string
from collections import Counter , defaultdict
from itertools import product 

In [2]:
# Make sure the NLTK "punkt" data package is available locally.
# nltk.download returns a boolean, which the notebook displays as the cell result.
import nltk
nltk.download('punkt') # in requirements

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fatmamoalla/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [78]:
# Experiment configuration: split fractions and input file paths.
# NOTE: this cell calls get_all_corpus, split_train_test and get_vocabulary, which are
# defined in the "Helpers function" cell; on a fresh kernel that cell must be run first.
train_size, val_size, test_size = 0.8, 0.1, 0.1
path_corpus = 'sequoia-corpus+fct.mrg_strict'
path_polyglot = 'polyglot-fr.pkl'
## In main
print('Process corpus =====')
start =time.time()
corpus = get_all_corpus(path=path_corpus)
print('Corpus size =',len(corpus))
# Plain token lists (tree leaves), used for the vocabulary and the n-gram counts.
tokenized_corpus = [Tree.fromstring(sentence).leaves() for sentence in corpus.values()]
train, val, test = split_train_test(tokenized_corpus, train_size=train_size, val_size=val_size, test_size=test_size)
# Here the corpus contains tags: same split, but on the bracketed strings.
train_corpus, val_corpus , test_corpus = split_train_test(list(corpus.values()), train_size=train_size, val_size=val_size, test_size=test_size)
print( 'Train =',len(train),'Val =',len(val),'Test =',len(test))
end = time.time()
print('Finihed in :',end-start)
print('Build vocabulary =====')
start =time.time()
vocab = get_vocabulary (train)
print('Training Vocabulary size = ',len(vocab))
# Two-way mapping between a training word and its position in the sorted vocabulary.
word2id = {word: idx for (idx, word) in enumerate(vocab)}
id2word = dict(enumerate(vocab))
end = time.time()
print('Finihed in :',end-start)


Process corpus =====
Corpus size = 3099
Train = 2479 Val = 310 Test = 310
Finihed in : 1.1513080596923828
Build vocabulary =====
Training Vocabulary size =  8958
Finihed in : 0.06253600120544434


In [97]:
# Helpers function 

### Get corpus functions
def get_all_corpus(path= 'sequoia-corpus+fct.mrg_strict'):
    """Read a bracketed corpus file and strip hyphenated suffixes from node labels.

    Every non-blank line is treated as one sentence. The line is split on single
    spaces; each token that opens a node (starts with "(") is cut at its first
    hyphen, so "(NP-SUJ" becomes "(NP". All other tokens, including words that
    contain hyphens, are kept unchanged.

    Args:
        path: path of the UTF-8 encoded corpus file.

    Returns:
        dict mapping the zero-based line number to the cleaned line. Blank lines
        are skipped, so their line numbers do not appear as keys.
    """
    corpus = {}
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            stripped = line.rstrip()
            if not stripped:
                # A blank line carries no tree; indexing word[0] used to crash here.
                continue
            # startswith is safe on the empty tokens produced by doubled or leading
            # spaces, where word[0] raised IndexError.
            tokens = [tok.split("-")[0] if tok.startswith('(') else tok
                      for tok in stripped.split(" ")]
            corpus[i] = " ".join(tokens)
    return corpus
def split_train_test(corpus, train_size=0.8, val_size=0.1, test_size=0.1):
    """Cut ``corpus`` into three consecutive slices: train, validation, test.

    The order of ``corpus`` is kept (no shuffling). ``test_size`` is accepted but
    not used: the test slice is whatever remains after train and validation.

    Returns:
        (train, val, test) slices of ``corpus``.
    """
    total = len(corpus)
    first_cut = int(total * train_size)
    second_cut = int(total * (val_size + train_size))
    return corpus[:first_cut], corpus[first_cut:second_cut], corpus[second_cut:]

def get_vocabulary (train_corpus):
    """Return the sorted array of distinct items found in the given sentences.

    Args:
        train_corpus: iterable of sentences, each an iterable of tokens.

    Returns:
        numpy array produced by ``np.unique`` over all tokens.
    """
    all_tokens = [token for sentence in train_corpus for token in sentence]
    return np.unique(all_tokens)

### Get embeddings from polyglot
def get_fr_word_embedding(vocab,path = 'polyglot-fr.pkl'):
    """Load a pickled (words, embeddings) pair and return it as a word -> vector dict.

    Args:
        vocab: unused; kept so existing callers keep working.
        path: path of the pickle file. It must unpickle to a pair
            ``(words, embeddings)`` of equal-length sequences.

    Returns:
        dict mapping each word to its embedding, in the order stored in the file.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files from a
    # trusted source.
    with open(path, 'rb') as f:
        # Public API equivalent of pickle._Unpickler + .encoding = 'latin1'.
        words, embeddings = pickle.load(f, encoding='latin1')
    return dict(zip(words, embeddings))

### Distances definition and neighbors


def cosine_similarity(embed_vec,w2embed):
    """Cosine similarity between ``embed_vec`` and every vector stored in ``w2embed``.

    Returns:
        1-D array of scores, one per entry of ``w2embed`` in the dict's iteration order.
    """
    matrix = list(w2embed.values())
    dot_products = np.inner(matrix, embed_vec)
    query_norm = np.linalg.norm(embed_vec)
    row_norms = np.linalg.norm(matrix, axis=1)
    return dot_products / (query_norm * row_norms)

def closest_word_embed(word, w2embed, id2word_plyglot, embed_neigh=10):
    """Return the ``embed_neigh`` words whose embeddings are most similar to ``word``.

    Args:
        word: query word.
        w2embed: dict word -> embedding vector.
        id2word_plyglot: dict position -> word, indexed like the rows returned by
            ``cosine_similarity`` (i.e. the iteration order of ``w2embed``).
        embed_neigh: number of neighbours to return.

    Returns:
        List of words ordered from most to least similar, or ``[]`` when ``word``
        has no embedding.
    """
    if word not in w2embed:
        return []
    scores = cosine_similarity(w2embed[word], w2embed)
    # argsort is ascending: keep the tail, then reverse it to get best-first order.
    best_ids = np.argsort(scores)[-embed_neigh:][::-1]
    return [id2word_plyglot[position] for position in best_ids]

### OOV replacement 
def levenstein_distance(sent1,sent2):
    """Edit distance (insert / delete / substitute, unit costs) between two sequences.

    Returns:
        The distance as a numpy float taken from the last cell of the DP table.
    """
    rows, cols = len(sent1) + 1, len(sent2) + 1
    table = np.zeros((rows, cols))
    # Border cells: turning a prefix into the empty sequence costs its length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)
    for i, left in enumerate(sent1, start=1):
        for j, right in enumerate(sent2, start=1):
            substitution = table[i - 1, j - 1] + (0 if left == right else 1)
            deletion = table[i - 1, j] + 1
            insertion = table[i, j - 1] + 1
            table[i, j] = min(deletion, insertion, substitution)
    return table[rows - 1, cols - 1]

def levenstein_candidates(word,vocab,lev_neigh = 5):
    """Return the ``lev_neigh`` vocabulary entries closest to ``word`` in edit distance.

    The comparison is case-insensitive. Entries are ranked by (distance, entry),
    so ties on the distance are broken by the ordering of the entries themselves.

    Args:
        word: the out-of-vocabulary word.
        vocab: sequence (list or numpy array) of known words.
        lev_neigh: number of candidates to keep.

    Returns:
        List of vocabulary entries, closest first; ``[]`` for an empty vocabulary.
    """
    if len(vocab) == 0:
        # np.vectorize raised ValueError on size-0 input; an empty result is the
        # natural answer.
        return []
    target = word.lower()  # hoisted: it does not depend on the vocabulary entry
    distances = [levenstein_distance(target, entry.lower()) for entry in vocab]
    ranked = sorted(zip(distances, vocab))
    return [entry for _, entry in ranked[:lev_neigh]]

def update_vocab_embeddings(word2id, w2embed):
    """Restrict the vocabulary and the embedding table to the words they share.

    Args:
        word2id: dict word -> id.
        w2embed: dict word -> embedding.

    Returns:
        (inter_word2id, inter_w2embed): both dicts filtered to the common words.
        The ids are the original ones from ``word2id`` (they are not renumbered).
    """
    shared = word2id.keys() & w2embed.keys()
    inter_word2id = {w: word2id[w] for w in shared}
    inter_w2embed = {w: w2embed[w] for w in shared}
    # The embedding size was computed but never printed, leaving a dangling label.
    print('Intersection between vocab and embeddings', 'new_vocab_size=',len(inter_word2id), 'new_embed_size=',len(inter_w2embed))
    return inter_word2id, inter_w2embed
    
    
    

In [98]:
# !!! takes ~1-2min to run !!!
# Load the word -> embedding table and time the load.
print('Get embedding from polyglot Fr =====')
start =time.time()
w2embed = get_fr_word_embedding(vocab,path = path_polyglot)
# Position -> word, in the iteration order of w2embed (the same order in which
# cosine_similarity lays out its scores).
id2word_plyglot = dict(enumerate(w2embed.keys()))
end =time.time()
print('Finihed in :',end-start)
### Update vocab, embed

# Keep only the words present both in the training vocabulary and in the embeddings.
inter_word2id, inter_w2embed =update_vocab_embeddings(word2id, w2embed)

Get embedding from polyglot Fr =====
Finihed in : 95.75415396690369
Intersection between vocab and embeddings new_vocab_size= 6951 new_embed_size=


In [83]:
### OOV module
# Build the unigram vector and the bigram matrix from the tokenized training sentences.
# NOTE: uni_bi_grams is defined in a later cell; that cell must be run before this one.
print('Incorporating context: bigrams  =====')
unigrams, bigrams = uni_bi_grams(vocab, train, word2id)

# Only the unigram length is printed here (one entry per vocabulary word).
print('length of unigrams/bigrams:',len(unigrams))
# check 

Incorporating context: bigrams  =====
Build Unigrams from train ===
Build Bigrams from train ===
length of unigrams/bigrams: 8958


In [122]:

def uni_bi_grams(vocab, sentences, word2id):
    """Estimate unigram and add-one-smoothed bigram probabilities.

    Args:
        vocab: the vocabulary; only its length is used.
        sentences: iterable of token sequences; every token must be a key of ``word2id``.
        word2id: dict word -> row/column index.

    Returns:
        (unigrams, bigrams): ``unigrams[i]`` is the relative frequency of word i;
        ``bigrams[i, j]`` is the smoothed probability of word j following word i
        (every row sums to 1).
    """
    n = len(vocab)
    unigrams = np.zeros(n)
    # Start every bigram count at 1 (add-one smoothing) so no transition has probability 0.
    bigrams = np.ones((n, n))

    print('Build Unigrams from train ===')
    for sentence in sentences:
        for word in sentence:
            unigrams[word2id[word]] += 1
    total = np.sum(unigrams)
    if total > 0:
        # Guard against an empty corpus, which would otherwise divide by zero.
        unigrams /= total

    print('Build Bigrams from train ===')
    for sentence in sentences:
        # Pair each word with its true predecessor. Indexing sentence[i-1] at i == 0
        # wrapped around and counted a spurious (last word, first word) bigram.
        for previous, current in zip(sentence, sentence[1:]):
            bigrams[word2id[previous], word2id[current]] += 1

    bigrams /= np.sum(bigrams, axis=1)[:, None]
    return unigrams, bigrams

def score_context(idx, word, sentence,word2id,unigrams, bigrams,coef =0.2): # Process_word function
    """Log-score of ``word`` given the words already accepted before it.

    Args:
        idx: position of the word in the original sentence.
        word: candidate word; must be a key of ``word2id``.
        sentence: the words kept so far; its last element is used as the
            previous word.
        word2id: dict word -> index into ``unigrams`` / ``bigrams``.
        unigrams: 1-D array of unigram probabilities.
        bigrams: 2-D array, ``bigrams[prev, cur]``.
        coef: weight of the unigram term in the interpolation.

    Returns:
        ``log(unigram)`` when there is no previous word, otherwise
        ``log(coef * unigram + (1 - coef) * bigram)``.
    """
    word_id = word2id[word]
    # The prefix can be empty even when idx > 0 (earlier words may have been
    # dropped), so fall back to the unigram instead of indexing sentence[-1].
    if idx == 0 or len(sentence) == 0:
        return np.log(unigrams[word_id])
    previous_id = word2id[sentence[-1]]
    interpolated = coef * unigrams[word_id] + (1 - coef) * bigrams[previous_id, word_id]
    return np.log(interpolated)
    
def get_new_words(word,vocab,w2embed,word2id,unigrams, bigrams,id2word_plyglot,lev_neigh=10,embed_neigh=20):
    """Find replacement candidates that are close to ``word`` both in spelling and in embedding space.

    The candidates are the words appearing both among the ``embed_neigh`` nearest
    embedding neighbours and among the nearest Levenshtein neighbours. The
    Levenshtein neighbourhood starts at ``lev_neigh`` entries and grows by one,
    up to 20 attempts, until the intersection is non-empty.

    Args:
        word: the out-of-vocabulary word.
        vocab: known words, searched by edit distance.
        w2embed, id2word_plyglot: embedding table and its position -> word index.
        word2id, unigrams, bigrams: unused here; kept for interface compatibility.
        lev_neigh: initial size of the Levenshtein neighbourhood.
        embed_neigh: size of the embedding neighbourhood.

    Returns:
        A set of candidate words (empty when nothing matches).
    """
    max_iter = 20
    # The embedding neighbours do not depend on the loop: compute them once.
    embed_set = set(closest_word_embed(word, w2embed, id2word_plyglot, embed_neigh))
    if not embed_set:
        # No embedding for this word: the intersection can never become non-empty.
        return set()
    # levenstein_candidates returns a prefix of one sorted ranking, so the largest
    # neighbourhood contains every smaller one; rank once and widen the prefix.
    ranked = levenstein_candidates(word, vocab, lev_neigh + max_iter - 1)
    for extra in range(max_iter):
        candidates = embed_set.intersection(ranked[:lev_neigh + extra])
        if candidates:
            return candidates
    return set()
    


# In helpers
def OOV(sentence , vocab, w2embed,word2id,unigrams, bigrams,id2word_plyglot,lev_neigh=10,embed_neigh=20):
    """Rewrite a bracketed sentence so that it only contains vocabulary words.

    The leaves of the parsed ``sentence`` are scanned left to right. Known words
    are kept. For an unknown word, candidates come from ``get_new_words`` and the
    one with the highest ``score_context`` given the words kept so far replaces
    it. An unknown word with no candidate is dropped from the output.

    Args:
        sentence: bracketed tree string readable by ``Tree.fromstring``.
        vocab: known words.
        w2embed, id2word_plyglot: embedding table and its position -> word index.
        word2id, unigrams, bigrams: language-model data used for scoring.
        lev_neigh, embed_neigh: neighbourhood sizes forwarded to ``get_new_words``.

    Returns:
        The rewritten sentence as a space-separated string.
    """
    known_words = set(vocab)  # O(1) membership instead of scanning vocab per token
    replacement = []
    for idx, word in enumerate(Tree.fromstring(sentence).leaves()):
        if word in known_words:
            replacement.append(word)
            continue
        # Forward the caller's neighbourhood sizes (they used to be hard-coded to 10/20).
        candidates = get_new_words(word, vocab, w2embed, word2id, unigrams, bigrams,
                                   id2word_plyglot, lev_neigh=lev_neigh, embed_neigh=embed_neigh)
        scored = [(score_context(idx, candidate, replacement, word2id, unigrams, bigrams, coef=0.2), candidate)
                  for candidate in candidates]
        if scored:
            # Rank on the score first. Sorting [word, score] pairs ranked
            # alphabetically and ignored the score except on identical words.
            replacement.append(max(scored)[1])
    return " ".join(replacement)

In [109]:
# test : to delete
# Smoke test of the OOV module on the first held-out bracketed sentence.
sentence1 = test_corpus[0]
print(sentence1)

# The bare call on the last line makes the notebook display the rewritten sentence.
OOV(sentence1, vocab, w2embed,word2id,unigrams, bigrams,id2word_plyglot,lev_neigh=10,embed_neigh=20)



( (SENT (PONCT -) (NP (ADJ 19) (NC janvier) (NC 2004)) (PONCT :) (NP (DET le) (NC juge) (NPP Armand) (NPP Riberolles)) (VN (V clôt)) (NP (DET ses) (NC investigations) (COORD (CC et) (NP (DET l') (NC instruction) (PP (P de) (NP (DET l') (NC affaire) (PP (P+D des) (NP (NC HLM) (PP (P de) (NP (NPP Paris)))))))))) (PONCT .)))
['-', '19', 'janvier', '2004', ':', 'le', 'juge', 'Armand', 'Riberolles', 'clôt', 'ses', 'investigations', 'et', "l'", 'instruction', 'de', "l'", 'affaire', 'des', 'HLM', 'de', 'Paris', '.']
word in vocab =  -
word in vocab =  19
word in vocab =  janvier
word in vocab =  2004
word in vocab =  :
word in vocab =  le
word in vocab =  juge
word not in vocab =  Armand
New candidates are found
word not in vocab =  Riberolles
No replacement found
No replacment is found
word not in vocab =  clôt
No replacement found
No replacment is found
word in vocab =  ses
word in vocab =  investigations
word in vocab =  et
word in vocab =  l'
word in vocab =  instruction
word in vocab =  

"- 19 janvier 2004 : le juge Edmond ses investigations et l' instruction de l' affaire des HLM de Paris ."

In [143]:
# Almost ok : oov

In [390]:
def get_probabilities_lexicon(lexicon_list):
    """Build a row-normalised (left-hand side x lowercased word) count matrix.

    Args:
        lexicon_list: list of lexical productions (objects exposing ``lhs()`` and
            ``rhs()``), possibly with repetitions.

    Returns:
        2-D float array: one row per distinct left-hand side, one column per
        distinct lowercased right-hand-side word, each row divided by its sum.
        Rows and columns follow the sorted order produced by ``np.unique``; the
        index arrays themselves are not returned.
    """
    # Distinct productions and how many times each occurs.
    unique_pos, distinct = np.unique(lexicon_list, return_counts=True)
    # One [lhs, lowercased word] pair per distinct production.
    unique_pos = np.array([[pos.lhs(), pos.rhs()[0].lower()]  for pos in unique_pos])
    # Third column: the occurrence count as a float.
    pos_matrix = np.hstack((unique_pos,distinct.astype(np.float64).reshape(-1,1)))
    
    # x / y give, for every production, its row / column in the output matrix.
    NT_l, x = np.unique(pos_matrix[:, 0], return_inverse=True)
    NT_r, y = np.unique(pos_matrix[:, 1], return_inverse=True)
    
    l_size, r_size = len(NT_l),len(NT_r)
    probabilities_lexicon= np.zeros((l_size, r_size))
    # NOTE(review): this is an assignment, not an accumulation. Productions that differ
    # only by the case of the word share the same (x, y) cell after .lower(), so only
    # one of their counts is kept - confirm whether the counts should be summed.
    probabilities_lexicon[x, y] = pos_matrix[:, 2]
    # Normalise each row so that it sums to 1.
    probabilities_lexicon = probabilities_lexicon / np.sum(probabilities_lexicon,axis=1).reshape(-1,1)
    return probabilities_lexicon

In [None]:
# to delete
def transform_probabilities(prob_dict,word2id,pos2id):
    """Turn a sparse {(pos, word): value} dict into a row-normalised dense matrix.

    Args:
        prob_dict: dict keyed by ``(pos, word)`` pairs.
        word2id: dict word -> column index.
        pos2id: dict pos -> row index.

    Returns:
        Array of shape ``(len(pos2id), len(word2id))``. Each row is divided by its
        sum; rows whose sum is zero are returned as all zeros.
    """
    word_tag_prob = np.zeros((len(pos2id), len(word2id)))

    # One pass over the entries that exist, instead of probing every (pos, word) pair.
    for key, value in prob_dict.items():
        if not (isinstance(key, tuple) and len(key) == 2):
            continue
        pos, word = key
        if pos in pos2id and word in word2id:
            word_tag_prob[pos2id[pos], word2id[word]] = value

    row_sums = word_tag_prob.sum(axis=1, keepdims=True)
    # Divide only where the row sum is non-zero: this avoids the 0/0 RuntimeWarning
    # and the nan_to_num clean-up that used to follow.
    return np.divide(word_tag_prob, row_sums,
                     out=np.zeros_like(word_tag_prob), where=row_sums != 0)

In [391]:
def extract_lexicon(train_corpus):
    """Collect the lexical (tag -> word) rules of the training trees.

    Each bracketed sentence is parsed, binarised with ``chomsky_normal_form``
    (horzMarkov=2) and its unary chains are collapsed before the lexical
    productions are gathered.

    Returns:
        (lexical_grammar, probabilities_lexicon, pos2id):
        ``lexical_grammar`` maps a tag symbol to the set of lowercased words it
        produces, ``probabilities_lexicon`` is the matrix returned by
        ``get_probabilities_lexicon`` and ``pos2id`` numbers the tags in the
        order they were first seen.
    """
    axioms = set()  # root symbols; collected here but not returned
    lexical_productions = []
    for bracketed in train_corpus:
        tree = Tree.fromstring(bracketed, remove_empty_top_bracketing=True)
        tree.chomsky_normal_form(horzMarkov=2)
        tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        productions = tree.productions()
        axioms.add(productions[0].lhs().symbol())
        for production in productions:
            if production.is_lexical():
                lexical_productions.append(production)

    # Counter keeps distinct productions in first-occurrence order.
    lexical_grammar = defaultdict(set)
    for production in Counter(lexical_productions):
        tag = production.lhs().symbol()
        lexical_grammar[tag].add(production.rhs()[0].lower())

    probabilities_lexicon = get_probabilities_lexicon(lexical_productions)

    pos2id = {}
    for position, tag in enumerate(lexical_grammar):
        pos2id[tag] = position
    return lexical_grammar, probabilities_lexicon, pos2id
                
                
def PCFG_model(train_corpus)  :
    """Extract the non-lexical rules of the training trees and their row-normalised counts.

    Each bracketed sentence is parsed, binarised with ``chomsky_normal_form``
    (horzMarkov=2) and its unary chains are collapsed; the non-lexical
    productions are then collected.

    Returns:
        pcfg_grammar: Counter of the non-lexical productions.
        pcfg_grammar_dict: left-hand-side symbol -> set of right-hand sides
            (as returned by ``rhs()``).
        axioms: set of the left-hand-side symbol of the first production of each tree.
        pcfg: 2-D array, rows = distinct left-hand sides, columns = distinct
            right-hand sides, each row divided by its sum.
        NT_lhs: left-hand-side symbol -> row index of ``pcfg``.
        NT_rhs: column index of ``pcfg``, keyed by a pair of symbols for
            right-hand sides longer than one element, and by the bare symbol for
            right-hand sides of length one.
    """
    pcfg_grammar_dict =defaultdict(set)
    axioms = set()
    pcfg_list = []
    start = time.time()  # unused
    for sentence in train_corpus:
        tree = Tree.fromstring(sentence, remove_empty_top_bracketing=True)
        tree.chomsky_normal_form(horzMarkov=2)
        tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        prods = tree.productions()
        # The first production's left-hand side is taken as the root symbol.
        axioms.add(prods[0].lhs().symbol())
        pcfg_list.extend([prod for prod in prods if prod.is_nonlexical()])
    pcfg_grammar= Counter(pcfg_list) # keys are unique and counts
    rules_distinct = [[pos.lhs().symbol(),pos.rhs()] for pos in list(pcfg_grammar.keys())]
    rules_count_distincts =  list(pcfg_grammar.values())  # unused
    # Building pcfg grammar 
    for rule in rules_distinct:
        pcfg_grammar_dict[rule[0]].add(rule[1])   
    # Distinct productions with their occurrence counts.
    unique_rules, distinct = np.unique(pcfg_list, return_counts=True)
    # NOTE(review): each row pairs an lhs with an rhs tuple; presumably this yields an
    # object array with two columns - confirm for the numpy version in use.
    unique_rules = np.array([[rule.lhs(), rule.rhs()]  for rule in unique_rules])
    rules_matrix = np.hstack((unique_rules,distinct.astype(np.float64).reshape(-1,1)))
    # x / y give, for every rule, its row / column in the pcfg matrix.
    NT_l, x = np.unique(rules_matrix[:, 0], return_inverse=True)
    NT_r, y = np.unique(rules_matrix[:, 1], return_inverse=True)
    l_size, r_size = len(NT_l),len(NT_r)
    pcfg= np.zeros((l_size, r_size))
    pcfg[x, y] = rules_matrix[:, 2]
    # Normalise each row so that the rules of one left-hand side sum to 1.
    pcfg = pcfg / np.sum(pcfg,axis=1).reshape(-1,1)
    NT_lhs= {NT.symbol():idx for (idx,NT) in enumerate(NT_l)}
    # Only the first two symbols of a right-hand side are used to build its key.
    NT_rhs = {(NT[0].symbol(),NT[1].symbol()):idx for idx,NT in enumerate(NT_r) if len(NT)>1}
    for (idx,NT) in enumerate(NT_r):
        if len(NT)==1:
            NT_rhs[NT[0].symbol()]=idx
    return pcfg_grammar,pcfg_grammar_dict,axioms ,pcfg,NT_lhs,NT_rhs
    
    
    
    

In [646]:
# In main
# Build the lexicon and the rule tables from the bracketed training sentences, timing both steps together.
start = time.time()
lexical_grammar, probabilities_lexicon,pos2id = extract_lexicon(train_corpus) 
pcfg_grammar,pcfg_grammar_dict,axioms,pcfg,NT_lhs,NT_rhs  = PCFG_model(train_corpus) # pcfg_grammar_dict ~ 
end = time.time()
# this is ok
print('PCFG model finished in', end-start)

PCFG model finished in 9.752292156219482


In [666]:
# helpers for CYK
def build_binaries_unaries(pcfg_grammar_dict,NT_l,NT_r,pos2id):
    """Invert the binary rules and extend the non-terminal index with the POS tags.

    Args:
        pcfg_grammar_dict: dict left-hand side -> set of right-hand sides.
        NT_l: dict left-hand-side symbol -> index.
        NT_r: dict whose keys are right-hand sides; keys for which ``np.ndim`` is 0
            (scalars such as a bare symbol) are treated as unary targets, all
            others as binary targets.
        pos2id: dict POS tag -> index.

    Returns:
        (binaries_inv, new_nt_l):
        ``binaries_inv`` is a ``defaultdict(set)`` mapping each binary right-hand
        side to the set of left-hand sides that produce it;
        ``new_nt_l`` is a copy of ``NT_l`` to which every POS tag is added with
        its ``pos2id`` index shifted by ``len(NT_l)``. ``NT_l`` is not modified.
    """
    binaries_target = {target for target in NT_r.keys() if np.ndim(target) != 0}

    # NOTE(review): the intersection below only matches when the right-hand sides
    # stored in pcfg_grammar_dict and the keys of NT_r use the same representation
    # (same element type) - confirm against the function that builds them.
    binaries_inv = defaultdict(set)
    for lhs, targets in pcfg_grammar_dict.items():
        for target in targets & binaries_target:
            binaries_inv[target].add(lhs)

    ### reformat Non-terminal words
    new_nt_l = NT_l.copy()
    new_nt_l.update({nt: i + len(NT_l) for (nt, i) in pos2id.items()})
    return binaries_inv, new_nt_l
            



def get_word_tag_dict(lexical_grammar):
    """Invert a tag -> words mapping into a word -> tags mapping.

    Returns:
        ``defaultdict(set)`` mapping every word to the set of tags that list it.
    """
    tags_by_word = defaultdict(set)
    pairs = ((token, tag) for tag, tokens in lexical_grammar.items() for token in tokens)
    for token, tag in pairs:
        tags_by_word[token].add(tag)
    return tags_by_word
    

def process_sentence(sentence,probabilities_lexicon,pos2id,word2id,word2pos):
    """Initialise the chart tables for a space-separated sentence.

    Args:
        sentence: string of tokens separated by single spaces.
        probabilities_lexicon: 2-D array indexed as [pos2id[pos], word2id[word]].
        pos2id: dict tag -> row index.
        word2id: dict word -> column index (looked up with the token as written).
        word2pos: dict lowercased word -> iterable of tags.

    Returns:
        (score, score_left, score_right): three (p+1) x (p+1) tables, p being the
        number of tokens. ``score[i][i+1]`` maps each tag of token i to its
        lexicon value; ``score_left`` / ``score_right`` hold the tags of that cell
        that belong to ``left_dict`` / ``right_dict``.
    """
    # keep this
    tokens = sentence.split(' ')
    p=len(tokens)
    score = [[{} for i in range(p+1)] for j in range(p+1)]
    score_left = [[set() for i in range(p+1)] for j in range(p+1)]
    score_right = [[set() for i in range(p+1)] for j in range(p+1)]
    
    for idx, word in enumerate(tokens):
        # Tags are looked up on the lowercased token, the column index on the token as written.
        w = word.lower()
        for pos in word2pos[w]:#rule=POS, rule_to_word=lexicon_grammar_inv 
            # NOTE(review): assumes the rows/columns of probabilities_lexicon are ordered
            # like pos2id / word2id - TODO confirm against the code that builds the matrix.
            score[idx][idx+1][pos] = probabilities_lexicon[pos2id[pos],word2id[word]] 
            # NOTE(review): left_dict and right_dict are not parameters and are not defined
            # in the visible part of the notebook; they must exist at module level.
            if pos in left_dict: 
                score_left[idx][idx+1].add(pos)
            if pos in right_dict :
                score_right[idx][idx+1].add(pos)          
    return score, score_left,score_right    



def failure_msg(sentence):
    """Build the placeholder parse returned when no tree can be produced.

    Args:
        sentence: sequence of tokens.

    Returns:
        A bracketed string ``(SENT (NULL w1)(NULL w2)...)`` with one ``NULL`` node
        per token; ``(SENT )`` for an empty sequence.
    """
    # A single join treats every token alike and no longer indexes sentence[-1],
    # which raised IndexError on an empty token list.
    return '(SENT ' + ''.join('(NULL ' + word + ')' for word in sentence) + ')'



def reconstruct_tree(back_tags, start,end, tokens,axioms,score,NT,n):
    """
    Use dynamic programming to track back the tree : recursive implementation

    Args:
        back_tags: table where ``back_tags[i][j][symbol]`` is the
            ``(limit, left_symbol, right_symbol)`` back-pointer of ``symbol``
            over the span [i, j).
        start, end: bounds of the span to rebuild.
        tokens: list of the sentence tokens.
        axioms: collection (set or list) of admissible root symbols.
        score: table where ``score[i][j]`` maps a symbol to its score over [i, j).
        NT: symbol to expand over the span (re-selected among ``axioms`` at the root).
        n: number of tokens in the sentence.

    Returns:
        The bracketed string of the sub-tree, or ``failure_msg(tokens)`` when no
        admissible root exists.
    """
    if n == 1:
        # One-word sentence: pick the best-scoring axiom for the single cell.
        # axioms may be a set, so it is ordered first instead of being indexed.
        ordered_axioms = sorted(axioms)
        if not ordered_axioms:
            return failure_msg(tokens)
        NT = max(ordered_axioms, key=lambda axiom: score[start][end].get(axiom, 0))
        if 'SENT' not in NT:
            return failure_msg(tokens)
        return '(' + NT + ' ' + tokens[start] + ')'

    if end == start + 1:
        # Leaf: a single token under its tag.
        return '(' + NT + ' ' + tokens[start] + ')'

    if end == n + start:
        # Root span: choose the best axiom actually derived over the whole sentence.
        root_cell = back_tags[start][end]
        candidates = [symbol for symbol in root_cell if symbol in axioms]
        if not candidates:
            return failure_msg(tokens)
        NT = max(candidates, key=lambda symbol: score[start][end][symbol])
        # The root is labelled with the axiom whose back-pointer is followed.
        limit, lhs, rhs = root_cell[NT]
    else:
        limit, lhs, rhs = back_tags[start][end][NT]

    left_part = reconstruct_tree(back_tags, start, limit, tokens, axioms, score, lhs, n)
    right_part = reconstruct_tree(back_tags, limit, end, tokens, axioms, score, rhs, n)
    return '(' + NT + ' ' + left_part + ' ' + right_part + ')'
        
            
            
    
    
           
    
def unchomsky(parsing):
    """Undo the Chomsky-normal-form binarisation of a bracketed parse.

    Returns:
        The tree printed on a single line, with every run of whitespace in the
        pretty-printed form reduced to one space.
    """
    restored = Tree.fromstring(parsing)
    restored.un_chomsky_normal_form()
    pieces = restored.pformat().split()
    return ' '.join(pieces)
    

In [670]:
def CYK2(sentence,axioms, probabilities_lexicon,pos2id,word2id,lexical_grammar,pcfg,NT_lhs,NT_rhs,pcfg_grammar_dict):
    """Parse a space-separated sentence with a probabilistic CYK chart.

    Args:
        sentence: string of tokens separated by single spaces (not tokenized).
        axioms: admissible root symbols.
        probabilities_lexicon, pos2id, word2id, lexical_grammar: lexicon data
            used to fill the width-1 cells of the chart.
        pcfg: 2-D array indexed as ``pcfg[NT_lhs[A]][NT_rhs[(B, C)]]``.
        NT_lhs, NT_rhs: index dicts for ``pcfg``.
        pcfg_grammar_dict: dict left-hand side -> set of right-hand sides.

    Returns:
        (back_tags, normalized_result): the back-pointer table, where
        ``back_tags[start][end][A] = (limit, B, C)``, and the un-binarised
        bracketed parse.
    """
    word2pos = get_word_tag_dict(lexical_grammar)
    binaries_inv, _ = build_binaries_unaries(pcfg_grammar_dict,NT_lhs,NT_rhs,pos2id)
    # not tokenized
    score, score_left,score_right = process_sentence(sentence,probabilities_lexicon,pos2id,word2id,word2pos)
    tokens = sentence.split(' ')
    n = len(tokens)
    back_tags = [[dict() for i in range(n+1)] for k in range(n+1)]
    known_pairs = set(binaries_inv)  # loop-invariant: every (B, C) pair that has a parent

    for width in range(2, n+1):
        for start in range(n+1-width):
            end = start + width
            for limit in range(start+1, end):
                ## O(n^3)
                left_scores = score[start][limit]
                right_scores = score[limit][end]
                # Only child pairs that appear in some rule are worth combining.
                pairs = set(product(score_left[start][limit], score_right[limit][end])) & known_pairs
                for (B, C) in pairs:
                    for A in binaries_inv[(B, C)]:
                        proba = left_scores[B] * right_scores[C] * pcfg[NT_lhs[A]][NT_rhs[(B, C)]]
                        if proba > score[start][end].get(A, 0.):
                            score[start][end][A] = proba
                            # NOTE(review): left_dict / right_dict are not defined in the
                            # visible part of the notebook; they must exist at module level.
                            if A in left_dict :
                                score_left[start][end].add(A)
                            if A in right_dict :
                                score_right[start][end].add(A)
                            # back_tags is a list of lists of dicts: index it cell by cell
                            # and key it by the symbol, which is what reconstruct_tree reads.
                            back_tags[start][end][A] = (limit, B, C)

    result = reconstruct_tree(back_tags, 0, n, tokens, axioms, score, 'SENT', n)
    normalized_result = unchomsky(result)
    return back_tags, normalized_result
                            
                 
    
    

In [671]:
def _replace_oov_and_parse(sentence, vocab, w2embed, word2id, unigrams, bigrams, id2word_plyglot,
                           axioms, probabilities_lexicon, pos2id, lexical_grammar, pcfg,
                           NT_lhs, NT_rhs, pcfg_grammar_dict, lev_neigh, embed_neigh):
    """Run the OOV replacement and then the CYK parser on one sentence, printing progress."""
    print(sentence)
    print('oov =====')
    start = time.time()
    # The sentence is passed as is (not split); the result is a space-separated string.
    replacement = OOV(sentence , vocab, w2embed,word2id,unigrams, bigrams,id2word_plyglot,lev_neigh,embed_neigh)
    print('replacement sentence',replacement)
    end = time.time()
    print('oov in =====',end-start)
    return CYK2(replacement,axioms, probabilities_lexicon,pos2id,word2id,lexical_grammar,pcfg,NT_lhs,NT_rhs,pcfg_grammar_dict)


def display_results( vocab, w2embed,word2id,unigrams, bigrams,id2word_plyglot,axioms, probabilities_lexicon,pos2id,lexical_grammar,pcfg,NT_lhs,NT_rhs,pcfg_grammar_dict,sentence=None,testset=None,lev_neigh=10,embed_neigh=20):
    """Parse a single sentence and/or every sentence of a test set.

    Args:
        vocab ... pcfg_grammar_dict: model data forwarded to ``OOV`` and ``CYK2``.
        sentence: optional sentence, handled first when given.
        testset: optional iterable of sentences, handled after ``sentence``.
        lev_neigh, embed_neigh: neighbourhood sizes forwarded to ``OOV``.

    Returns:
        List of the ``CYK2`` results, in processing order (empty when neither
        ``sentence`` nor ``testset`` is given).
    """
    shared_args = (vocab, w2embed, word2id, unigrams, bigrams, id2word_plyglot,
                   axioms, probabilities_lexicon, pos2id, lexical_grammar, pcfg,
                   NT_lhs, NT_rhs, pcfg_grammar_dict, lev_neigh, embed_neigh)
    parsed_list = []
    if sentence:
        parsed_list.append(_replace_oov_and_parse(sentence, *shared_args))

    if testset:
        for test_sentence in tqdm(testset):
            parsed_list.append(_replace_oov_and_parse(test_sentence, *shared_args))
    return parsed_list

In [None]:
# End-to-end demo on the first held-out bracketed sentence: OOV replacement followed by CYK parsing.
sentence0 = test_corpus[0]
display_results( vocab, w2embed,word2id,unigrams, bigrams,id2word_plyglot,axioms, probabilities_lexicon,pos2id,lexical_grammar,pcfg,NT_lhs,NT_rhs,pcfg_grammar_dict,sentence=sentence0,testset=None)
# Earlier call signature, kept for reference:
#display_results( axioms, prob_dict,pos2id,word2id,lexical_grammar,pcfg,NT_lhs,NT_rhs,sentence=sentence0,testset=None)


( (SENT (PONCT -) (NP (ADJ 19) (NC janvier) (NC 2004)) (PONCT :) (NP (DET le) (NC juge) (NPP Armand) (NPP Riberolles)) (VN (V clôt)) (NP (DET ses) (NC investigations) (COORD (CC et) (NP (DET l') (NC instruction) (PP (P de) (NP (DET l') (NC affaire) (PP (P+D des) (NP (NC HLM) (PP (P de) (NP (NPP Paris)))))))))) (PONCT .)))
oov =====
replacement sentence - 19 janvier 2004 : le juge Edmond ses investigations et l' instruction de l' affaire des HLM de Paris .
oov in ===== 62.73377203941345


***
***


***
***