In [None]:
## import
from mwe_discov_eval.utils import ConllIterator
from mwe_discov_eval.databases import NgramCounter, NgramDb, EmbeddingsDb
from mwe_discov_eval.measures.am_extended import pmi, dice
from mwe_discov_eval.measures.lexical_predictability_ratio import *
from mwe_discov_eval.measures import AMExtendedCalc, FairDispPointCalc, CValueCalc, SynEntCalc, CompCalc
from mwe_discov_eval.measures.am_bigrams import pmi as pmi_1
from mwe_discov_eval.measures.am_bigrams import dice as dice_1

In [None]:
## Build a ConllIterator: an object that walks the sentences of a CoNLL file.

corpus = "samples/corpora/sample_corpus.conllu"

# Feature name -> column position inside each CoNLL line,
# i.e. {<feature, str>: <position, int>}.
idx_dict = {
    "id": 0,
    "surface": 1,
    "lemma": 2,
    "pos": 3,
}
sentences = ConllIterator(corpus, idx_dict, codec='utf8')

# set_itermode changes how the iterator behaves; here it is put in 'sent'
# mode with the surface/lemma/pos features selected.
# NOTE(review): the exact effect of ignore_compound=True is defined by
# ConllIterator and is not visible in this notebook — confirm.
sentences.set_itermode(
    'sent',
    keys=["surface", "lemma", "pos"],
    ignore_compound=True,
)

In [None]:
## Prints first 100 elements returned by the iterator
# Uses the `sentences` iterator configured in the previous cell.
# NOTE(review): no count is passed to sample(), so the "100" above must be
# its default — confirm against ConllIterator.sample.

sentences.sample()

In [None]:
## Count single words (length 1) and every n-gram of 2 to 5 words in the corpus.

# 'samples/counters/sample_counter' is the location passed to NgramCounter;
# the following cells reload the counter from this same path.
counter = NgramCounter('samples/counters/sample_counter')

# list(range(1, 6)) == [1, 2, 3, 4, 5]: the n-gram lengths to count.
counter.count_ngrams(sentences, list(range(1, 6)))

In [None]:
## Aggregate counts according to the lemma+pos
# The counter is reloaded from the path used in the counting cell, so this
# cell does not rely on the in-memory `counter` left by that cell.
# NOTE(review): presumably count_ngrams has already written its counts to
# that path — confirm.

counter = NgramCounter.load('samples/counters/sample_counter')
counter.aggregate_by('lemma_pos')

In [None]:
## Creates an ngram_db from the counter to compute MWE discovery metrics
# The counter is loaded from disk again rather than reused from memory.
# NOTE(review): this assumes the 'lemma_pos' aggregation from the previous
# cell was persisted to the counter's path — confirm that aggregate_by saves.
# 'samples/databases/sample_db' is the location handed to from_NgramCounter;
# the metric cells below load the database from that same path.

counter = NgramCounter.load('samples/counters/sample_counter')
ngram_db = NgramDb.from_NgramCounter(counter, 'samples/databases/sample_db')

In [None]:
## Computes association metrics (AM)
# `pmi` and `dice` are the versions imported from measures.am_extended.
# The database is reloaded from disk so the cell can run on its own.
# NOTE(review): 'lemma_pos_counts' and 'lemma_pos' presumably name the counts
# to read and the aggregation key created earlier — confirm against
# AMExtendedCalc.compute.

ngram_db = NgramDb.load('samples/databases/sample_db')
calc = AMExtendedCalc([pmi, dice])
calc.compute(ngram_db, 'lemma_pos_counts', 'lemma_pos')

In [None]:
## Runs FairDispPointCalc on the n-gram database
# `pmi_1` and `dice_1` are the measures.am_bigrams versions of pmi and dice
# (aliased in the import cell), not the am_extended ones used by the
# AMExtendedCalc cell.
ngram_db = NgramDb.load('samples/databases/sample_db')
calc = FairDispPointCalc([pmi_1, dice_1])
calc.compute(ngram_db, 'lemma_pos_counts', 'lemma_pos')

In [None]:
## Runs CValueCalc on the n-gram database
# CValueCalc is constructed with no arguments and is given the same
# 'lemma_pos_counts' / 'lemma_pos' arguments as the other calculators.
# NOTE(review): presumably this is the C-value measure — confirm in
# mwe_discov_eval.measures.
ngram_db = NgramDb.load('samples/databases/sample_db')
calc = CValueCalc()
calc.compute(ngram_db, 'lemma_pos_counts', 'lemma_pos')

In [None]:
## Runs SynEntCalc on the n-gram database
# NOTE(review): unlike the other calculators, compute() receives a fourth
# positional argument, 'ngram_counts'; its meaning is defined by SynEntCalc
# and is not visible in this notebook — confirm.
ngram_db = NgramDb.load('samples/databases/sample_db')
calc = SynEntCalc()
calc.compute(ngram_db, 'lemma_pos_counts', 'lemma_pos', 'ngram_counts')

In [None]:
## Runs CompCalc, which needs an embeddings database in addition to the n-gram database
# The embeddings are loaded from "samples/databases/alacarte_emb"; no cell in
# this notebook creates that database, so it must already exist on disk.
# NOTE(review): `embedings_db` is a misspelling of "embeddings"; the name is
# left unchanged in case other cells refer to it.
ngram_db = NgramDb.load('samples/databases/sample_db')
embedings_db = EmbeddingsDb.load("samples/databases/alacarte_emb")
calc = CompCalc()
calc.compute(ngram_db, embedings_db, 'lemma_pos_counts', 'lemma_pos')

In [None]:
## Runs LPRCalc on the n-gram database
# NOTE(review): LPRCalc is not imported by name; presumably it comes from the
# `from ...lexical_predictability_ratio import *` line in the import cell —
# confirm, and prefer an explicit import.
# NOTE(review): the meaning of save_every=-1 is defined by LPRCalc.compute and
# is not visible here — confirm.
ngram_db = NgramDb.load('samples/databases/sample_db')
calc = LPRCalc()
calc.compute(ngram_db, 'lemma_pos_counts', 'lemma_pos', save_every=-1)

In [None]:
## Runs PredCalc on the n-gram database
# NOTE(review): PredCalc is not imported by name; presumably it comes from the
# `from ...lexical_predictability_ratio import *` line in the import cell —
# confirm, and prefer an explicit import.
# NOTE(review): the meaning of save_every=-1 is defined by PredCalc.compute and
# is not visible here — confirm.
ngram_db = NgramDb.load('samples/databases/sample_db')
calc = PredCalc()
calc.compute(ngram_db, 'lemma_pos_counts', 'lemma_pos', save_every=-1)