In [1]:
import numpy as np
import scipy.stats
from typing import List
from stemmabench.config_parser import ProbabilisticConfig

In [2]:
def repeat_character(n:int=10, char:str="word", sep:str=" "):
    """
    Build a placeholder text made of numbered tokens.

    Tokens are `char` followed by a running number starting at 1
    (e.g. "word1", "word2", ...), glued together with `sep`.

    Args:
        n (int, optional): Number of tokens to produce. Default is 10.
        char (str, optional): Prefix shared by every token. Default is "word".
        sep (str, optional): Separator placed between tokens. Default is a space.

    Returns:
        str: The `n` numbered tokens joined by `sep` (empty string when `n`
            is zero or negative).
    """
    tokens = []
    for offset in range(n):
        # Numbering is 1-based, hence the +1 on the 0-based loop counter.
        tokens.append("{}{}".format(char, offset + 1))
    return sep.join(tokens)

# 25-token placeholder text ("word1" ... "word25") reused by the demos below.
DEMO_TEXT = repeat_character(n=25)
print(DEMO_TEXT)

word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12 word13 word14 word15 word16 word17 word18 word19 word20 word21 word22 word23 word24 word25


In [4]:
def fragment(text: str,
             max_frag_rate: float,
             # Quoted so the annotation is not evaluated when the function is
             # defined; the value is only ever used as a mapping below.
             frag_loc_dist: "ProbabilisticConfig"={
                 "law": "Binomial",
                 "rate": 0.5
             },
             sep: str=" ",
             random_state=None
) -> str:
    """
    Fragment a given text by randomly removing words.

    A removal rate is first drawn uniformly between 0 and `max_frag_rate`;
    multiplied by the word count and truncated, it gives the number of words
    to drop. The positions to drop are then sampled without replacement,
    weighted by the chosen law's probability mass function evaluated at the
    word indices 0..n_words-1 and renormalised to sum to 1.

    Args:
        text (str): The input text to be fragmented.
        max_frag_rate (float): The maximum allowable word removal rate,
            between 0 and 1 inclusive.
        frag_loc_dist (dict): Mapping specifying the distribution of fragment
            locations. It is read by key and never modified.
            "law" (str): Distribution type ("Binomial", "Uniform", "Poisson").
            "rate" (float): Distribution parameter (success probability for
                "Binomial", mean for "Poisson"; not read for "Uniform").
        sep (str, optional): The separator used to split the input text into
            words and to join the remaining ones. Default is a space.
        random_state (int or None, optional): Seed for random number
            generation. Default is None.

    Returns:
        str: A fragmented text with words removed.

    Raises:
        ValueError: If `max_frag_rate` is outside [0, 1], if the law is not
            supported, or if the location weights cannot be normalised
            (their sum is zero or not finite, e.g. "Binomial" with rate 1).
    """
    # Check if the fragmentation rate is valid.
    if not 0 <= max_frag_rate <= 1:
        raise ValueError("Maximum fragmentation rate must be between 0 and 1.")

    # Initialize a random number generator.
    rng = np.random.default_rng(random_state)

    # Split the text into a list of words and get total word count.
    words = text.split(sep)
    n_words = len(words)
    indices = np.arange(n_words)

    # Evaluate the chosen law at every word index to get unnormalised weights.
    law = frag_loc_dist["law"]
    if law == "Binomial":
        weights = scipy.stats.binom.pmf(k=indices, n=n_words,
                                        p=frag_loc_dist["rate"])
    elif law == "Uniform":
        weights = np.full(shape=n_words, fill_value=1 / n_words)
    elif law == "Poisson":
        weights = scipy.stats.poisson.pmf(k=indices,
                                          mu=frag_loc_dist["rate"])
    else:
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the message.
        raise ValueError("Only 'Binomial', 'Uniform', and 'Poisson' laws are "
                         "supported.")

    # The indices cover only part of the law's support, so renormalise.
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError("Fragment location weights cannot be normalised: "
                         "their sum is zero or not finite.")
    locations_dist = weights / total

    # Calculate the number of fragment locations based on the fragment rate.
    frag_rate = rng.uniform(low=0, high=max_frag_rate)
    n_frag_loc = int(frag_rate * n_words)

    # Sampling without replacement can only return positions whose weight is
    # strictly positive. Tail weights may underflow to exactly 0 on long
    # texts, so cap the request to what can actually be drawn.
    n_drawable = int(np.count_nonzero(locations_dist > 0))
    n_frag_loc = min(n_frag_loc, n_drawable)

    # Choose 'n_frag_loc' fragment locations according to 'locations_dist'.
    frag_locations = rng.choice(indices, size=n_frag_loc, replace=False,
                                p=locations_dist)

    # Remove words at the selected fragment locations.
    removed = set(frag_locations.tolist())
    kept_words = [word for position, word in enumerate(words)
                  if position not in removed]

    # Join the remaining words to form the fragmented text.
    return sep.join(kept_words)


# Smoke test: sweep max_frag_rate over valid values plus two out-of-range
# ones (-1 and 2), using fragment()'s default location law.
# No random_state is passed, so the surviving words differ from run to run.
rates = [-1, 0, 0.25, 0.5, 0.75, 1, 2]
for rate in rates:
    try:
        res = fragment(DEMO_TEXT, max_frag_rate=rate)
    except ValueError:
        # Only the expected validation failure is reported; any other
        # exception is a real bug and is allowed to surface.
        print(rate, "Error")
    else:
        # Counts space-separated tokens.
        # NOTE(review): the label below says "sentences" but these are words.
        length = len(res.split(" "))
        print(f"{rate} - {length} sentences - {res}")

-1 Error
0 - 25 sentences - word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12 word13 word14 word15 word16 word17 word18 word19 word20 word21 word22 word23 word24 word25
0.25 - 20 sentences - word1 word2 word3 word4 word5 word6 word7 word8 word10 word11 word14 word16 word17 word19 word20 word21 word22 word23 word24 word25
0.5 - 17 sentences - word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word18 word21 word22 word23 word24 word25
0.75 - 24 sentences - word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word13 word14 word15 word16 word17 word18 word19 word20 word21 word22 word23 word24 word25
1 - 8 sentences - word1 word2 word3 word4 word5 word23 word24 word25
2 Error


In [30]:
from stemmabench.textual_units.word import Word
from stemmabench.textual_units.text import Text
# Sample input for the Text wrapper. The f-string interpolates the literal
# None, so the resulting string contains the word "None".
# NOTE(review): looks like a leftover placeholder — confirm it is intentional.
textstr = f"I am {None} going to run. Now!"
# Wrap the raw string in stemmabench's Text unit (constructor behavior is not
# visible in this notebook).
text = Text(textstr)

In [38]:
import sys
from stemmabench.stemma_generator import Stemma
from stemmabench.config_parser import StemmaBenchConfig
from loguru import logger
# Set logging level to INFO.
# logger.remove() is called without a handler id; per loguru's API this should
# drop every handler registered so far, leaving the sink added below as the
# only one — confirm against the loguru documentation.
logger.remove()
# logger.add() is the cell's last expression, so its return value (the integer
# shown in the cell output) is echoed.
logger.add(sys.stderr, level="INFO")

1

In [41]:
# Build the benchmark configuration by unpacking a nested dict into
# StemmaBenchConfig. The key names and accepted values are defined by
# stemmabench's schema, which is not visible in this notebook; the notes
# below are assumptions to be confirmed against the library.
config = StemmaBenchConfig(**{
    # Language code of the text being processed (presumably ISO 639-3).
    "meta": {
      "language": "eng"  
    },

    # Shape of the generated tree: presumably `depth` levels, with a number
    # of children per node drawn between `min` and `max` — TODO confirm.
    "stemma": {
        "depth": 3,
        "width": {
            "law": "Uniform",
            "min": 2,
            "max": 4
        },
        # Presumably the probability that a copy gets fragmented — TODO confirm.
        "fragmentation_proba": 1
    },

    # Variant operations grouped by textual level. Each one carries a
    # probabilistic law, a rate, and operation-specific args.
    # NOTE(review): "Bernouilli" and "mispell" are spelled this way throughout;
    # presumably they match literals expected by the library — confirm before
    # "correcting" them.
    "variants": {
        "sentences": {
            "duplicate": {
                "args": {
                    "nbr_words": 1
                },
                "law": "Bernouilli",
                "rate": 0.5
            }
        },
        "words": {
            "synonym": {
                "law": "Bernouilli",
                "rate": 0.05,
                "args": {}
            },
            "mispell": {
                "law": "Bernouilli",
                "rate": 0.001,
                "args": {}
            },
            "omit": {
                "law": "Bernouilli",
                "rate": 0.001,
                "args": {}
            }
        },
        "text": {
            # NOTE(review): the fragment() prototype defined earlier in this
            # notebook accepts only "Binomial", "Uniform" and "Poisson" as
            # location laws; confirm the library accepts "Bernouilli" here.
            "fragmentation": {
                "max_rate": 1,
                "distribution": {
                    "law": "Bernouilli",
                    "rate": 0.5
                }
            }
        }
    }
})

In [42]:
# Instantiate a Stemma object from the demo text and the configuration
# built in the previous cell.
stemma = Stemma(original_text=DEMO_TEXT, config=config)

# Generate a tradition. The call is the cell's last expression, so its
# return value (rendered as a Tree in the cell output) is displayed.
stemma.generate()

Tree({
  "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12 word13 word14 word15 word16 word17 word18 word19 word20 word21 word22 word23 word24 word25": {
    "Word1 word2 word3 word4 word5 word6  word8 word9 word10 word11 word12 word13 word14 word15 word16 word17 word17 word18 word19 word20 word21 word22 word23 word24 word25.": {
      "Word1 word2 word3 word4 word5 word6 word8 word9 word10 word11 word12 word13 word14 word15 word16 word17 word17 word18 word19 word20 word21 word22 word23 word24 word25.": {
        "Word1 word2 word3 word4 word5 word6 word8 word9 word10 word11 word12 word13 word14 word15 word16 word17 word17 word18 word19 word20 word21 word22 word23 word24 word25.": [
          "Word1 word1 word2 word3 word4 word5 word6 word8 word9 word10 word11 word12 word13 word14 word15 word16 word17 word17 word18 word19 word20 word21 word22 word23 word24 word25.",
          "Word1 word2 word3 word4 word5 word6 word8 word9 word10 word11 word12 word13 word14 w

In [None]:
# Access the generated texts.
stemma.texts_lookup

In [None]:
# Access the edges of the stemma.
stemma.edges