In [1]:
import lftk
import polars as pl
import elfen
import spacy
import datasets
import time

ModuleNotFoundError: No module named 'lftk'

In [2]:
ds = datasets.load_dataset("stanfordnlp/sst2")

In [3]:
df_test = pl.from_pandas(ds["test"].to_pandas())[:100]

In [4]:
texts = df_test["sentence"].to_list()

# Extracting all features: Runtime

In [7]:
def benchmark_elfen(model: str,
                    df_test: pl.DataFrame,
                    text_column: str,
                    n_runs: int = 5):
    times = []
    for _ in range(n_runs):  # Run multiple times to get an average
        start_time = time.time()
        extractor = elfen.Extractor(data=df_test,
                                    text_column=text_column,
                                    model=model)
        extractor.extract_features()
        elfen_time = time.time() - start_time
        times.append(elfen_time)

    mean_elfen_time = sum(times) / len(times)
    std_elfen_time = (sum((x - mean_elfen_time) ** 2 for x in times) / len(times)) ** 0.5

    return mean_elfen_time, std_elfen_time

def benchmark_lftk(nlp_model: str,
                   texts: list[str],
                   n_runs: int = 5):
    nlp = spacy.load(nlp_model)

    times = []
    for _ in range(n_runs):  # Run multiple times to get an average
        start_time = time.time()
        nlp = spacy.load("en_core_web_md")
        docs = list(nlp.pipe(texts, batch_size=50))

        LFTK = lftk.Extractor(docs=docs)

        features = lftk.search_features(domain="surface")
        features += lftk.search_features(domain="syntax")
        features += lftk.search_features(domain="discourse")
        features += lftk.search_features(domain="lexico-semantics")
        features = [f["key"] for f in features]

        feats = LFTK.extract(features=features)

        # turning the list of dicts into a polars DataFrame
        feats = pl.DataFrame(feats)
        lftk_time = time.time() - start_time
        times.append(lftk_time)

    mean_lftk_time = sum(times) / len(times)
    std_lftk_time = (sum((x - mean_lftk_time) ** 2 for x in times) / len(times)) ** 0.5

    return mean_lftk_time, std_lftk_time

## en_core_web_sm

In [8]:
mean_elfen_time, std_elfen_time = benchmark_elfen(model="en_core_web_sm",
                                                  df_test=df_test,
                                                  text_column="sentence",
                                                  n_runs=5)

Extracting raw_sequence_length...
Extracting n_tokens...
Extracting n_sentences...
Extracting n_tokens_per_sentence...
Extracting n_characters...
Extracting avg_word_length...
Extracting n_types...
Extracting n_long_words...
Extracting n_lemmas...
Extracting n_per_morph_feature...
Extracting tree_width...
Extracting tree_depth...
Extracting tree_branching...
Extracting n_noun_chunks...
Extracting n_per_dependency_type...
Extracting sentiment_score...
Extracting n_negative_sentiment...
Extracting n_positive_sentiment...
Extracting avg_valence...
Extracting n_low_valence...
Extracting n_high_valence...
Extracting avg_arousal...
Extracting n_low_arousal...
Extracting n_high_arousal...
Extracting avg_dominance...
Extracting n_low_dominance...
Extracting n_high_dominance...
Extracting avg_emotion_intensity...
Extracting n_low_intensity...
Extracting n_high_intensity...
Extracting compressibility...
Extracting entropy...
Extracting lemma_token_ratio...
Extracting ttr...
Extracting rttr...
Ex

In [9]:
mean_elfen_time, std_elfen_time

(11.425279378890991, 0.5091954852200966)

In [11]:
mean_lftk_time, std_lftk_time = benchmark_lftk(nlp_model="en_core_web_sm",
                                                texts=texts,
                                                n_runs=5)
mean_lftk_time, std_lftk_time

(15.45682406425476, 0.021962778859120097)

## en_core_web_md

In [12]:
mean_elfen_time, std_elfen_time = benchmark_elfen(model="en_core_web_md",
                                                  df_test=df_test,
                                                  text_column="sentence",
                                                  n_runs=5)
mean_lftk_time, std_lftk_time = benchmark_lftk(nlp_model="en_core_web_md",
                                                texts=texts,
                                                n_runs=5)
print(f"Elfen (en_core_web_md): {mean_elfen_time:.2f} ± {std_elfen_time:.2f} seconds")
print(f"LFTK (en_core_web_md): {mean_lftk_time:.2f} ± {std_lftk_time:.2f} seconds")

Extracting raw_sequence_length...
Extracting n_tokens...
Extracting n_sentences...
Extracting n_tokens_per_sentence...
Extracting n_characters...
Extracting avg_word_length...
Extracting n_types...
Extracting n_long_words...
Extracting n_lemmas...
Extracting n_per_morph_feature...
Extracting tree_width...
Extracting tree_depth...
Extracting tree_branching...
Extracting n_noun_chunks...
Extracting n_per_dependency_type...
Extracting sentiment_score...
Extracting n_negative_sentiment...
Extracting n_positive_sentiment...
Extracting avg_valence...
Extracting n_low_valence...
Extracting n_high_valence...
Extracting avg_arousal...
Extracting n_low_arousal...
Extracting n_high_arousal...
Extracting avg_dominance...
Extracting n_low_dominance...
Extracting n_high_dominance...
Extracting avg_emotion_intensity...
Extracting n_low_intensity...
Extracting n_high_intensity...
Extracting compressibility...
Extracting entropy...
Extracting lemma_token_ratio...
Extracting ttr...
Extracting rttr...
Ex

## en_core_web_lg

In [13]:
mean_elfen_time, std_elfen_time = benchmark_elfen(model="en_core_web_lg",
                                                  df_test=df_test,
                                                  text_column="sentence",
                                                  n_runs=5)
mean_lftk_time, std_lftk_time = benchmark_lftk(nlp_model="en_core_web_lg",
                                                texts=texts,
                                                n_runs=5)
print(f"Elfen (en_core_web_lg): {mean_elfen_time:.2f} ± {std_elfen_time:.2f} seconds")
print(f"LFTK (en_core_web_lg): {mean_lftk_time:.2f} ± {std_lftk_time:.2f} seconds")

Extracting raw_sequence_length...
Extracting n_tokens...
Extracting n_sentences...
Extracting n_tokens_per_sentence...
Extracting n_characters...
Extracting avg_word_length...
Extracting n_types...
Extracting n_long_words...
Extracting n_lemmas...
Extracting n_per_morph_feature...
Extracting tree_width...
Extracting tree_depth...
Extracting tree_branching...
Extracting n_noun_chunks...
Extracting n_per_dependency_type...
Extracting sentiment_score...
Extracting n_negative_sentiment...
Extracting n_positive_sentiment...
Extracting avg_valence...
Extracting n_low_valence...
Extracting n_high_valence...
Extracting avg_arousal...
Extracting n_low_arousal...
Extracting n_high_arousal...
Extracting avg_dominance...
Extracting n_low_dominance...
Extracting n_high_dominance...
Extracting avg_emotion_intensity...
Extracting n_low_intensity...
Extracting n_high_intensity...
Extracting compressibility...
Extracting entropy...
Extracting lemma_token_ratio...
Extracting ttr...
Extracting rttr...
Ex

## en_core_web_trf

In [14]:
mean_elfen_time, std_elfen_time = benchmark_elfen(model="en_core_web_trf",
                                                  df_test=df_test,
                                                  text_column="sentence",
                                                  n_runs=5)
mean_lftk_time, std_lftk_time = benchmark_lftk(nlp_model="en_core_web_trf",
                                                texts=texts,
                                                n_runs=5)
print(f"Elfen (en_core_web_trf): {mean_elfen_time:.2f} ± {std_elfen_time:.2f} seconds")
print(f"LFTK (en_core_web_trf): {mean_lftk_time:.2f} ± {std_lftk_time:.2f} seconds")

Extracting raw_sequence_length...
Extracting n_tokens...
Extracting n_sentences...
Extracting n_tokens_per_sentence...
Extracting n_characters...
Extracting avg_word_length...
Extracting n_types...
Extracting n_long_words...
Extracting n_lemmas...
Extracting n_per_morph_feature...
Extracting tree_width...
Extracting tree_depth...
Extracting tree_branching...
Extracting n_noun_chunks...
Extracting n_per_dependency_type...
Extracting sentiment_score...
Extracting n_negative_sentiment...
Extracting n_positive_sentiment...
Extracting avg_valence...
Extracting n_low_valence...
Extracting n_high_valence...
Extracting avg_arousal...
Extracting n_low_arousal...
Extracting n_high_arousal...
Extracting avg_dominance...
Extracting n_low_dominance...
Extracting n_high_dominance...
Extracting avg_emotion_intensity...
Extracting n_low_intensity...
Extracting n_high_intensity...
Extracting compressibility...
Extracting entropy...
Extracting lemma_token_ratio...
Extracting ttr...
Extracting rttr...
Ex