In [1]:
import json, warnings

from typing import Union

import numpy as np
import pandas as pd
import scipy

from syntaxcomp import TextComplexity

In [4]:
# Read the JSON file into `data_dict_conll`; later cells index it as
# data_dict_conll[source][distractor_id].
with open("../data/data_dict_conll_spacy.json", mode="r", encoding="utf8") as conll_file:
    data_dict_conll = json.loads(conll_file.read())

In [3]:
# Read the JSON file into `data_dict`; later cells iterate it as
# source -> list of distractor sets -> distractor texts.
with open("../data/data_dict.json", mode="r", encoding="utf8") as data_file:
    data_dict = json.loads(data_file.read())

In [None]:
# Sanity check: build a TextComplexity object from the first "BartDG" entry of data_dict_conll.
comp = TextComplexity(data_dict_conll["BartDG"][0])

In [5]:
# Print the value returned by info(); print_=False presumably stops the library
# from printing the report itself (TODO confirm against syntaxcomp).
print(comp.info(print_=False))

{'Number of Sentences': 1, 'Number of Words': 5, 'Number of Clauses': 1, 'Number of T-Units': 1, 'Mean Sentence Length': 5.0, 'Mean Clause Length': 5.0, 'Mean T-Unit Length': 5.0, 'Mean Number of Clauses per Sentence': 1.0, 'Mean Number of Clauses per T-Unit': 1.0, 'Mean Tree Depth': 3, 'Median Tree Depth': 3, 'Minimum Tree Depth': 3, 'Maximum Tree Depth': 3, 'Mean Dependency Distance': 1.8, 'Node-to-Terminal-Node Ratio': 1.67, 'Average Levenshtein Distance between POS': 0, 'Average Levenshtein Distance between deprel': 0, 'Average NP Length': 1.5, 'Complex NP Ratio': 0.5, 'Number of Combined Clauses': 0, 'Number of Coordinate Clauses': 0, 'Number of Subordinate Clauses': 0, 'Coordinate to Combined Clause Ratio': 0, 'Subordinate to Combined Clause Ratio': 0, 'Coordinate to Subordinate Clause Ratio': 0, 'Coordinate Clause to Sentence Ratio': 0.0, 'Subordinate Clause to Sentence Ratio': 0.0, 'ROOT_ratio': 1.0, 'root_ratio': 0.0, 'acl_ratio': 0.0, 'acl:relcl_ratio': 0.0, 'advcl_ratio': 0.

In [6]:
def analyze(s: str) -> dict[str, Union[float, int]]:
    """Return the metrics reported by ``TextComplexity`` for ``s``.

    Args:
        s: Input handed unchanged to the ``TextComplexity`` constructor
            (callers in this notebook pass entries of ``data_dict_conll``).

    Returns:
        Whatever ``TextComplexity(s).info(print_=False)`` returns; in this
        notebook that is a dict mapping metric names to numbers.
    """
    return TextComplexity(s).info(print_=False)

In [7]:
# For every source, collect one record per distractor: its ids, its text, its
# parse and, when analyze() succeeds, the metrics analyze() returns.
data_with_syn_scores = {key: [] for key in data_dict}

for key, distractor_sets in data_dict.items():
    # Running index over all distractors of this source; it addresses the flat
    # per-source list in data_dict_conll.
    distractor_id = 0
    for distractor_set_id, distractor_set in enumerate(distractor_sets):
        for d_id_inset, distractor in enumerate(distractor_set):
            distractor_parse = data_dict_conll[key][distractor_id]
            try:
                item = {
                    "distractor_set_id": distractor_set_id,
                    "distractor_id": distractor_id,
                    "distractor_text": distractor,
                    "distractor_parse": distractor_parse,
                }
                if distractor_parse:
                    try:
                        # A failed analysis only warns; the record is kept without metrics.
                        item.update(analyze(distractor_parse))
                    except Exception as exc:
                        warnings.warn(f"{key}, {distractor_set_id}, {d_id_inset}, {distractor}\n{str(exc)}")
                data_with_syn_scores[key].append(item)
            except Exception as exc:
                # Show which distractor broke the loop, then re-raise.
                print(key, distractor_set_id, d_id_inset, distractor_id, distractor)
                raise exc
            distractor_id += 1

The annotation is empty!
mean requires at least one data point
mean requires at least one data point
mean requires at least one data point
mean requires at least one data point
mean requires at least one data point
mean requires at least one data point
mean requires at least one data point
mean requires at least one data point
mean requires at least one data point


In [8]:
## Dump data to different sheets:
# Using the writer as a context manager guarantees the workbook is closed
# (and therefore saved) even if writing one of the sheets raises.
with pd.ExcelWriter(
    "../data/syntaxcomp_metrics_spacy.xlsx",
    engine="xlsxwriter"
) as writer:
    # One sheet per source, named after the source key.
    for key in data_with_syn_scores:
        df = pd.DataFrame(data_with_syn_scores[key])
        df.to_excel(writer, sheet_name=key)

In [None]:
## Dump data to one sheet
# Flatten the per-source records into a single list, tagging each record with
# the source it came from, then build one DataFrame out of it.
flat_records = [
    {"Source": source, **record}
    for source, records in data_with_syn_scores.items()
    for record in records
]

data_with_scores_ = pd.DataFrame(flat_records)
data_with_scores_.head()

In [9]:
## Compute averaged table
# Columns that identify a distractor rather than measure it; excluded from the means.
id_columns = [
    "distractor_set_id",
    "distractor_id",
    "distractor_text",
    "distractor_parse",
]
averaged_table = None

for key in data_with_syn_scores:
    df = pd.DataFrame(data_with_syn_scores[key])

    # Column-wise mean of every metric (NaN rows, i.e. failed analyses, are skipped).
    row = df.drop(id_columns, axis="columns").mean()
    if averaged_table is None:
        # The first source fixes the column layout of the result table.
        averaged_table = pd.DataFrame(
            columns=[col for col in row.keys()] + ["Distractors/Question"]
        )
    # Distractors with metrics, divided by the number of distractor sets of this
    # source (derived from the data instead of a hard-coded 55).
    row["Distractors/Question"] = df["Number of Clauses"].notna().sum() / len(data_dict[key])
    averaged_table.loc[key] = row

In [10]:
# Transposed view: one row per metric, one column per source.
averaged_table.T

Unnamed: 0,BartDG,BartDG_PM,BartDG_ANPM,MuSeRC_GPT3,MuSeRC_T5,RuRace_GPT3,RuRace_T5,Deepseek,ChatGPT4o,true_distractors
Number of Sentences,1.036145,1.060241,1.036145,1.006494,1.011765,1.0,1.0,1.0,1.0,1.006024
Number of Words,7.927711,9.138554,8.753012,5.324675,7.447059,9.641026,8.69375,13.036364,9.460606,12.060241
Number of Clauses,1.46988,1.837349,1.891566,1.525974,1.682353,1.801282,1.85625,2.254545,1.745455,2.192771
Number of T-Units,1.138554,1.144578,1.072289,1.136364,1.117647,1.102564,1.14375,1.4,1.169697,1.319277
Mean Sentence Length,7.798193,8.750482,8.65259,5.282468,7.311765,9.641026,8.69375,13.036364,9.460606,11.996988
Mean Clause Length,5.86753,5.630964,5.274036,3.668896,4.814824,6.195897,5.273,6.669636,6.128303,6.478133
Mean T-Unit Length,7.245,8.300663,8.438735,4.781364,6.758824,9.100449,7.971875,10.387879,8.469697,9.642108
Mean Number of Clauses per Sentence,1.430723,1.754518,1.855422,1.512987,1.652941,1.801282,1.85625,2.254545,1.745455,2.180723
Mean Number of Clauses per T-Unit,1.303193,1.647048,1.801205,1.361429,1.505882,1.658141,1.671875,1.729273,1.536364,1.697771
Mean Tree Depth,3.719398,3.960843,4.068253,3.081169,3.735294,4.121795,3.81875,4.89697,4.266667,4.674699


In [11]:
# Show how many distractor sets each source contains.
for source in data_dict:
    print(source, len(data_dict[source]))

BartDG 55
BartDG_PM 55
BartDG_ANPM 55
MuSeRC_GPT3 55
MuSeRC_T5 55
RuRace_GPT3 55
RuRace_T5 55
Deepseek 55
ChatGPT4o 55
true_distractors 55


In [12]:
# Save the transposed averages (metrics as rows), writing floats with two decimals.
averaged_table.T.to_excel(
    "../data/avg_syntaxcomp_metrics_spacy.xlsx", float_format="%.2f"
)

Let's build 95% confidence intervals for the mean using Student's t-distribution:

In [17]:
## Compute averaged table with 95% t-distribution confidence intervals
ci_rows = []

for key in data_with_syn_scores:
    df = pd.DataFrame(data_with_syn_scores[key])

    # Keep only the metric columns.
    df = df.drop(
        [
            "distractor_set_id",
            "distractor_id",
            "distractor_text",
            "distractor_parse"
        ], axis="columns"
    )
    row = dict()
    row["source"] = key

    # `col`, not `key`: the inner loop must not clobber the outer loop variable.
    for col in df.columns:
        # Distractors whose analysis failed have NaN metrics. Drop them first so
        # that the sample size matches the data the mean and SD are computed from
        # (len() of the raw column would also count the NaN rows).
        d = df[col].dropna()
        n = len(d)  # Size of the non-missing sample
        m = d.mean()  # Mean
        if n > 1:
            s = d.std(ddof=1)  # Sample SD
            t = scipy.stats.t.ppf(0.975, df=n - 1)  # Two-sided 95% t-value
            e = t * (s / np.sqrt(n))  # Margin
        else:
            # Fewer than two observations: no interval can be estimated.
            e = float("nan")

        row[col] = f"{m:.2f} ± {e:.2f}"

    # Distractors with metrics per distractor set of this source.
    row["D/Q"] = df["Number of Clauses"].notna().sum() / len(data_dict[key])

    ci_rows.append(row)

averaged_table_wconf = pd.DataFrame(ci_rows).set_index("source")

In [18]:
# Transpose so that metrics are rows and sources are columns, then display it.
table4 = averaged_table_wconf.T
table4

source,BartDG,BartDG_PM,BartDG_ANPM,MuSeRC_GPT3,MuSeRC_T5,RuRace_GPT3,RuRace_T5,Deepseek,ChatGPT4o,true_distractors
Number of Sentences,1.04 ± 0.04,1.06 ± 0.05,1.04 ± 0.04,1.01 ± 0.01,1.01 ± 0.02,1.00 ± 0.00,1.00 ± 0.00,1.00 ± 0.00,1.00 ± 0.00,1.01 ± 0.01
Number of Words,7.93 ± 0.46,9.14 ± 0.56,8.75 ± 0.49,5.32 ± 0.49,7.45 ± 0.89,9.64 ± 0.51,8.69 ± 0.45,13.04 ± 0.44,9.46 ± 0.37,12.06 ± 0.76
Number of Clauses,1.47 ± 0.10,1.84 ± 0.14,1.89 ± 0.15,1.53 ± 0.13,1.68 ± 0.19,1.80 ± 0.15,1.86 ± 0.12,2.25 ± 0.14,1.75 ± 0.12,2.19 ± 0.19
Number of T-Units,1.14 ± 0.07,1.14 ± 0.07,1.07 ± 0.05,1.14 ± 0.06,1.12 ± 0.07,1.10 ± 0.05,1.14 ± 0.05,1.40 ± 0.08,1.17 ± 0.06,1.32 ± 0.09
Mean Sentence Length,7.80 ± 0.46,8.75 ± 0.48,8.65 ± 0.49,5.28 ± 0.48,7.31 ± 0.82,9.64 ± 0.51,8.69 ± 0.45,13.04 ± 0.44,9.46 ± 0.37,12.00 ± 0.75
Mean Clause Length,5.87 ± 0.33,5.63 ± 0.34,5.27 ± 0.31,3.67 ± 0.27,4.81 ± 0.55,6.20 ± 0.40,5.27 ± 0.37,6.67 ± 0.44,6.13 ± 0.34,6.48 ± 0.44
Mean T-Unit Length,7.25 ± 0.40,8.30 ± 0.45,8.44 ± 0.48,4.78 ± 0.42,6.76 ± 0.76,9.10 ± 0.52,7.97 ± 0.45,10.39 ± 0.59,8.47 ± 0.37,9.64 ± 0.56
Mean Number of Clauses per Sentence,1.43 ± 0.10,1.75 ± 0.13,1.86 ± 0.15,1.51 ± 0.12,1.65 ± 0.18,1.80 ± 0.15,1.86 ± 0.12,2.25 ± 0.14,1.75 ± 0.12,2.18 ± 0.19
Mean Number of Clauses per T-Unit,1.30 ± 0.07,1.65 ± 0.12,1.80 ± 0.14,1.36 ± 0.10,1.51 ± 0.15,1.66 ± 0.13,1.67 ± 0.11,1.73 ± 0.12,1.54 ± 0.10,1.70 ± 0.13
Mean Tree Depth,3.72 ± 0.13,3.96 ± 0.15,4.07 ± 0.17,3.08 ± 0.16,3.74 ± 0.25,4.12 ± 0.16,3.82 ± 0.14,4.90 ± 0.17,4.27 ± 0.13,4.67 ± 0.19


In [19]:
# The same table is written to two files.
table4.to_excel("../data/avg_syntaxcomp_metrics_spacy_CI.xlsx")
table4.to_excel("../data/table4.xlsx")

And using the bootstrap technique:

In [20]:
## Compute averaged table with bootstrap confidence intervals
bootstrap_rows = []

for key in data_with_syn_scores:
    df = pd.DataFrame(data_with_syn_scores[key])

    # Keep only the metric columns.
    df = df.drop(
        [
            "distractor_set_id",
            "distractor_id",
            "distractor_text",
            "distractor_parse"
        ], axis="columns"
    )
    row = dict()
    row["source"] = key

    # `col`, not `key`: the inner loop must not clobber the outer loop variable.
    for col in df.columns:
        # Missing metrics (failed analyses) are excluded from both the bootstrap
        # sample and the reported mean.
        values = df[col].dropna()
        # Fixed rng seed so that the intervals are reproducible across runs.
        bootstrap_result = scipy.stats.bootstrap((values, ), np.mean, method="basic", rng=42)
        m = values.mean()
        low = bootstrap_result.confidence_interval.low
        high = bootstrap_result.confidence_interval.high

        # Rendered as "low < mean < high" (the previous "low < mean > high" read
        # as an inverted inequality).
        row[col] = f"{low:.2f} < {m:.2f} < {high:.2f}"

    bootstrap_rows.append(row)

averaged_table_wconf_ = pd.DataFrame(bootstrap_rows).set_index("source")

In [21]:
# Transposed view: one row per metric, one column per source.
averaged_table_wconf_.T

source,BartDG,BartDG_PM,BartDG_ANPM,MuSeRC_GPT3,MuSeRC_T5,RuRace_GPT3,RuRace_T5,Deepseek,ChatGPT4o,true_distractors
Number of Sentences,0.99 < 1.04 > 1.07,1.01 < 1.06 > 1.10,0.99 < 1.04 > 1.07,0.99 < 1.01 > 1.01,0.99 < 1.01 > 1.02,1.00 < 1.00 > 1.00,1.00 < 1.00 > 1.00,1.00 < 1.00 > 1.00,1.00 < 1.00 > 1.00,0.99 < 1.01 > 1.01
Number of Words,7.46 < 7.93 > 8.39,8.57 < 9.14 > 9.69,8.27 < 8.75 > 9.23,4.82 < 5.32 > 5.79,6.54 < 7.45 > 8.31,9.12 < 9.64 > 10.13,8.24 < 8.69 > 9.14,12.59 < 13.04 > 13.47,9.08 < 9.46 > 9.82,11.31 < 12.06 > 12.79
Number of Clauses,1.36 < 1.47 > 1.57,1.69 < 1.84 > 1.98,1.74 < 1.89 > 2.04,1.40 < 1.53 > 1.65,1.48 < 1.68 > 1.87,1.65 < 1.80 > 1.94,1.74 < 1.86 > 1.97,2.12 < 2.25 > 2.39,1.63 < 1.75 > 1.85,2.00 < 2.19 > 2.39
Number of T-Units,1.07 < 1.14 > 1.20,1.07 < 1.14 > 1.20,1.02 < 1.07 > 1.11,1.06 < 1.14 > 1.19,1.05 < 1.12 > 1.18,1.05 < 1.10 > 1.15,1.09 < 1.14 > 1.19,1.32 < 1.40 > 1.48,1.11 < 1.17 > 1.22,1.23 < 1.32 > 1.40
Mean Sentence Length,7.35 < 7.80 > 8.24,8.27 < 8.75 > 9.23,8.16 < 8.65 > 9.14,4.79 < 5.28 > 5.74,6.48 < 7.31 > 8.11,9.12 < 9.64 > 10.13,8.24 < 8.69 > 9.14,12.59 < 13.04 > 13.47,9.08 < 9.46 > 9.82,11.25 < 12.00 > 12.71
Mean Clause Length,5.54 < 5.87 > 6.19,5.28 < 5.63 > 5.97,4.96 < 5.27 > 5.58,3.39 < 3.67 > 3.94,4.24 < 4.81 > 5.33,5.79 < 6.20 > 6.58,4.91 < 5.27 > 5.63,6.23 < 6.67 > 7.09,5.79 < 6.13 > 6.46,6.04 < 6.48 > 6.90
Mean T-Unit Length,6.86 < 7.25 > 7.64,7.85 < 8.30 > 8.75,7.96 < 8.44 > 8.92,4.34 < 4.78 > 5.18,5.99 < 6.76 > 7.50,8.57 < 9.10 > 9.60,7.52 < 7.97 > 8.42,9.78 < 10.39 > 10.96,8.10 < 8.47 > 8.84,9.08 < 9.64 > 10.17
Mean Number of Clauses per Sentence,1.33 < 1.43 > 1.52,1.62 < 1.75 > 1.88,1.71 < 1.86 > 2.00,1.38 < 1.51 > 1.63,1.47 < 1.65 > 1.83,1.65 < 1.80 > 1.94,1.74 < 1.86 > 1.97,2.12 < 2.25 > 2.39,1.63 < 1.75 > 1.85,1.99 < 2.18 > 2.37
Mean Number of Clauses per T-Unit,1.23 < 1.30 > 1.37,1.53 < 1.65 > 1.76,1.66 < 1.80 > 1.94,1.26 < 1.36 > 1.46,1.35 < 1.51 > 1.65,1.53 < 1.66 > 1.78,1.56 < 1.67 > 1.78,1.60 < 1.73 > 1.85,1.43 < 1.54 > 1.64,1.56 < 1.70 > 1.83
Mean Tree Depth,3.59 < 3.72 > 3.85,3.82 < 3.96 > 4.11,3.90 < 4.07 > 4.23,2.92 < 3.08 > 3.24,3.48 < 3.74 > 3.98,3.96 < 4.12 > 4.28,3.68 < 3.82 > 3.96,4.73 < 4.90 > 5.05,4.13 < 4.27 > 4.39,4.48 < 4.67 > 4.86
