In [1]:
import pandas as pd
import numpy as np

from scipy.stats import (
    boxcox, f_oneway, normaltest, shapiro, yeojohnson, zscore, gzscore
)

In [2]:
data = dict()

fn = "../data/syntaxcomp_metrics_spacy.xlsx"
for sheet in pd.ExcelFile(fn).sheet_names:
    data[sheet] = pd.read_excel(fn, sheet_name=sheet, index_col="Unnamed: 0")

In [3]:
SOURCE_NAMES = [source for source in data.keys()]
COLUMN_NAMES = [col for col in data[SOURCE_NAMES[0]] if not col.startswith(
    "distractor"
)]

In [4]:
COLUMN_NAMES

['Number of Sentences',
 'Number of Words',
 'Number of Clauses',
 'Number of T-Units',
 'Mean Sentence Length',
 'Mean Clause Length',
 'Mean T-Unit Length',
 'Mean Number of Clauses per Sentence',
 'Mean Number of Clauses per T-Unit',
 'Mean Tree Depth',
 'Median Tree Depth',
 'Minimum Tree Depth',
 'Maximum Tree Depth',
 'Mean Dependency Distance',
 'Node-to-Terminal-Node Ratio',
 'Average Levenshtein Distance between POS',
 'Average Levenshtein Distance between deprel',
 'Average NP Length',
 'Complex NP Ratio',
 'Number of Combined Clauses',
 'Number of Coordinate Clauses',
 'Number of Subordinate Clauses',
 'Coordinate to Combined Clause Ratio',
 'Subordinate to Combined Clause Ratio',
 'Coordinate to Subordinate Clause Ratio',
 'Coordinate Clause to Sentence Ratio',
 'Subordinate Clause to Sentence Ratio',
 'ROOT_ratio',
 'root_ratio',
 'acl_ratio',
 'acl:relcl_ratio',
 'advcl_ratio',
 'advcl:relcl_ratio',
 'ccomp_ratio',
 'csubj_ratio',
 'csubj:outer_ratio',
 'nsubj:outer_ratio

In [5]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: normaltest(data[key][col].dropna()).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  b2 = skew(a, axis, _no_deco=True)
  b2 = kurtosis(a, axis, fisher=False, _no_deco=True)


Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,1.1e-05,0.0,0.0,0.00022,0.000166,0.04029,0.0,0.0,0.270824,0.222088,0.285255,0.19948,0.010015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,1e-06,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_PM,0.0,0.0,1e-06,0.0,0.0,0.0004,7.3e-05,0.0,0.0,0.078282,0.131588,0.103734,0.054752,0.001481,0.0,0.0,0.0,0.0,0.000179,1e-06,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_ANPM,0.0,0.00156,7e-06,0.0,0.004308,0.000429,0.001265,1e-06,0.0,0.037323,0.111929,0.144499,0.015195,0.084069,0.0,0.0,0.0,0.0,0.000237,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_GPT3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000456,0.000456,0.000377,0.000572,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_T5,0.0,7e-06,4.5e-05,0.0,0.001141,0.0,0.000731,0.003216,0.001025,0.309927,0.309927,0.314424,0.283554,0.0,0.016637,0.0,0.0,0.0,0.000417,0.001974,0.0,0.000972,0.0,0.0,0.0,0.0,0.001415,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,3e-06,0.0
RuRace_GPT3,,3.2e-05,2e-06,0.0,3.2e-05,3e-05,6.9e-05,2e-06,6e-06,0.001539,0.001539,0.001539,0.001539,2e-05,0.0,,,0.001341,0.003536,2e-06,0.0,3.1e-05,0.0,0.0,0.0,0.0,3.1e-05,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
RuRace_T5,,0.000167,0.001456,0.0,0.000167,0.0,0.000867,0.001456,6.9e-05,0.0,0.0,0.0,0.0,0.003323,0.0,,,0.0,0.0,0.001456,0.0,1e-06,0.0,0.0,0.0,0.0,1e-06,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
Deepseek,,0.083488,0.031401,6.7e-05,0.083488,0.0,0.011114,0.031401,1e-06,1e-06,1e-06,1e-06,1e-06,0.00021,0.0,,,0.0,0.132315,0.031401,6.7e-05,0.003191,2e-06,0.0,0.0,6.7e-05,0.003191,7e-06,,0.0,0.0,0.0,,0.0,0.0,,,0.0,1e-06,0.0
ChatGPT4o,,0.096339,0.0031,0.0,0.096339,0.002131,0.029995,0.0031,3e-06,0.081027,0.081027,0.081027,0.081027,0.0,0.0,,,3e-06,0.000354,0.0031,0.0,0.000119,0.0,0.0,0.0,0.0,0.000119,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
true_distractors,0.0,0.012075,1.3e-05,0.0,0.010901,0.000344,3e-06,6e-06,0.0,0.0,0.0,0.0,0.0,0.030143,0.0,0.0,0.0,0.0,0.114167,9e-06,0.0,7e-06,0.0,0.0,0.0,0.0,4e-06,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0


In [6]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: normaltest(np.log(data[key][col].dropna())).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  a_zero_mean = a - mean


Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,0.768302,1.9e-05,0.0,0.438619,0.058084,0.000859,1.5e-05,0.0,0.00134,0.0,0.0,0.002941,0.755698,0.007056,,,0.001924,,,,,,,,,,1.5e-05,,,,,,,,,,,,
BartDG_PM,0.0,0.326644,0.0,0.0,0.11541,0.022125,0.082216,0.0,0.0,0.019002,0.0,0.0,0.032082,0.543432,0.045695,,,0.000246,,,,,,,,,,0.0,,,,,,,,,,,,
BartDG_ANPM,0.0,0.374721,0.0,0.0,4e-06,0.883473,1.8e-05,0.0,0.0,0.135314,0.0,0.0,0.402092,0.525342,0.002737,,,0.008768,,,,,,,,,,0.0,,,,,,,,,,,,
MuSeRC_GPT3,0.0,0.02661,5e-05,0.0,0.017567,0.023684,0.003443,5.7e-05,0.0,0.0,0.0,0.0,0.0,0.568202,0.07525,,,0.00173,,,,,,,,,,5.7e-05,,,,,,,,,,,,
MuSeRC_T5,0.0,0.321012,5e-05,0.0,0.186068,0.079824,0.401027,0.0,4.4e-05,0.093709,0.093709,0.106936,0.083176,0.087157,0.851137,,,0.135647,,,,,,,,,,0.0,,,,,,,,,,,,
RuRace_GPT3,,0.37918,0.0,0.0,0.37918,0.078374,0.018916,0.0,0.0,0.841617,0.841617,0.841617,0.841617,0.081317,0.004723,,,0.003692,,,,,,,,,,0.0,,,,,,,,,,,,
RuRace_T5,,0.007,0.0,0.0,0.007,0.063653,0.003803,0.0,0.0,0.266384,0.266384,0.266384,0.266384,0.419496,0.002421,,,0.008369,,,,,,,,,,0.0,,,,,,,,,,,,
Deepseek,,0.639189,0.003091,0.0,0.639189,0.627472,0.000319,0.003091,2e-06,0.207178,0.207178,0.207178,0.207178,0.243919,0.0,,,0.66566,,,,,,,,,,0.003091,,,,,,,,,,,,
ChatGPT4o,,0.103026,0.0,0.0,0.103026,0.000778,0.349076,0.0,0.0,0.124149,0.124149,0.124149,0.124149,0.000504,0.008911,,,0.145169,,,,,,,,,,0.0,,,,,,,,,,,,
true_distractors,0.0,0.153057,0.0,0.0,0.164167,0.00181,0.276419,0.0,3.8e-05,0.266077,0.266077,0.313039,0.245019,0.437277,0.000531,,,0.538815,,,,,,,,,,0.0,,,,,,,,,,,,


In [7]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: normaltest(np.sqrt(data[key][col]).dropna()).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  b2 = skew(a, axis, _no_deco=True)
  b2 = kurtosis(a, axis, fisher=False, _no_deco=True)


Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,0.072552,2e-06,0.0,0.275675,0.339043,0.90238,2e-06,3e-06,0.253191,0.005542,0.005728,0.307431,0.282519,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_PM,0.0,6e-06,0.002668,0.0,0.008617,0.010024,0.45985,0.002559,0.000597,0.69112,0.021304,1e-06,0.709258,0.073171,0.000193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_ANPM,0.0,0.440036,0.000969,0.0,0.809643,0.050537,0.755273,0.001422,0.00178,0.735709,0.183167,0.192206,0.164695,0.795224,1.1e-05,0.0,0.0,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_GPT3,0.0,0.001289,0.0,0.0,0.001046,0.000676,4e-06,0.0,0.0,0.020468,0.020468,0.020255,0.018699,0.006544,4e-06,0.0,0.0,0.0,0.0,0.0,0.0,6e-06,0.0,0.0,0.0,0.0,6e-06,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_T5,0.0,0.072561,0.005666,0.0,0.306138,2e-06,0.18386,0.000942,0.001305,0.183685,0.183685,0.213134,0.15154,1.7e-05,0.242186,0.0,0.0,0.000163,0.001113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,9.3e-05,0.0
RuRace_GPT3,,0.098933,0.001233,0.0,0.098933,0.03738,0.353392,0.001233,0.000215,0.278785,0.278785,0.278785,0.278785,0.003119,2e-06,,,0.006867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
RuRace_T5,,0.248038,0.0,0.0,0.248038,0.0,0.724686,0.0,0.0,0.002208,0.002208,0.002208,0.002208,0.112884,6e-06,,,0.0,4.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
Deepseek,,0.735185,0.016863,0.0,0.735185,0.001167,0.001541,0.016863,0.001912,0.001879,0.001879,0.001879,0.001879,0.020843,0.0,,,0.007346,0.0,0.000316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000207,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
ChatGPT4o,,0.153603,0.0,0.0,0.153603,0.000502,0.353324,0.0,0.000276,0.14927,0.14927,0.14927,0.14927,0.0,8e-06,,,0.011893,8e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
true_distractors,0.0,0.124162,0.000729,0.0,0.172733,0.000122,0.240981,0.001144,4.6e-05,0.014272,0.014272,0.018089,0.016241,0.308996,1e-06,0.0,0.0,0.000138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0


In [8]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: normaltest(np.cbrt(data[key][col].dropna())).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  b2 = skew(a, axis, _no_deco=True)
  b2 = kurtosis(a, axis, fisher=False, _no_deco=True)


Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,0.297786,1.7e-05,0.0,0.726682,0.798353,0.643542,1.4e-05,2e-06,0.070574,0.000133,0.000163,0.102246,0.497275,3.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_PM,0.0,0.000805,0.000259,0.0,0.147676,0.013256,0.897933,0.000487,0.00016,0.387561,0.000547,0.0,0.488538,0.171588,0.001605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_ANPM,0.0,0.747102,1e-06,0.0,0.242685,0.162404,0.349649,1.6e-05,0.000262,0.999453,0.006585,0.007158,0.22439,0.961266,8.1e-05,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_GPT3,0.0,0.203669,3e-06,0.0,0.187833,0.056741,0.014269,4e-06,0.0,0.003967,0.003967,0.004473,0.003007,0.044915,0.000255,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_T5,0.0,0.433575,0.00527,0.0,0.806475,0.00016,0.578724,8e-05,0.000722,0.139789,0.139789,0.163709,0.116607,0.000632,0.442945,0.0,0.0,0.002585,0.000747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.000128,0.0
RuRace_GPT3,,0.416382,0.000145,0.0,0.416382,0.084153,0.62668,0.000145,3.6e-05,0.63669,0.63669,0.63669,0.63669,0.0114,3.6e-05,,,0.005203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
RuRace_T5,,0.487111,0.0,0.0,0.487111,2.1e-05,0.648814,0.0,0.0,0.019797,0.019797,0.019797,0.019797,0.215849,5.8e-05,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
Deepseek,,0.938973,0.010289,0.0,0.938973,0.021107,0.000513,0.010289,0.002309,0.013209,0.013209,0.013209,0.013209,0.061308,0.0,,,0.091511,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000576,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
ChatGPT4o,,0.138704,0.0,0.0,0.138704,0.00034,0.411569,0.0,5.5e-05,0.151479,0.151479,0.151479,0.151479,2e-06,0.000113,,,0.05384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
true_distractors,0.0,0.210163,1.5e-05,0.0,0.281112,0.000117,0.834553,5.3e-05,0.000451,0.130832,0.130832,0.156946,0.139868,0.404617,8e-06,0.0,0.0,0.006051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0


In [9]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: normaltest(zscore(data[key][col].dropna())).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  col: normaltest(zscore(data[key][col].dropna())).pvalue for col in COLUMN_NAMES
  col: normaltest(zscore(data[key][col].dropna())).pvalue for col in COLUMN_NAMES
  col: normaltest(zscore(data[key][col].dropna())).pvalue for col in COLUMN_NAMES
  col: normaltest(zscore(data[key][col].dropna())).pvalue for col in COLUMN_NAMES


Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,1.1e-05,0.0,0.0,0.00022,0.000166,0.04029,0.0,0.0,0.270824,0.222088,0.285255,0.19948,0.010015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,1e-06,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_PM,0.0,0.0,1e-06,0.0,0.0,0.0004,7.3e-05,0.0,0.0,0.078282,0.131588,0.103734,0.054752,0.001481,0.0,0.0,0.0,0.0,0.000179,1e-06,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_ANPM,0.0,0.00156,7e-06,0.0,0.004308,0.000429,0.001265,1e-06,0.0,0.037323,0.111929,0.144499,0.015195,0.084069,0.0,0.0,0.0,0.0,0.000237,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_GPT3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000456,0.000456,0.000377,0.000572,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_T5,0.0,7e-06,4.5e-05,0.0,0.001141,0.0,0.000731,0.003216,0.001025,0.309927,0.309927,0.314424,0.283554,0.0,0.016637,0.0,0.0,0.0,0.000417,0.001974,0.0,0.000972,0.0,0.0,0.0,0.0,0.001415,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,3e-06,0.0
RuRace_GPT3,,3.2e-05,2e-06,0.0,3.2e-05,3e-05,6.9e-05,2e-06,6e-06,0.001539,0.001539,0.001539,0.001539,2e-05,0.0,,,0.001341,0.003536,2e-06,0.0,3.1e-05,0.0,0.0,0.0,0.0,3.1e-05,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
RuRace_T5,,0.000167,0.001456,0.0,0.000167,0.0,0.000867,0.001456,6.9e-05,0.0,0.0,0.0,0.0,0.003323,0.0,,,0.0,0.0,0.001456,0.0,1e-06,0.0,0.0,0.0,0.0,1e-06,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
Deepseek,,0.083488,0.031401,6.7e-05,0.083488,0.0,0.011114,0.031401,1e-06,1e-06,1e-06,1e-06,1e-06,0.00021,0.0,,,0.0,0.132315,0.031401,6.7e-05,0.003191,2e-06,0.0,0.0,6.7e-05,0.003191,7e-06,,0.0,0.0,0.0,,0.0,0.0,,,0.0,1e-06,0.0
ChatGPT4o,,0.096339,0.0031,0.0,0.096339,0.002131,0.029995,0.0031,3e-06,0.081027,0.081027,0.081027,0.081027,0.0,0.0,,,3e-06,0.000354,0.0031,0.0,0.000119,0.0,0.0,0.0,0.0,0.000119,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
true_distractors,0.0,0.012075,1.3e-05,0.0,0.010901,0.000344,3e-06,6e-06,0.0,0.0,0.0,0.0,0.0,0.030143,0.0,0.0,0.0,0.0,0.114167,9e-06,0.0,7e-06,0.0,0.0,0.0,0.0,4e-06,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0


In [10]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: normaltest(gzscore(data[key][col].dropna())).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
  a_zero_mean = a - mean
  return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan

Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,0.768302,1.9e-05,0.0,0.438619,0.058084,0.000859,1.5e-05,0.0,0.00134,0.0,0.0,0.002941,0.755698,0.007056,,,0.001924,,,,,,,,,,1.5e-05,,,,,,,,,,,,
BartDG_PM,0.0,0.326644,0.0,0.0,0.11541,0.022125,0.082216,0.0,0.0,0.019002,0.0,0.0,0.032082,0.543432,0.045695,,,0.000246,,,,,,,,,,0.0,,,,,,,,,,,,
BartDG_ANPM,0.0,0.374721,0.0,0.0,4e-06,0.883473,1.8e-05,0.0,0.0,0.135314,0.0,0.0,0.402092,0.525342,0.002737,,,0.008768,,,,,,,,,,0.0,,,,,,,,,,,,
MuSeRC_GPT3,0.0,0.02661,5e-05,0.0,0.017567,0.023684,0.003443,5.7e-05,0.0,0.0,0.0,0.0,0.0,0.568202,0.07525,,,0.00173,,,,,,,,,,5.7e-05,,,,,,,,,,,,
MuSeRC_T5,0.0,0.321012,5e-05,0.0,0.186068,0.079824,0.401027,0.0,4.4e-05,0.093709,0.093709,0.106936,0.083176,0.087157,0.851137,,,0.135647,,,,,,,,,,0.0,,,,,,,,,,,,
RuRace_GPT3,,0.37918,0.0,0.0,0.37918,0.078374,0.018916,0.0,0.0,0.841617,0.841617,0.841617,0.841617,0.081317,0.004723,,,0.003692,,,,,,,,,,0.0,,,,,,,,,,,,
RuRace_T5,,0.007,0.0,0.0,0.007,0.063653,0.003803,0.0,0.0,0.266384,0.266384,0.266384,0.266384,0.419496,0.002421,,,0.008369,,,,,,,,,,0.0,,,,,,,,,,,,
Deepseek,,0.639189,0.003091,0.0,0.639189,0.627472,0.000319,0.003091,2e-06,0.207178,0.207178,0.207178,0.207178,0.243919,0.0,,,0.66566,,,,,,,,,,0.003091,,,,,,,,,,,,
ChatGPT4o,,0.103026,0.0,0.0,0.103026,0.000778,0.349076,0.0,0.0,0.124149,0.124149,0.124149,0.124149,0.000504,0.008911,,,0.145169,,,,,,,,,,0.0,,,,,,,,,,,,
true_distractors,0.0,0.153057,0.0,0.0,0.164167,0.00181,0.276419,0.0,3.8e-05,0.266077,0.266077,0.313039,0.245019,0.437277,0.000531,,,0.538815,,,,,,,,,,0.0,,,,,,,,,,,,


In [11]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: normaltest(yeojohnson(data[key][col].dropna())[0]).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  b2 = skew(a, axis, _no_deco=True)
  b2 = kurtosis(a, axis, fisher=False, _no_deco=True)


Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,0.685202,0.0,0.0,0.899112,0.880239,0.882622,0.0,0.0,0.562266,0.239194,0.278009,0.530186,0.676953,0.997276,0.0,0.0,0.971008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_PM,0.0,0.787952,0.0,0.0,0.441919,0.017377,0.941523,0.0,0.0,0.748537,0.279589,0.123049,0.740213,0.787852,0.807036,0.0,0.0,0.763774,0.000373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
BartDG_ANPM,0.0,0.732364,0.0,0.0,0.873689,0.615823,0.841152,0.0,0.0,0.977945,0.606335,0.681152,0.271139,0.933476,0.927901,0.0,0.0,0.446328,0.000344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_GPT3,0.0,0.77563,0.0,0.0,0.690673,0.316668,0.180736,0.0,0.0,0.0,0.0,0.0,0.0,0.999106,0.785618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
MuSeRC_T5,,0.984155,0.0,0.0,0.976027,0.206107,0.984583,0.0,0.0,0.160051,0.160051,0.182647,0.131781,0.614654,0.919671,0.0,0.0,0.249559,0.000156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.00014,0.0
RuRace_GPT3,,0.75563,0.0,0.0,0.75563,0.044268,0.680091,0.0,0.0,0.981129,0.981129,0.981129,0.981129,0.466485,0.726712,,,0.001153,0.000381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
RuRace_T5,,0.535536,0.0,0.0,0.535536,0.975193,0.874755,0.0,0.0,0.401128,0.401128,0.401128,0.401128,0.383651,0.060916,,,0.123441,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
Deepseek,,0.953381,0.01623,0.0,0.953381,0.9406,0.000195,0.01623,0.0,0.998334,0.998334,0.998334,0.998334,0.366949,0.267446,,,0.802473,0.113958,0.01771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003523,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
ChatGPT4o,,0.130005,0.0,0.0,0.130005,0.000289,0.378078,0.0,0.0,0.139386,0.139386,0.139386,0.139386,0.095483,0.958643,,,0.036824,0.012909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0
true_distractors,0.0,0.177568,0.0,0.0,0.244579,0.000999,0.980423,0.0,0.0,0.427364,0.427364,0.488051,0.413232,0.379407,0.987788,0.0,0.0,0.975502,0.050328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,,,0.0,0.0,0.0


In [21]:
normaltest_results = []
for key in SOURCE_NAMES:
    normaltest_result = {
        col: shapiro(yeojohnson(data[key][col].dropna())[0]).pvalue for col in COLUMN_NAMES
    }
    normaltest_results.append(normaltest_result)
normaltest_results = pd.DataFrame(normaltest_results, index=SOURCE_NAMES)
normaltest_results.style.apply(lambda x: ["background: green" if v > 0.05 else "" for v in x], axis = 1)

  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,Number of Sentences,Number of Words,Number of Clauses,Number of T-Units,Mean Sentence Length,Mean Clause Length,Mean T-Unit Length,Mean Number of Clauses per Sentence,Mean Number of Clauses per T-Unit,Mean Tree Depth,Median Tree Depth,Minimum Tree Depth,Maximum Tree Depth,Mean Dependency Distance,Node-to-Terminal-Node Ratio,Average Levenshtein Distance between POS,Average Levenshtein Distance between deprel,Average NP Length,Complex NP Ratio,Number of Combined Clauses,Number of Coordinate Clauses,Number of Subordinate Clauses,Coordinate to Combined Clause Ratio,Subordinate to Combined Clause Ratio,Coordinate to Subordinate Clause Ratio,Coordinate Clause to Sentence Ratio,Subordinate Clause to Sentence Ratio,ROOT_ratio,root_ratio,acl_ratio,acl:relcl_ratio,advcl_ratio,advcl:relcl_ratio,ccomp_ratio,csubj_ratio,csubj:outer_ratio,nsubj:outer_ratio,parataxis_ratio,xcomp_ratio,conj_ratio
BartDG,0.0,0.01595,0.0,0.0,0.040947,0.082381,0.024822,0.0,0.0,0.0,0.0,0.0,0.0,0.593904,0.04253,0.0,0.0,0.021379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
BartDG_PM,0.0,0.069674,0.0,0.0,0.087876,0.015693,0.198261,0.0,0.0,0.0,0.0,0.0,0.0,0.257471,0.019704,0.0,0.0,0.004207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
BartDG_ANPM,0.0,0.081883,0.0,0.0,0.154879,0.0275,0.143055,0.0,0.0,0.0,0.0,0.0,0.0,0.461584,0.015713,0.0,0.0,0.000176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
MuSeRC_GPT3,0.0,0.011065,0.0,0.0,0.008666,0.000713,0.000765,0.0,0.0,0.0,0.0,0.0,0.0,0.005344,0.001007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
MuSeRC_T5,0.0,0.25805,0.0,0.0,0.15787,0.076806,0.127531,0.0,0.0,5.2e-05,5.2e-05,4.6e-05,4.1e-05,0.534607,0.048158,0.0,0.0,0.00018,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
RuRace_GPT3,1.0,0.06546,0.0,0.0,0.06546,0.044619,0.111224,0.0,0.0,0.0,0.0,0.0,0.0,0.550325,0.041505,1.0,1.0,0.000707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
RuRace_T5,1.0,0.032258,0.0,0.0,0.032258,0.187203,0.152092,0.0,0.0,0.0,0.0,0.0,0.0,0.480784,0.012807,1.0,1.0,0.000239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
Deepseek,1.0,0.028797,0.0,0.0,0.028797,0.387258,0.005938,0.0,0.0,0.0,0.0,0.0,0.0,0.291671,0.020799,1.0,1.0,0.084291,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
ChatGPT4o,1.0,0.003852,0.0,0.0,0.003852,0.003148,0.008623,0.0,0.0,0.0,0.0,0.0,0.0,0.086684,0.001028,1.0,1.0,0.002417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
true_distractors,0.0,0.126274,0.0,0.0,0.169712,0.001712,0.250038,0.0,0.0,0.0,0.0,0.0,0.0,0.595282,0.060401,0.0,0.0,0.022481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [12]:
SOURCE_NAMES

['BartDG',
 'BartDG_PM',
 'BartDG_ANPM',
 'MuSeRC_GPT3',
 'MuSeRC_T5',
 'RuRace_GPT3',
 'RuRace_T5',
 'Deepseek',
 'ChatGPT4o',
 'true_distractors']

In [13]:
yeoj = lambda x: yeojohnson(x)[0]

In [14]:
anova_results = dict()

for metric in COLUMN_NAMES:
    metric_vectors = []

    for source in SOURCE_NAMES:
        metric_vectors.append(yeoj(data[source][metric].dropna()))

    anova_results[metric] = f_oneway(*metric_vectors)

anova_results

  res = hypotest_fun_out(*samples, **kwds)


{'Number of Sentences': F_onewayResult(statistic=np.float64(4.4754455021172096e+17), pvalue=np.float64(0.0)),
 'Number of Words': F_onewayResult(statistic=np.float64(413.29625476925446), pvalue=np.float64(0.0)),
 'Number of Clauses': F_onewayResult(statistic=np.float64(558.2280419639201), pvalue=np.float64(0.0)),
 'Number of T-Units': F_onewayResult(statistic=np.float64(97339.82559188112), pvalue=np.float64(0.0)),
 'Mean Sentence Length': F_onewayResult(statistic=np.float64(236.65993908653698), pvalue=np.float64(7.612223800329311e-283)),
 'Mean Clause Length': F_onewayResult(statistic=np.float64(232.93592598499035), pvalue=np.float64(8.674788214835288e-280)),
 'Mean T-Unit Length': F_onewayResult(statistic=np.float64(130.192300264167), pvalue=np.float64(3.875831121046862e-182)),
 'Mean Number of Clauses per Sentence': F_onewayResult(statistic=np.float64(596.5262924075361), pvalue=np.float64(0.0)),
 'Mean Number of Clauses per T-Unit': F_onewayResult(statistic=np.float64(1154.3126750871

In [15]:
avg_syn_stats = pd.read_excel("../data/avg_syntaxcomp_metrics_spacy.xlsx", index_col="Unnamed: 0")

In [16]:
avg_syn_stats

Unnamed: 0,BartDG,BartDG_PM,BartDG_ANPM,MuSeRC_GPT3,MuSeRC_T5,RuRace_GPT3,RuRace_T5,Deepseek,ChatGPT4o,true_distractors
Number of Sentences,1.04,1.06,1.04,1.01,1.01,1.0,1.0,1.0,1.0,1.01
Number of Words,7.93,9.14,8.75,5.32,7.45,9.64,8.69,13.04,9.46,12.06
Number of Clauses,1.47,1.84,1.89,1.53,1.68,1.8,1.86,2.25,1.75,2.19
Number of T-Units,1.14,1.14,1.07,1.14,1.12,1.1,1.14,1.4,1.17,1.32
Mean Sentence Length,7.8,8.75,8.65,5.28,7.31,9.64,8.69,13.04,9.46,12.0
Mean Clause Length,5.87,5.63,5.27,3.67,4.81,6.2,5.27,6.67,6.13,6.48
Mean T-Unit Length,7.25,8.3,8.44,4.78,6.76,9.1,7.97,10.39,8.47,9.64
Mean Number of Clauses per Sentence,1.43,1.75,1.86,1.51,1.65,1.8,1.86,2.25,1.75,2.18
Mean Number of Clauses per T-Unit,1.3,1.65,1.8,1.36,1.51,1.66,1.67,1.73,1.54,1.7
Mean Tree Depth,3.72,3.96,4.07,3.08,3.74,4.12,3.82,4.9,4.27,4.67


In [17]:
avg_syn_stats["F-score"] = {key: val.statistic for key, val in anova_results.items()}
avg_syn_stats["p-value"] = {key: val.pvalue for key, val in anova_results.items()}

In [18]:
avg_syn_stats

Unnamed: 0,BartDG,BartDG_PM,BartDG_ANPM,MuSeRC_GPT3,MuSeRC_T5,RuRace_GPT3,RuRace_T5,Deepseek,ChatGPT4o,true_distractors,F-score,p-value
Number of Sentences,1.04,1.06,1.04,1.01,1.01,1.0,1.0,1.0,1.0,1.01,4.475446e+17,0.0
Number of Words,7.93,9.14,8.75,5.32,7.45,9.64,8.69,13.04,9.46,12.06,413.2963,0.0
Number of Clauses,1.47,1.84,1.89,1.53,1.68,1.8,1.86,2.25,1.75,2.19,558.228,0.0
Number of T-Units,1.14,1.14,1.07,1.14,1.12,1.1,1.14,1.4,1.17,1.32,97339.83,0.0
Mean Sentence Length,7.8,8.75,8.65,5.28,7.31,9.64,8.69,13.04,9.46,12.0,236.6599,7.612223999999999e-283
Mean Clause Length,5.87,5.63,5.27,3.67,4.81,6.2,5.27,6.67,6.13,6.48,232.9359,8.674788e-280
Mean T-Unit Length,7.25,8.3,8.44,4.78,6.76,9.1,7.97,10.39,8.47,9.64,130.1923,3.8758309999999996e-182
Mean Number of Clauses per Sentence,1.43,1.75,1.86,1.51,1.65,1.8,1.86,2.25,1.75,2.18,596.5263,0.0
Mean Number of Clauses per T-Unit,1.3,1.65,1.8,1.36,1.51,1.66,1.67,1.73,1.54,1.7,1154.313,0.0
Mean Tree Depth,3.72,3.96,4.07,3.08,3.74,4.12,3.82,4.9,4.27,4.67,802.4674,0.0


In [19]:
avg_syn_stats.to_excel(
    "../data/avg_syntaxcomp_metrics_anova_spacy.xlsx", float_format="%.2f"
)