## SPARQL proficiency

To assess the robustness of the SPARQL proficiency metric, we recompute proficiency scores under four weighting schemes — the original weights plus three alternatives (equal-weight, correctness-heavy, and feature-heavy) — while keeping all sub-metrics constant.

In [2]:
import json
import time
import re
import requests
import pandas as pd

def compute_sparql_proficiency_sensitivity_check(
    row,
    weight_wc,
    weight_fed,
    weight_feat,
    weight_diversity
):
    """Return the weighted sum of the four proficiency sub-metrics in ``row``.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide the keys 'correct_normalized', 'federated_bonus',
        'normalized_sparql_feature_metric' and 'diversity_metric'.
    weight_wc, weight_fed, weight_feat, weight_diversity : number
        Weights multiplied with those four sub-metrics, in that order.

    Returns
    -------
    The sum of every sub-metric multiplied by its weight.
    """
    correctness_term = row['correct_normalized'] * weight_wc
    federation_term = row['federated_bonus'] * weight_fed
    feature_term = row['normalized_sparql_feature_metric'] * weight_feat
    diversity_term = row['diversity_metric'] * weight_diversity

    # Added strictly left to right so floating-point results are reproducible.
    return correctness_term + federation_term + feature_term + diversity_term

# Weight sets used for the sensitivity check. Every scheme supplies the four
# keyword arguments expected by compute_sparql_proficiency_sensitivity_check,
# and the four weights of each scheme add up to 1.
weighting_schemes = dict(
    original=dict(
        weight_wc=0.10, weight_fed=0.10, weight_feat=0.40, weight_diversity=0.40
    ),
    equal=dict(
        weight_wc=0.25, weight_fed=0.25, weight_feat=0.25, weight_diversity=0.25
    ),
    correctness_heavy=dict(
        weight_wc=0.40, weight_fed=0.20, weight_feat=0.20, weight_diversity=0.20
    ),
    feature_heavy=dict(
        weight_wc=0.10, weight_fed=0.10, weight_feat=0.50, weight_diversity=0.30
    ),
)

# Load the stage-three proficiency table (the first CSV column is the index).
df_proficiency_final = pd.read_csv(
    "outputs/sparql_proficiency_stage_three.csv",
    index_col=0
)

# Add one `sparql_proficiency_<scheme>` column per weighting scheme.
# The scoring function only indexes columns and performs arithmetic, so it is
# called once on the whole DataFrame: this yields the same values as a
# row-wise `apply(axis=1)` without the per-row Python overhead.
for scheme_name, weights in weighting_schemes.items():
    df_proficiency_final[f'sparql_proficiency_{scheme_name}'] = (
        compute_sparql_proficiency_sensitivity_check(
            df_proficiency_final,
            **weights
        )
    )

# Persist the table, including the new columns, for the correlation cells.
df_proficiency_final.to_csv("outputs/sparql_proficiency_sensitivity.csv")

## Readability

In [3]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# Inputs: readability scores, the proficiency variants written by the
# sensitivity step, and alternation scores (first CSV column is the index).
df_readability = pd.read_csv('outputs/readability_scores.csv', index_col=0)
df_sparql = pd.read_csv('outputs/sparql_proficiency_sensitivity.csv', index_col=0)
df_alternation = pd.read_csv('outputs/alternation_scores.csv', index_col=0)

# Inner-join the three tables on story_id, so only stories present in all of
# them are kept.
readability_cols = ['story_id', 'flesch_reading_ease', 'normalized_readability']
df_temp = df_readability[readability_cols].merge(
    df_sparql,
    on='story_id',
    how='inner',
)
df_combined = df_temp.merge(
    df_alternation[['story_id', 'alternation_score']],
    on='story_id',
    how='inner',
)

# Exclude the stories that have no text
story_no_text = ['SXefpzf4', 'EzsIH_Et', '6yGct8pP']
lacks_text = df_combined['story_id'].isin(story_no_text)
df_filtered = df_combined[~lacks_text]

# Collect the proficiency variants: columns named 'sparql_proficiency_*'
sparql_columns = []
for column_name in df_filtered.columns:
    if column_name.startswith('sparql_proficiency_'):
        sparql_columns.append(column_name)

# Correlate Flesch reading ease with each proficiency variant
results = []
readability = df_filtered['flesch_reading_ease']

for col in sparql_columns:
    proficiency = df_filtered[col]
    r_pearson, p_pearson = pearsonr(readability, proficiency)
    rho_spearman, p_spearman = spearmanr(readability, proficiency)

    results.append(dict(
        weighting_scheme=col.replace('sparql_proficiency_', ''),
        pearson_r=r_pearson,
        pearson_p=p_pearson,
        spearman_r=rho_spearman,
        spearman_p=p_spearman,
    ))

# One row per weighting scheme, for easy inspection / export
df_results = pd.DataFrame(results)

print(df_results.round(3))



    weighting_scheme  pearson_r  pearson_p  spearman_r  spearman_p
0           original     -0.215      0.143      -0.291       0.045
1              equal     -0.153      0.299      -0.282       0.052
2  correctness_heavy     -0.143      0.331      -0.253       0.083
3      feature_heavy     -0.217      0.139      -0.297       0.041


**Pattern**

Direction is stable across all weighting schemes: as SPARQL proficiency increases, readability (Flesch Reading Ease) tends to decrease.

Effect sizes are small-to-moderate (Spearman |ρ| ≈ 0.25–0.30; Pearson |r| ≈ 0.14–0.22).

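Significance is marginal and scheme-dependent: the Spearman correlation is significant at α = 0.05 for the original (p = 0.045) and feature-heavy (p = 0.041) schemes, and shows a non-significant trend for the equal (p = 0.052) and correctness-heavy (p = 0.083) schemes. Pearson correlations are non-significant under every scheme.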

**Summary** The negative relationship does not depend on a specific weighting, but it is weak and should be treated as exploratory. Readability is influenced by many factors beyond query proficiency; the results are consistent with our theoretical claim that technical sophistication can trade off with surface readability, but not strongly or deterministically.

## Coherence


In [5]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# Inputs: coherence scores, the proficiency variants written by the
# sensitivity step, and alternation scores (first CSV column is the index).
df_coherence = pd.read_csv('outputs/coherence_scores.csv', index_col=0)
df_sparql = pd.read_csv('outputs/sparql_proficiency_sensitivity.csv', index_col=0)
df_alternation = pd.read_csv('outputs/alternation_scores.csv', index_col=0)

# Inner-join the three tables on story_id, so only stories present in all of
# them are kept.
coherence_part = df_coherence[['story_id', 'coherence_score']]
alternation_part = df_alternation[['story_id', 'alternation_score']]
df_temp = coherence_part.merge(df_sparql, on='story_id', how='inner')
df_combined = df_temp.merge(alternation_part, on='story_id', how='inner')

# Exclude the stories that have no text
story_no_text = ['SXefpzf4', 'EzsIH_Et', '6yGct8pP']
keep_row = ~df_combined['story_id'].isin(story_no_text)
df_filtered = df_combined[keep_row]

# Collect the proficiency variants: columns named 'sparql_proficiency_*'
sparql_columns = list(
    filter(
        lambda name: name.startswith('sparql_proficiency_'),
        df_filtered.columns,
    )
)

# Correlate the coherence score with each proficiency variant
results = []
coherence = df_filtered['coherence_score']

for col in sparql_columns:
    proficiency = df_filtered[col]
    r_pearson, p_pearson = pearsonr(coherence, proficiency)
    rho_spearman, p_spearman = spearmanr(coherence, proficiency)

    scheme_row = {'weighting_scheme': col.replace('sparql_proficiency_', '')}
    scheme_row['pearson_r'] = r_pearson
    scheme_row['pearson_p'] = p_pearson
    scheme_row['spearman_r'] = rho_spearman
    scheme_row['spearman_p'] = p_spearman
    results.append(scheme_row)

# One row per weighting scheme
df_results = pd.DataFrame(results)

print(df_results.round(3))


    weighting_scheme  pearson_r  pearson_p  spearman_r  spearman_p
0           original      0.200      0.172       0.438       0.002
1              equal      0.135      0.361       0.429       0.002
2  correctness_heavy      0.128      0.387       0.373       0.009
3      feature_heavy      0.197      0.180       0.428       0.002


**Pattern** Spearman correlations are moderate and highly stable

ρ ≈ 0.37–0.44

p ≤ 0.01 across all schemes

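Pearson correlations are weaker and non-significant; this is not a concern, because the coherence score is bounded/ordinal and a rank-based measure is the more appropriate choice.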

**Summary** The SPARQL–coherence relationship is monotonic, not linear. It is robust to all weighting schemes and it is not an artifact of how proficiency is operationalized.

## Alternation (good structure)


In [6]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Inputs: alternation scores and the proficiency variants written by the
# sensitivity step (first CSV column is the index).
df_alternation = pd.read_csv('outputs/alternation_scores.csv', index_col=0)
df_sparql = pd.read_csv('outputs/sparql_proficiency_sensitivity.csv', index_col=0)

# Inner-join on story_id, so only stories present in both tables are kept
alternation_part = df_alternation[['story_id', 'alternation_score']]
df_combined = alternation_part.merge(df_sparql, on='story_id', how='inner')

# Exclude the stories that have no text
story_no_text = ['SXefpzf4', 'EzsIH_Et', '6yGct8pP']
is_textless = df_combined['story_id'].isin(story_no_text)
df_filtered = df_combined[~is_textless]

# Collect the proficiency variants: columns named 'sparql_proficiency_*'
sparql_columns = [
    name
    for name in df_filtered.columns
    if name.startswith('sparql_proficiency_')
]

# Correlate the alternation score with each proficiency variant
results = []

for col in sparql_columns:
    # Plain NumPy arrays are handed to the SciPy routines
    alternation_values = np.array(df_filtered['alternation_score'])
    proficiency_values = np.array(df_filtered[col])

    r_pearson, p_pearson = pearsonr(alternation_values, proficiency_values)
    rho_spearman, p_spearman = spearmanr(alternation_values, proficiency_values)

    results.append(dict(
        weighting_scheme=col.replace('sparql_proficiency_', ''),
        pearson_r=r_pearson,
        pearson_p=p_pearson,
        spearman_r=rho_spearman,
        spearman_p=p_spearman,
    ))

# One row per weighting scheme
df_results = pd.DataFrame(results)

print(df_results.round(3))


    weighting_scheme  pearson_r  pearson_p  spearman_r  spearman_p
0           original      0.221      0.131       0.080       0.589
1              equal      0.222      0.130       0.016       0.914
2  correctness_heavy      0.148      0.315      -0.001       0.997
3      feature_heavy      0.229      0.118       0.092       0.536


**Pattern** Pearson: small positive correlations (r ≈ 0.15–0.23), non-significant. Spearman: near zero (ρ ≈ 0.00–0.09), unstable, non-significant.

**Summary** No robust association between SPARQL proficiency and alternation. Any apparent relationship in the original analysis was likely weak or sample-sensitive. Alternation is probably a stylistic or pedagogical choice, not a skill proxy.