## 1 — Imports and Configuration

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression
from IPython.display import display

file_paths = {
    "en_es": "final_en_es_2.csv",
    "en_it": "final_en_it_2.csv",
    "en_pt": "final_en_pt_2.csv",
}

pair_labels = {
    "en_es": "English–Spanish",
    "en_it": "English–Italian",
    "en_pt": "English–Portuguese",
}


## 2 — Global Correlations (Spearman + Pearson log-HF)

In [2]:
global_results = []

for key, path in file_paths.items():
    df = pd.read_csv(path)

    
    df["src_median_hf_days"] = pd.to_numeric(df["src_median_hf_days"], errors="coerce")
    df["tgt_median_hf_days"] = pd.to_numeric(df["tgt_median_hf_days"], errors="coerce")
    df = df.dropna(subset=["src_median_hf_days", "tgt_median_hf_days"])

    
    rho, p_spear = stats.spearmanr(df["src_median_hf_days"], df["tgt_median_hf_days"])

    
    r, p_pear = stats.pearsonr(np.log1p(df["src_median_hf_days"]),
                               np.log1p(df["tgt_median_hf_days"]))

    global_results.append([
        key, len(df), rho, p_spear, r, p_pear
    ])

global_df = pd.DataFrame(global_results,
    columns=["pair","n","spearman_r","spearman_p",
             "pearson_r_loghf_days","pearson_p_loghf"])

display(global_df)


Unnamed: 0,pair,n,spearman_r,spearman_p,pearson_r_loghf_days,pearson_p_loghf
0,en_es,1322,0.323143,1.6510620000000002e-33,0.2789,4.802133000000001e-25
1,en_it,955,0.082948,0.0103347,0.108256,0.0008054347
2,en_pt,1138,0.085544,0.003878367,0.077501,0.008909892


## 3 — POS-Wise Correlations

In [3]:
pos_results = []

for key, path in file_paths.items():
    df = pd.read_csv(path)

    for col in ["src_median_hf_days","tgt_median_hf_days"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.dropna(subset=["src_pos","src_median_hf_days","tgt_median_hf_days"])

    for pos, sub in df.groupby("src_pos"):
        if len(sub) < 30:
            continue
        rho, p = stats.spearmanr(sub["src_median_hf_days"], sub["tgt_median_hf_days"])
        r, p_l = stats.pearsonr(np.log1p(sub["src_median_hf_days"]),
                                np.log1p(sub["tgt_median_hf_days"]))
        pos_results.append([key, pos, len(sub), rho, p, r, p_l])

pos_df = pd.DataFrame(pos_results,
   columns=["pair","pos","n","spearman_r","spearman_p",
            "pearson_r_loghf_days","pearson_p_loghf"])

display(pos_df)


Unnamed: 0,pair,pos,n,spearman_r,spearman_p,pearson_r_loghf_days,pearson_p_loghf
0,en_es,adj,122,0.291909,0.001104687,0.238512,0.00815409
1,en_es,adv,71,0.331972,0.004679901,0.408839,0.0004003709
2,en_es,noun,955,0.300907,1.943981e-21,0.252502,2.345187e-15
3,en_es,verb,107,0.158453,0.1030762,0.143979,0.1389899
4,en_it,adj,124,0.080586,0.3736128,0.11933,0.1868128
5,en_it,adv,83,0.014139,0.899044,0.171598,0.1208642
6,en_it,noun,575,0.046359,0.2670687,0.07183,0.08527068
7,en_it,verb,102,0.098041,0.3269248,0.151199,0.1292781
8,en_pt,adj,96,0.169679,0.09838466,0.290772,0.004052817
9,en_pt,adv,63,0.201904,0.1125439,0.134891,0.2918601


## 4 — Frequency-Bin Correlations

In [4]:
freq_results = []

for key, path in file_paths.items():
    df = pd.read_csv(path)

    
    for col in ["frequency", "src_median_hf_days", "tgt_median_hf_days"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    
    df = df.dropna(subset=["frequency", "src_median_hf_days", "tgt_median_hf_days"])

    
    df["freq_bin"] = pd.qcut(df["frequency"], q=4, labels=False, duplicates="drop")

    for b, sub in df.groupby("freq_bin"):
        if len(sub) < 30:
            continue

        
        rho, p = stats.spearmanr(sub["src_median_hf_days"], sub["tgt_median_hf_days"])

       
        r, p_l = stats.pearsonr(
            np.log1p(sub["src_median_hf_days"]),
            np.log1p(sub["tgt_median_hf_days"])
        )

        freq_results.append([key, b, len(sub), rho, p, r, p_l])

freq_df = pd.DataFrame(
    freq_results,
    columns=["pair", "freq_bin", "n",
             "spearman_r", "spearman_p",
             "pearson_r_loghf_days", "pearson_p_loghf"]
)

display(freq_df)


Unnamed: 0,pair,freq_bin,n,spearman_r,spearman_p,pearson_r_loghf_days,pearson_p_loghf
0,en_es,0,335,0.324998,1.112422e-09,0.292881,4.740783e-08
1,en_es,1,328,0.256744,2.461599e-06,0.204437,0.0001931354
2,en_es,2,332,0.324132,1.468845e-09,0.253865,2.797146e-06
3,en_es,3,327,0.376959,1.759199e-12,0.353883,4.420711e-11
4,en_it,0,243,-0.014219,0.8254639,0.015133,0.8144491
5,en_it,1,235,0.064081,0.3280201,0.106293,0.104087
6,en_it,2,238,0.052719,0.4181765,0.058074,0.372417
7,en_it,3,239,0.236488,0.0002248591,0.301516,2.055975e-06
8,en_pt,0,289,0.142352,0.01544221,0.149385,0.01099616
9,en_pt,1,291,0.085342,0.1464425,0.023585,0.6886697


## 5 — Helper functions for log-partial correlations

In [5]:
def partial_corr_loghf(x_raw, y_raw, controls):
    
    x = np.log1p(x_raw.values)
    y = np.log1p(y_raw.values)
    Z = np.column_stack([c.values for c in controls])

    lr_x = LinearRegression().fit(Z, x)
    lr_y = LinearRegression().fit(Z, y)

    x_res = x - lr_x.predict(Z)
    y_res = y - lr_y.predict(Z)

    r, p = stats.pearsonr(x_res, y_res)
    return r, p

def add_bins(df, col, n_bins=4, new_col=None):
    if new_col is None:
        new_col = col + "_bin"
    df[new_col] = pd.qcut(df[col], q=n_bins, labels=False, duplicates="drop")
    return df


## 6 — Partial Correlations Across Bins (session_count, length, concreteness, frequency)

In [6]:
partial_bin_results = []

for key, path in file_paths.items():
    df = pd.read_csv(path)

    #
    cols = ["src_median_hf_days","tgt_median_hf_days",
            "src_session_count","tgt_session_count",
            "src_lemma_length","tgt_lemma_length",
            "concreteness","frequency"]
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(subset=cols)

    
    df = add_bins(df, "src_session_count", new_col="src_session_bin")
    df = add_bins(df, "tgt_session_count", new_col="tgt_session_bin")
    df = add_bins(df, "src_lemma_length", new_col="src_len_bin")
    df = add_bins(df, "tgt_lemma_length", new_col="tgt_len_bin")
    df = add_bins(df, "concreteness", new_col="concreteness_bin")
    df = add_bins(df, "frequency", new_col="frequency_bin")

    for bin_col in ["src_session_bin","tgt_session_bin",
                    "src_len_bin","tgt_len_bin",
                    "concreteness_bin","frequency_bin"]:
        for b, sub in df.groupby(bin_col):
            if len(sub) < 30:
                continue
            r, p = partial_corr_loghf(
                sub["src_median_hf_days"], sub["tgt_median_hf_days"],
                [
                    sub["src_session_count"],
                    sub["tgt_session_count"],
                    sub["src_lemma_length"],
                    sub["tgt_lemma_length"],
                    sub["concreteness"],
                    sub["frequency"]
                ]
            )
            partial_bin_results.append([key, bin_col, b, len(sub), r, p])

partial_bins_df = pd.DataFrame(partial_bin_results,
   columns=["pair","bin_type","bin","n","partial_r_loghf","p"])

display(partial_bins_df)


Unnamed: 0,pair,bin_type,bin,n,partial_r_loghf,p
0,en_es,src_session_bin,0,334,0.025221,6.460302e-01
1,en_es,src_session_bin,1,328,0.037193,5.020556e-01
2,en_es,src_session_bin,2,329,0.083905,1.288187e-01
3,en_es,src_session_bin,3,331,0.287843,9.807413e-08
4,en_es,tgt_session_bin,0,335,0.148968,6.302269e-03
...,...,...,...,...,...,...
67,en_pt,concreteness_bin,3,282,0.083369,1.626494e-01
68,en_pt,frequency_bin,0,289,0.026686,6.514257e-01
69,en_pt,frequency_bin,1,291,-0.032853,5.767286e-01
70,en_pt,frequency_bin,2,276,-0.064777,2.835414e-01


## 7 — POS-Wise Partial Correlations (log-HF)

In [7]:
partial_pos_results = []

for key, path in file_paths.items():
    df = pd.read_csv(path)

    cols = ["src_median_hf_days","tgt_median_hf_days",
            "src_session_count","tgt_session_count",
            "src_lemma_length","tgt_lemma_length",
            "concreteness","frequency"]
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(subset=cols + ["src_pos"])

    for pos, sub in df.groupby("src_pos"):
        if len(sub) < 30:
            continue
        r, p = partial_corr_loghf(
            sub["src_median_hf_days"], sub["tgt_median_hf_days"],
            [
                sub["src_session_count"],
                sub["tgt_session_count"],
                sub["src_lemma_length"],
                sub["tgt_lemma_length"],
                sub["concreteness"],
                sub["frequency"]
            ]
        )
        partial_pos_results.append([key, pos, len(sub), r, p])

partial_pos_df = pd.DataFrame(partial_pos_results,
   columns=["pair","pos","n","partial_r_loghf","p"])

display(partial_pos_df)


Unnamed: 0,pair,pos,n,partial_r_loghf,p
0,en_es,adj,122,0.122092,0.180349
1,en_es,adv,71,0.29355,0.01297
2,en_es,noun,955,0.12194,0.000158
3,en_es,verb,107,0.118722,0.223239
4,en_it,adj,124,0.008642,0.924108
5,en_it,adv,83,-0.091876,0.408756
6,en_it,noun,575,0.030952,0.458829
7,en_it,verb,102,0.031205,0.755538
8,en_pt,adj,96,0.098596,0.339217
9,en_pt,adv,63,-0.033363,0.795186


## 8 — Semantic-Field Partial Correlations (log-HF)

In [9]:
partial_sem_results = []

for key, path in file_paths.items():
    df = pd.read_csv(path)

    cols = ["src_median_hf_days","tgt_median_hf_days",
            "src_session_count","tgt_session_count",
            "src_lemma_length","tgt_lemma_length",
            "concreteness","frequency"]
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(subset=cols + ["semantic_field"])

    for sf, sub in df.groupby("semantic_field"):
        if len(sub) < 30:
            continue
        r, p = partial_corr_loghf(
            sub["src_median_hf_days"], sub["tgt_median_hf_days"],
            [
                sub["src_session_count"],
                sub["tgt_session_count"],
                sub["src_lemma_length"],
                sub["tgt_lemma_length"],
                sub["concreteness"],
                sub["frequency"]
            ]
        )
        partial_sem_results.append([key, sf, len(sub), r, p])

partial_sem_df = pd.DataFrame(partial_sem_results,
   columns=["pair","semantic_field","n","partial_r_loghf","p"])

display(partial_sem_df)


Unnamed: 0,pair,semantic_field,n,partial_r_loghf,p
0,en_es,act,103,0.193306,0.050419
1,en_es,all,216,0.306103,5e-06
2,en_es,artifact,125,-0.06851,0.447754
3,en_es,attribute,54,-0.042322,0.761235
4,en_es,cognition,93,0.114399,0.274881
5,en_es,communication,109,0.131659,0.17236
6,en_es,group,84,0.33491,0.001846
7,en_es,location,44,0.141869,0.358295
8,en_es,person,76,0.010243,0.930018
9,en_es,state,46,0.145468,0.334742


## 9 — Save to Text Report

In [11]:
report = ""

def add_section(title, df):
    global report
    report += "\n" + "="*80 + "\n" + title + "\n" + "="*80 + "\n"
    report += df.to_string(index=False) + "\n"

add_section("GLOBAL CORRELATIONS", global_df)
add_section("POS-WISE CORRELATIONS", pos_df)
add_section("FREQUENCY-BIN CORRELATIONS", freq_df)
add_section("PARTIAL CORRELATIONS (BINS)", partial_bins_df)
add_section("PARTIAL CORRELATIONS (POS)", partial_pos_df)
add_section("PARTIAL CORRELATIONS (SEMANTIC FIELDS)", partial_sem_df)

with open("RQ1_full_report.txt", "w", encoding="utf-8") as f:
    f.write(report)

print("Saved RQ1_full_report.txt")


Saved RQ1_full_report.txt


## 10 - Top-25 aligned / Top-25 discordant

In [12]:
import pandas as pd
import numpy as np

paths = {
    "en_es": "final_en_es_2.csv",
    "en_it": "final_en_it_2.csv",
    "en_pt": "final_en_pt_2.csv",
}

def load_and_prepare(path):
    df = pd.read_csv(path)

    
    for c in ["src_median_hf_days", "tgt_median_hf_days"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")


    df = df.dropna(subset=["src_lemma", "tgt_lemma", "src_median_hf_days", "tgt_median_hf_days"])

    
    df["src_log_days"] = np.log1p(df["src_median_hf_days"])
    df["tgt_log_days"] = np.log1p(df["tgt_median_hf_days"])

    
    df["log_gap"] = df["src_log_days"] - df["tgt_log_days"]
    df["abs_log_gap"] = df["log_gap"].abs()

    return df


MIN_DAYS_FOR_ALIGNED = 10  

for pair, path in paths.items():
    df = load_and_prepare(path)

    aligned_pool = df[df[["src_median_hf_days", "tgt_median_hf_days"]].min(axis=1) >= MIN_DAYS_FOR_ALIGNED]
    most_aligned = aligned_pool.sort_values("abs_log_gap", ascending=True).head(25)

    least_aligned = df.sort_values("abs_log_gap", ascending=False).head(25)

    cols_out = [
        "src_lemma", "tgt_lemma",
        "src_median_hf_days", "tgt_median_hf_days",
        "src_log_days", "tgt_log_days",
        "log_gap", "abs_log_gap"
    ]

    most_aligned[cols_out].to_csv(f"{pair}_top25_most_aligned.csv", index=False)
    least_aligned[cols_out].to_csv(f"{pair}_top25_least_aligned.csv", index=False)

print("Saved: *_top25_most_aligned.csv and *_top25_least_aligned.csv for en_es, en_it, en_pt")


Saved: *_top25_most_aligned.csv and *_top25_least_aligned.csv for en_es, en_it, en_pt
