In [33]:
import pandas as pd
import numpy as np
import os
import ast
import glob
import re

In [34]:
classifier = "dclm" # "dclm" "nvidia" "edu"

register_name_map = {'HI-IN_HI_dtp_OP': "HI-IN-HI-dtp-OP",
                     'HI-IN_HI_dtp_NA_OP': "HI-IN-HI-dtp-OP-NA",
                     'HI-IN_HI_dtp': "HI-IN-HI-dtp",
                     'HI-IN_HI_dtp_NA_OP_ID':"HI-IN-HI-dtp-OP-NA-ID",
                     'HI-IN_HI_dtp_NA_OP_ID_SP': "HI-IN-HI-dtp-OP-NA-ID-SP",
                     'hplt-v2-dedup': "All Registers (full HPLT v2)",#"HPLT v2",
                     'fineweb': "FineWeb",
                     'HI-IN': "Instructive-Informational (HI-IN)",
                     'HI': "How-to-Instructions (HI)",
                     'ID': "Interactive Discussion (ID)",
                     'IN': "Informational Description (IN)",
                     "IP": "Informational Persuasion (IP)",
                     "LY": "Lyrical (LY)",
                     "MT": "Machine Translation (MT)",
                     "NA": "Narrative (NA)",
                     "OP": "Opinion (OP)",
                     "SP": "Spoken (SP)",
                     "ne": "News (ne)",
                     "dtp": "Description (dtp)"}

# number of billions of tokens available
register_frequencies = {"HI": 100,
                        "ID": 314,
                        "IN": 695,
                        "IP": 421,
                        "LY": 20,
                        "MT": 306,
                        "NA": 545,
                        "OP": 416,
                        "SP": 32,
                        "HI-IN": 70,
                        "dtp": 781,
                        "ne":404}
max_value = np.max([i for i in register_frequencies.values()])
print(max_value)
register_multipliers = {k: float(v)/max_value for k,v in register_frequencies.items()}
print(register_multipliers)

781
{'HI': 0.12804097311139565, 'ID': 0.4020486555697823, 'IN': 0.8898847631241997, 'IP': 0.5390524967989757, 'LY': 0.02560819462227913, 'MT': 0.39180537772087065, 'NA': 0.6978233034571063, 'OP': 0.5326504481434059, 'SP': 0.040973111395646605, 'HI-IN': 0.08962868117797695, 'dtp': 1.0, 'ne': 0.5172855313700384}


In [35]:
datas = []

labels_all_hierarchy_with_other = {
    "MT": ["MT"],
    "LY": ["LY"],
    "SP": ["SP", "it", "os"],
    "ID": ["ID"],
    "NA": ["NA", "ne", "sr", "nb", "on"],
    "HI": ["HI", "re", "oh"],
    "IN": ["IN", "en", "ra", "dtp", "fi", "lt", "oi"],
    "OP": ["OP", "rv", "ob", "rs", "av", "oo"],
    "IP": ["IP", "ds", "ed", "oe"],
}

final_scores = {"MT": 0.374349,
    "LY": 0.357626,
    "SP": 0.422314,
    "ID": 0.430686,
    "NA": 0.441469,
    "HI": 0.446558,
    "IN": 0.437347,
    "OP":  0.446629,
    "IP": 0.393026,
    "HI-IN":0.464502,
    "ne": 0.417933,
    "dtp": 0.452347,
    }

with os.scandir(f"comparison-with-{classifier}/result") as f:
    for file in f:
        filename = file.name
        reg = filename.split("_")[0]
        print(f"reading {filename}")
        with open(f"comparison-with-{classifier}/result/"+filename) as f:
            try:
                data = [ast.literal_eval(line) for line in f]
            except:
                data = []
                for line in f:
                    line = re.sub(r"tensor\((\[.*?\])(?:, device='.*?')?\)", r"\1", line)
                    try:
                        data.append(ast.literal_eval(line))
                    except:
                        print("error:", line)
        df_ = pd.DataFrame(data)
        df_ = df_.drop(columns="text")
        df_["parsed_register"] = df_["register"].map(lambda reg: [r for r in reg if r in list(labels_all_hierarchy_with_other.keys())+["ne", "dtp"]])
        df_["parsed_register"] = df_["parsed_register"].map(lambda reg: "-".join(reg))
        df_["parsed_register"] = df_["parsed_register"].map(lambda reg: {"NA-ne":"ne", "ne-NA": "ne", "IN-dtp":"dtp", "dtp-IN":"dtp", "IN-HI":"HI-IN"}.get(reg, reg))
        df_["final_score"] = df_["parsed_register"].map(lambda reg: final_scores[reg])
        if classifier == "nvidia":
            pred_name = "predicted_class"
            df_["predicted_class"] = df_[pred_name].map(lambda x: int(x[0]))
        elif classifier == "edu":
            pred_name = "int_score"
            df_["predicted_class"] = df_[pred_name].map(lambda x: int(x))
        else:
            pred_name = "dclm-label"
            df_["predicted_class"] = df_[pred_name].map(lambda sc: {"__label__cc": 0, "__label__hq": 1}[sc])

        dropIndices = np.random.choice(df_["parsed_register"].index.tolist(), size = int(df_.shape[0]*(1.0-register_multipliers[reg])))
        df_.drop(dropIndices, inplace=True)
        datas.append(df_)



df = pd.concat(datas)
df["parsed_register"] = df["parsed_register"].map(lambda x: register_name_map[x])
#df.shuffle(inplace=True)
#df.to_csv("analysed_sample")

reading MT_with_edu_and_dclm.jsonl
reading NA_with_edu_and_dclm.jsonl
reading OP_with_edu_and_dclm.jsonl
reading HI_with_edu_and_dclm.jsonl
reading IP_with_edu_and_dclm.jsonl
reading LY_with_edu_and_dclm.jsonl
reading ID_with_edu_and_dclm.jsonl
reading HI-IN_with_edu_and_dclm.jsonl
reading dtp_with_edu_and_dclm.jsonl
reading ne_with_edu_and_dclm.jsonl
reading IN_with_edu_and_dclm.jsonl
reading SP_with_edu_and_dclm.jsonl


In [36]:
display(df)
print(np.unique(df["parsed_register"].tolist(), return_counts=True))

Unnamed: 0,id,register,score,int_score,dclm-label,dclm-prob,parsed_register,final_score,predicted_class
0,aa6e226e50524b13656a48cca50ef88b,[MT],0.226472,0,__label__cc,0.999351,Machine Translation (MT),0.374349,0
1,48ea34d9aa0ce1a6ac031ee3ee950eda,[MT],0.365965,0,__label__cc,0.999481,Machine Translation (MT),0.374349,0
3,820661e03daf8d1268772cd7a0c84856,[MT],-0.114357,0,__label__cc,0.997457,Machine Translation (MT),0.374349,0
7,eeede6967fa76fe14b7256e356ce9987,[MT],0.405946,0,__label__cc,0.999985,Machine Translation (MT),0.374349,0
10,76c02530f8d8b820c0a78c04d162f543,[MT],-0.157198,0,__label__cc,0.995654,Machine Translation (MT),0.374349,0
...,...,...,...,...,...,...,...,...,...
9981,1f784ce1adf91ed89a340f8a53465e3b,"[SP, it]",0.684884,1,__label__cc,0.994901,Spoken (SP),0.422314,0
9982,2255339f8dd8c02a30a5fc76757bbd14,"[SP, it]",1.215477,1,__label__cc,0.992300,Spoken (SP),0.422314,0
9992,018dfe3ebb37c9ba098fdb4e1ab6cba2,"[it, SP]",1.897866,2,__label__cc,0.919173,Spoken (SP),0.422314,0
9997,12b07b501cef77656fa8df40c38223eb,"[it, SP]",0.671556,1,__label__cc,0.931684,Spoken (SP),0.422314,0


(array(['Description (dtp)', 'How-to-Instructions (HI)',
       'Informational Description (IN)', 'Informational Persuasion (IP)',
       'Instructive-Informational (HI-IN)', 'Interactive Discussion (ID)',
       'Lyrical (LY)', 'Machine Translation (MT)', 'Narrative (NA)',
       'News (ne)', 'Opinion (OP)', 'Spoken (SP)'], dtype='<U33'), array([10000,  4202,  8957,  6353,  4000,  5479,  3775,  5452,  7385,
        6173,  6257,  3853]))


In [37]:
def cooccurrence_matrix(df):
    regs = np.unique(df["parsed_register"]).tolist()
    gens = np.unique(df["predicted_class"]).tolist()
    print("regs: ", regs)
    print("gens: ", gens)

    print(range(len(regs)))
    reg2index = {k:v for k,v in zip(regs, range(len(regs)))}
    gen2index = {k:v for k,v in zip(gens, range(len(gens)))}

    m = np.zeros((len(regs), len(gens)), dtype=np.float32)
    summa = 0
    for i,d in df.iterrows():
        r = d["parsed_register"]
        g = d["predicted_class"]

        m[reg2index[r],gen2index[g]] += 1
        summa+=1

    return m/summa


m = cooccurrence_matrix(df)

assert 0.999 < sum(m.sum(axis=0)) < 1.001 and 0.999 < sum(m.sum(axis=1)) < 1.001

regs:  ['Description (dtp)', 'How-to-Instructions (HI)', 'Informational Description (IN)', 'Informational Persuasion (IP)', 'Instructive-Informational (HI-IN)', 'Interactive Discussion (ID)', 'Lyrical (LY)', 'Machine Translation (MT)', 'Narrative (NA)', 'News (ne)', 'Opinion (OP)', 'Spoken (SP)']
gens:  [0, 1]
range(0, 12)


In [38]:
print(sum(m.sum(axis=0)))
print(sum(m.sum(axis=1)))

0.9999999999999999
0.9999999999999999


In [39]:
def entropy(m):
    p_gen = m.sum(axis=0)
    p_reg = m.sum(axis=1)

    def _entropy_1d(v):
        return -1*sum(v*np.log2(v))

    def _entropy_2d(m):
        ent = 0
        for j in m:
            for k in j:
                if k != 0:
                    ent += k*np.log2(k)
        return -1*ent

    ent_reg = _entropy_1d(p_reg)
    ent_gen = _entropy_1d(p_gen)
    ent_reg_gen = _entropy_2d(m)

    return ent_reg, ent_gen, ent_reg_gen


def mutual_info(m):
    reg_pdf = m.sum(axis=1)
    gen_pdf = m.sum(axis=0)

    info = 0
    for i in range(len(reg_pdf)):
        for j in range(len(gen_pdf)):
            if m[i,j]!= 0:
                info += m[i,j]*np.log(m[i,j]/(reg_pdf[i]*gen_pdf[j]))
    return info


print(entropy(m))
print(mutual_info(m))

(3.5136822363338065, 0.18562495994670153, 3.686409255301159)
0.008940171424864083


In [40]:
from scipy.stats import chi2_contingency

def cramers_v(df):
    x = df["parsed_register"]
    y = df["predicted_class"]

    confusion_matrix = pd.crosstab(x, y)
    print(confusion_matrix)
    chi2 = chi2_contingency(confusion_matrix)[0]
    print(f"chi2 p value {chi2_contingency(confusion_matrix)[1]}")
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

cramers_v = cramers_v(df)
print(cramers_v)

predicted_class                       0    1
parsed_register                             
Description (dtp)                  9810  190
How-to-Instructions (HI)           4119   83
Informational Description (IN)     8576  381
Informational Persuasion (IP)      6311   42
Instructive-Informational (HI-IN)  3813  187
Interactive Discussion (ID)        5079  400
Lyrical (LY)                       3452  323
Machine Translation (MT)           5386   66
Narrative (NA)                     7334   51
News (ne)                          6131   42
Opinion (OP)                       6123  134
Spoken (SP)                        3720  133
chi2 p value 3.7556970004068413e-296
0.13958547639233554


In [41]:
def cramers_v_cumulative(df):
    x = df["parsed_register"]
    y = df["predicted_class"]

    confusion_matrix = pd.crosstab(x, y)
    confusion_matrix = np.cumsum(confusion_matrix, axis = 1)
    print(confusion_matrix)
    chi2 = chi2_contingency(confusion_matrix)[0]
    print(f"chi2 p value {chi2_contingency(confusion_matrix)[1]}")
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

cramers_v_cum = cramers_v_cumulative(df)
print(cramers_v_cum)

predicted_class                       0      1
parsed_register                               
Description (dtp)                  9810  10000
How-to-Instructions (HI)           4119   4202
Informational Description (IN)     8576   8957
Informational Persuasion (IP)      6311   6353
Instructive-Informational (HI-IN)  3813   4000
Interactive Discussion (ID)        5079   5479
Lyrical (LY)                       3452   3775
Machine Translation (MT)           5386   5452
Narrative (NA)                     7334   7385
News (ne)                          6131   6173
Opinion (OP)                       6123   6257
Spoken (SP)                        3720   3853
chi2 p value 0.03874182574932648
0.008193881385837275


In [42]:
! pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [43]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assuming 'nominal_var' is your nominal column and 'ordinal_var' is numeric (1 to 5)
model = ols('predicted_class ~ C(parsed_register)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
eta_squared = anova_table['sum_sq']['C(parsed_register)'] / anova_table['sum_sq'].sum()
print(eta_squared)

0.019636856362400423


In [44]:
! pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [45]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode feature1
df['feature1_encoded'] = le.fit_transform(df['parsed_register'])

# Encode feature2
df['feature2_encoded'] = le.fit_transform(df['predicted_class'])

# Extract the encoded features
X = df[['feature1_encoded']]
y = df['feature2_encoded']

# Calculate mutual information
mi = mutual_info_classif(X, y, discrete_features=True)

print(f'Mutual Information between feature1 and feature2: {mi[0]}')

Mutual Information between feature1 and feature2: 0.008940171424864165


In [46]:
! pip install plotly
! pip install nbformat>=4.2.0

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [47]:
import pandas as pd
import plotly.graph_objects as go

# Example: sample data
# df = pd.DataFrame({
#     'parsed_register': ['A', 'B', 'A', 'C', 'B', 'C', 'A', 'B', 'C', 'A'],
#     'int_score': [1, 2, 2, 3, 3, 4, 1, 5, 4, 2]
# })

# Count pairs (grouped by nominal and ordinal labels)
df['predicted_class'] = df['predicted_class'].astype(str)  # make sure it's a string for plotting
counts = df.groupby(['parsed_register', 'predicted_class']).size().reset_index(name='count')

# Get all unique labels
all_labels = list(pd.unique(counts['parsed_register'].tolist() + counts['predicted_class'].tolist()))
label_to_index = {label: i for i, label in enumerate(all_labels)}

# Build Sankey inputs
source_indices = counts['parsed_register'].map(label_to_index)
target_indices = counts['predicted_class'].map(label_to_index)
values = counts['count']

# Create Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=10,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_labels,
    ),
    link=dict(
        source=source_indices,
        target=target_indices,
        value=values
    )
)])

#model_name_map = dict(edu="FineWeb-edu classifier", nvidia="NVIDIA NemoCurator", dclm ="DCLM classifier")
fig.update_layout(title_text="",font_size=16, width=900, margin=dict(l=20, r=20, t=20, b=20))#f"Registers against {model_name_map[classifier]} score", )
fig.write_image(f"sankey_{classifier}.svg", scale=3)
fig.show()


In [48]:
# final printing

print(f"Classifier: {classifier}")
print(f"Cramer's v: {cramers_v}")
print(f"Eta2: {eta_squared}")
print(f"Mutual information: {mi[0]}")
print(f"Cumulative Cramer's v: {cramers_v_cum}")



Classifier: dclm
Cramer's v: 0.13958547639233554
Eta2: 0.019636856362400423
Mutual information: 0.008940171424864165
Cumulative Cramer's v: 0.008193881385837275
