In [1]:
import pandas as pd

In [2]:
# Load the annotated error-analysis table. Leaving the frame as the last
# expression lets the notebook render it with its (truncated) rich display.
csv_path = 'cleaned_error_analysis.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0.1,Unnamed: 0,pmid,start,end,tag,text,source,correct,error_type,annotator
0,0,58651,26,42,Group Characteristic,renal transplant,model,y,,ccurtis
1,1,58651,144,149,Drug Intervention,HAHTG,human,y,,ccurtis
2,2,58651,165,175,Non-Study Drug,prednisone,human,y,wrong label,ccurtis
3,3,58651,180,192,Non-Study Drug,azathioprine,human,y,wrong label,ccurtis
4,4,58651,194,200,Drug Intervention,lmuran,model,n,wrong label,ccurtis
...,...,...,...,...,...,...,...,...,...,...
5561,5561,36206137,2226,2228,Quantitative Measurement,53,human,n,too little,eduvaris
5562,5562,36206137,2226,2230,Quantitative Measurement,53 %,model,y,,eduvaris
5563,5563,36206137,2235,2237,Quantitative Measurement,47,human,n,too little,eduvaris
5564,5564,36206137,2235,2239,Quantitative Measurement,47 %,model,y,,eduvaris


Total Number of tagged mentions

In [5]:
# Split the annotations by who produced them ('model' vs 'human' in `source`).
source_column = df['source']
df_model = df.loc[source_column.eq('model')]
df_human = df.loc[source_column.eq('human')]
print("Number of model tagged mentions :", df_model.shape[0])
print("Number of human tagged mentions :", df_human.shape[0])

Number of model tagged mentions : 3103
Number of human tagged mentions : 2463


Number of pmids

In [15]:
# Count the distinct PMIDs present in each subset (nunique skips missing values).
model_unique_pmids, human_unique_pmids = (
    frame['pmid'].nunique() for frame in (df_model, df_human)
)
print(f"Model unique PMIDs: {model_unique_pmids}")
print(f"Human unique PMIDs: {human_unique_pmids}")

Model unique PMIDs: 237
Human unique PMIDs: 236


Different pmids

In [29]:
# Build each PMID set once, then compare them in both directions.
model_pmids = set(df_model['pmid'])
human_pmids = set(df_human['pmid'])

pmid_difference1 = model_pmids - human_pmids  # only in the model rows
pmid_difference2 = human_pmids - model_pmids  # only in the human rows
pmid_difference = pmid_difference1 | pmid_difference2

print("The pmids that are not in common are :", pmid_difference)
print("Model was evaluated on", pmid_difference1, " that human was not evaluated on")
print("Human was evaluated on", pmid_difference2, " that model was not evaluated on")

The pmids that are not in common are : {8151609, 2719459, 3582605}
Model was evaluated on {8151609, 3582605}  that human was not evaluated on
Human was evaluated on {2719459}  that model was not evaluated on


Different error types

In [9]:
# Distinct error_type values on the model rows; rows without one show up as nan.
{*df_model['error_type']}

{'multiple entities',
 nan,
 'not capturable',
 'not needed',
 'too little',
 'too little or too much based on context',
 'too much',
 'wrong label'}

Number of correctly labeled mentions

In [35]:
# A mention counts as correct when it has no error_type AND is not marked
# correct == 'n' (rows with an empty error_type can still be flagged 'n').
model_correct_mask = df_model['error_type'].isna() & (df_model['correct'] != 'n')
human_correct_mask = df_human['error_type'].isna() & (df_human['correct'] != 'n')

model_correct_label_count = model_correct_mask.sum()
human_correct_label_count = human_correct_mask.sum()

print(f"Number of correctly labeled mentions by the model: {model_correct_label_count}, {model_correct_label_count/len(df_model)*100}%")
print(f"Number of correctly labeled mentions by the human: {human_correct_label_count}, {human_correct_label_count/len(df_human)*100}%")

Number of correctly labeled mentions by the model: 2218, 71.47921366419594%
Number of correctly labeled mentions by the human: 1240, 50.345107592367036%


Number of incorrectly labeled mentions

In [36]:
# A mention counts as incorrect when it carries an error_type OR is marked
# correct == 'n' without one. This is the complement of the "correctly
# labeled" count, so the two always add up to the number of rows.
# The variable names are kept as-is because later cells use them as the
# denominator of the per-error-type percentages.
model_incorrect_mask = df_model['error_type'].notna() | (df_model['correct'] == 'n')
human_incorrect_mask = df_human['error_type'].notna() | (df_human['correct'] == 'n')

model_uncorrect_label_count = model_incorrect_mask.sum()
human_uncorrect_label_count = human_incorrect_mask.sum()

# The previous message read "correctly tagged mentions but uncorrectly
# labeled", yet the count covers every kind of error (span errors, unneeded
# tags, untyped 'n' rows), not only wrong labels on correct spans.
print(f"Number of incorrectly labeled mentions by the model: {model_uncorrect_label_count}, {model_uncorrect_label_count/len(df_model)*100}%")
print(f"Number of incorrectly labeled mentions by the human: {human_uncorrect_label_count}, {human_uncorrect_label_count/len(df_human)*100}%")

Number of correctly tagged mentions but uncorrectly labeled by the model: 885, 28.52078633580406%
Number of correctly tagged mentions but uncorrectly labeled by the human: 1223, 49.654892407632964%


Type of error : wrong label / too much / too little / multiple entities / not needed (= shouldn't have been tagged)

In [54]:
def _count_error_type(frame, error_type):
    """Return how many rows of `frame` have exactly this `error_type` value."""
    return frame[frame['error_type'] == error_type].shape[0]


def _count_untyped_incorrect(frame):
    """Return how many rows are marked correct == 'n' with no error_type set."""
    return frame[(frame['correct'] == 'n') & (frame['error_type'].isna())].shape[0]


def _print_share(description, count, total):
    """Print `count` followed by the percentage it represents of `total`."""
    print(f"{description}: {count}, {count/total*100}%")


# Percentages are relative to each annotator's total number of incorrect
# mentions. Only the error types listed below are counted individually.

model_wrong_label_count = _count_error_type(df_model, 'wrong label')
human_wrong_label_count = _count_error_type(df_human, 'wrong label')
_print_share("Number of mentions that got 'wrong label' from the model", model_wrong_label_count, model_uncorrect_label_count)
_print_share("Number of mentions that got 'wrong label' from the human", human_wrong_label_count, human_uncorrect_label_count)

model_too_little_count = _count_error_type(df_model, 'too little')
human_too_little_count = _count_error_type(df_human, 'too little')
_print_share("Number of mentions that got 'too little' from the model", model_too_little_count, model_uncorrect_label_count)
_print_share("Number of mentions that got 'too little' from the human", human_too_little_count, human_uncorrect_label_count)

model_not_needed_tag_count = _count_error_type(df_model, 'not needed')
human_not_needed_tag_count = _count_error_type(df_human, 'not needed')
_print_share("Number of mentions that got 'not needed' by the model", model_not_needed_tag_count, model_uncorrect_label_count)
_print_share("Number of mentions that got 'not needed' by the human", human_not_needed_tag_count, human_uncorrect_label_count)

model_too_much_count = _count_error_type(df_model, 'too much')
human_too_much_count = _count_error_type(df_human, 'too much')
_print_share("Number of mentions that got 'too much' from the model", model_too_much_count, model_uncorrect_label_count)
_print_share("Number of mentions that got 'too much' from the human", human_too_much_count, human_uncorrect_label_count)

model_multiple_entities_count = _count_error_type(df_model, 'multiple entities')
human_multiple_entities_count = _count_error_type(df_human, 'multiple entities')
_print_share("Number of mentions that got 'multiple entities' from the model", model_multiple_entities_count, model_uncorrect_label_count)
_print_share("Number of mentions that got 'multiple entities' from the human", human_multiple_entities_count, human_uncorrect_label_count)

model_little_much_count = _count_error_type(df_model, 'too little or too much based on context')
human_little_much_count = _count_error_type(df_human, 'too little or too much based on context')
_print_share("Number of mentions that got 'too little or too much based on context' from the model", model_little_much_count, model_uncorrect_label_count)
_print_share("Number of mentions that got 'too little or too much based on context' from the human", human_little_much_count, human_uncorrect_label_count)

# Rows flagged correct == 'n' that carry no error_type at all.
model_empty_count = _count_untyped_incorrect(df_model)
human_empty_count = _count_untyped_incorrect(df_human)
_print_share("Number of mentions that shouldn't have been tagged by the model", model_empty_count, model_uncorrect_label_count)
_print_share("Number of mentions that shouldn't have been tagged by the human", human_empty_count, human_uncorrect_label_count)

Number of mentions that got 'wrong label' from the model: 302, 34.12429378531073%
Number of mentions that got 'wrong label' from the human: 481, 39.329517579721994%
Number of mentions that got 'too little' from the model: 239, 27.005649717514125%
Number of mentions that got 'too little' from the human: 279, 22.812755519215045%
Number of mentions that got 'not needed' by the model: 141, 15.932203389830507%
Number of mentions that got 'not needed' by the human: 129, 10.54783319705642%
Number of mentions that got 'too much' from the model: 129, 14.576271186440678%
Number of mentions that got 'too much' from the human: 219, 17.906786590351594%
Number of mentions that got 'multiple entities' from the model: 41, 4.632768361581921%
Number of mentions that got 'multiple entities' from the human: 79, 6.459525756336877%
Number of mentions that got 'too little or too much based on context' from the model: 14, 1.5819209039548021%
Number of mentions that got 'too little or too much based on context

In [51]:
# Sanity check: add up the individually reported categories and compare with
# the total number of incorrect mentions. The human sum previously used
# `human_uncorrect_tag_count`, a name that is not defined anywhere in this
# notebook (it fails on a fresh kernel); it is replaced by the two categories
# the model sum already uses: 'not needed' plus the untyped 'n' rows.
model_listed_total = (
    model_not_needed_tag_count
    + model_empty_count
    + model_little_much_count
    + model_multiple_entities_count
    + model_too_little_count
    + model_too_much_count
    + model_wrong_label_count
)
human_listed_total = (
    human_not_needed_tag_count
    + human_empty_count
    + human_little_much_count
    + human_multiple_entities_count
    + human_too_little_count
    + human_too_much_count
    + human_wrong_label_count
)

# Any gap between a listed total and the overall count comes from error_type
# values that are not counted individually (e.g. 'not capturable').
print(model_listed_total)
print(model_uncorrect_label_count)
print(human_listed_total)
print(human_uncorrect_label_count)


884
885
1218
1223


In [55]:
# Frequency of each error_type on the model rows (missing values excluded),
# preceded by the total number of typed errors.
model_error_counts = df_model['error_type'].value_counts()
print(model_error_counts.sum())
model_error_counts

867


wrong label                                302
too little                                 239
not needed                                 141
too much                                   129
multiple entities                           41
too little or too much based on context     14
not capturable                               1
Name: error_type, dtype: int64

In [48]:
# Frequency of each error_type on the human rows (missing values excluded).
df_human.loc[:, 'error_type'].value_counts()

wrong label                                       481
too little                                        279
too much                                          219
not needed                                        129
multiple entities                                  79
too little or too much based on context             8
too much or multiple entities based on context      3
not necessary/wrong label                           2
Name: error_type, dtype: int64

# AlpacaEval : Using LLM as evaluator

In [1]:
import numpy as np
import math

In [2]:
def sigmoid(x):
    """Logistic function: 1 / (1 + exp(-x))."""
    return 1 / (1 + (np.exp(-x)))


def f1(x1, x2):
    """Return cos(x1) * cos(x2) + sigmoid(x2)."""
    return np.cos(x1) * np.cos(x2) + sigmoid(x2)


def f2(x1, x2):
    """Return log(x1 + x2) + x1**2 * x2."""
    return np.log(x1 + x2) + x1**2 * x2


# Evaluate both functions at the reference point (1, 2).
print(f1(1,2))
print(f2(1,2))

0.6559519826117296

In [4]:
# Forward finite differences of f1 and f2 around (w1, w2) = (1, 2):
# df/dw_i ~ (f(w + dw_i) - f(w)) / dw_i
w1, w2 = 1, 2
dw1 = dw2 = 0.01

# Both functions are evaluated once at the base point and reused below.
f1_at_w = f1(w1, w2)
f2_at_w = f2(w1, w2)

df1_dw1 = (f1(w1 + dw1, w2) - f1_at_w) / dw1
df1_dw2 = (f1(w1, w2 + dw2) - f1_at_w) / dw2

df2_dw1 = (f2(w1 + dw1, w2) - f2_at_w) / dw1
df2_dw2 = (f2(w1, w2 + dw2) - f2_at_w) / dw2

print(f"{df1_dw1=}, {df1_dw2=}, {df2_dw1=}, {df2_dw2=}")

df1_dw1=0.3512938682533773, df1_dw2=-0.38556867082343294, df2_dw1=4.352779009267449, df2_dw2=1.3327790092674707


In [12]:
# Closed-form partial derivatives of f1 and f2.
# NOTE: these definitions rebind the names df1_dw1 ... df2_dw2, which the
# finite-difference cell uses for plain numbers.

def df1_dw1(x1, x2):
    """d f1 / d x1 = -sin(x1) * cos(x2)."""
    return - np.sin(x1) * np.cos(x2)


def df1_dw2(x1, x2):
    """d f1 / d x2 = -cos(x1) * sin(x2) + sigmoid(x2) * (1 - sigmoid(x2))."""
    return - np.cos(x1) * np.sin(x2) + sigmoid(x2) * (1 - sigmoid(x2))


def df2_dw1(x1, x2):
    """d f2 / d x1 = 1 / (x1 + x2) + 2 * x1 * x2."""
    return 1/(x1 + x2) + 2*x1*x2


def df2_dw2(x1, x2):
    """d f2 / d x2 = 1 / (x1 + x2) + x1**2."""
    return 1/(x1 + x2) + x1**2


print(f"{df1_dw1(1,2)=}, {df1_dw2(1,2)=}, {df2_dw1(1,2)=}, {df2_dw2(1,2)=}")

df1_dw1(1,2)=0.35017548837401463, df1_dw2(1,2)=Array(-0.38630188, dtype=float32), df2_dw1(1,2)=4.333333333333333, df2_dw2(1,2)=1.3333333333333333


In [5]:
import numpy as np

# Sigmoid function
def sigmoid(x):
    """Logistic function: 1 / (1 + exp(-x))."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Function definitions
def f1(w1, w2):
    """Return cos(w1) * cos(w2) + sigmoid(w2)."""
    cosine_product = np.cos(w1) * np.cos(w2)
    return cosine_product + sigmoid(w2)

def f2(w1, w2):
    """Return log(w1 + w2) + w1**2 * w2."""
    log_term = np.log(w1 + w2)
    polynomial_term = w1**2 * w2
    return log_term + polynomial_term

# Vector function f = [f1, f2]
def f(w):
    """Evaluate f1 and f2 at the two-component point `w` and stack the results."""
    first, second = w
    components = [f1(first, second), f2(first, second)]
    return np.array(components)

# Numerical differentiation to compute Jacobian
def numerical_jacobian(f, w, delta_w=0.01):
    """Approximate the Jacobian of `f` at `w` with forward finite differences.

    Column i of the result is (f(w + delta_w * e_i) - f(w)) / delta_w, where
    e_i is the i-th unit vector.

    Parameters
    ----------
    f : callable
        Maps a 1-D array of parameters to a 1-D array (or scalar) of outputs.
        It is called once at `w` and once per perturbed point.
    w : array-like
        Point at which to differentiate. It is copied, never modified.
    delta_w : float, default 0.01
        Step added to one coordinate at a time.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of outputs, number of parameters).
    """
    # Work on a float copy: adding a fractional step in place to an integer
    # array raises a casting error, and this also accepts plain lists/tuples.
    w = np.array(w, dtype=float)

    # f(w) does not depend on the loop variable, so evaluate it only once.
    base_value = np.atleast_1d(f(w))

    jacobian = np.zeros((len(base_value), len(w)))
    for i in range(len(w)):
        w_plus = w.copy()
        w_plus[i] += delta_w
        jacobian[:, i] = (np.atleast_1d(f(w_plus)) - base_value) / delta_w
    return jacobian

# Evaluate the finite-difference Jacobian at w = (1, 2)
w = np.array([1.0, 2.0])
jacobian = numerical_jacobian(f, w)

print("Jacobian at w = (1, 2):", jacobian, sep="\n")


Jacobian at w = (1, 2):
[[ 0.35129387 -0.38556867]
 [ 4.35277901  1.33277901]]


In [11]:
import jax
import jax.numpy as jnp

# Define the sigmoid function
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), built from jax.numpy operations."""
    denominator = 1 + jnp.exp(-x)
    return 1 / denominator

# Define the functions f1 and f2
def f1(w):
    """Return cos(w1) * cos(w2) + sigmoid(w2) for the two-component input `w`."""
    first, second = w
    cosine_product = jnp.cos(first) * jnp.cos(second)
    return cosine_product + sigmoid(second)

def f2(w):
    """Return log(w1 + w2) + w1**2 * w2 for the two-component input `w`."""
    first, second = w
    log_term = jnp.log(first + second)
    polynomial_term = first**2 * second
    return log_term + polynomial_term

# Combine f1 and f2 into a vector function f
def f(w):
    """Stack f1(w) and f2(w) into a single jax array."""
    components = [f1(w), f2(w)]
    return jnp.array(components)

# Compute the Jacobian using JAX's jacfwd (forward-mode differentiation)
w = jnp.array([1.0, 2.0])
jacobian_of_f_fwd = jax.jacfwd(f)  # function that returns the Jacobian of f
jacobian_fwd = jacobian_of_f_fwd(w)

print("Jacobian forward at w = (1, 2):", jacobian_fwd, sep="\n")

Jacobian forward at w = (1, 2):
[[ 0.35017547 -0.38630188]
 [ 4.3333335   1.3333334 ]]


In [10]:
# Define the sigmoid function
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), built from jax.numpy operations."""
    denominator = 1 + jnp.exp(-x)
    return 1 / denominator

# Define the functions f1 and f2
def f1(w):
    """Return cos(w1) * cos(w2) + sigmoid(w2) for the two-component input `w`."""
    first, second = w
    cosine_product = jnp.cos(first) * jnp.cos(second)
    return cosine_product + sigmoid(second)

def f2(w):
    """Return log(w1 + w2) + w1**2 * w2 for the two-component input `w`."""
    first, second = w
    log_term = jnp.log(first + second)
    polynomial_term = first**2 * second
    return log_term + polynomial_term

# Combine f1 and f2 into a vector function f
def f(w):
    """Stack f1(w) and f2(w) into a single jax array."""
    components = [f1(w), f2(w)]
    return jnp.array(components)

# Compute the Jacobian using JAX's jacrev (reverse-mode differentiation)
w = jnp.array([1.0, 2.0])
jacobian_of_f_rev = jax.jacrev(f)  # function that returns the Jacobian of f
jacobian_rev = jacobian_of_f_rev(w)

print("Jacobian backward at w = (1, 2):", jacobian_rev, sep="\n")

Jacobian backward at w = (1, 2):
[[ 0.35017547 -0.38630188]
 [ 4.3333335   1.3333334 ]]
