In [2]:
import json
with open("candidates.json") as f:
	data = json.load(f)

data[0]

{'method': 'tool_variable_dataset_gpt-4o',
 'text_id': 0,
 'annotation': 'susceptibles (S)',
 'ann_type': 'var desc',
 'prediction': '1. S susceptibles'}

In [5]:
import pandas as pd

frame = pd.DataFrame([{k:d[k] for k in {'annotation', 'ann_type'}} for d in data])
frame.drop_duplicates(inplace=True)
frame

Unnamed: 0,ann_type,annotation
0,var desc,susceptibles (S)
5,var desc,dead (D)
10,var desc,"hospitalized and ICU admitted (H),"
15,var desc,recovered (R)
20,var desc,"confirmed infections (I),"
...,...,...
70495,var val,the effective daily reproduction ratio has fal...
71003,var val,effective daily reproduction ratio has already...
141744,var val,"R0, but still higher than 1"
167420,var val,(cid:31)R ¼ 0


In [33]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")



In [29]:
tokenized = tokenizer(frame['annotation'].to_list(), add_special_tokens=False)['input_ids']
tokenized

[[18002, 2015, 1006, 1055, 1007],
 [2757, 1006, 1040, 1007],
 [24735, 1998, 24582, 2226, 4914, 1006, 1044, 1007, 1010],
 [6757, 1006, 1054, 1007],
 [4484, 15245, 1006, 1045, 1007, 1010],
 [2177,
  1015,
  1024,
  5721,
  2007,
  13586,
  2007,
  2062,
  2084,
  2753,
  1010,
  2199,
  5467,
  2566,
  2095],
 [2177, 1016, 1024, 5721, 4193, 1996, 5721, 2007, 13586],
 [2177, 1017, 1024, 5721, 2007, 2364, 10292, 5153, 1996, 2221],
 [2177,
  1018,
  1024,
  5721,
  2025,
  4193,
  5721,
  2007,
  13586,
  2030,
  2108,
  4625,
  2011,
  2364,
  10292],
 [2322, 1010, 6146, 2509, 4484, 3572],
 [1018,
  1010,
  27003,
  1006,
  5345,
  1003,
  13675,
  2140,
  1017,
  1010,
  3770,
  2581,
  1516,
  1019,
  1010,
  6109,
  2581,
  1007,
  23260,
  2902,
  22318],
 [1015,
  1010,
  4029,
  2475,
  1006,
  5345,
  1003,
  18856,
  2099,
  1015,
  1010,
  15376,
  1516,
  1015,
  1010,
  6445,
  2629,
  1007,
  23260,
  24582,
  2226,
  20247],
 [1015,
  1010,
  4720,
  2683,
  1006,
  5345,
  10

In [30]:
tokenized = [tokenizer.convert_ids_to_tokens(ids) for ids in tokenized]
tokenized

[['susceptible', '##s', '(', 's', ')'],
 ['dead', '(', 'd', ')'],
 ['hospitalized', 'and', 'ic', '##u', 'admitted', '(', 'h', ')', ','],
 ['recovered', '(', 'r', ')'],
 ['confirmed', 'infections', '(', 'i', ')', ','],
 ['group',
  '1',
  ':',
  'counties',
  'with',
  'airports',
  'with',
  'more',
  'than',
  '50',
  ',',
  '000',
  'passengers',
  'per',
  'year'],
 ['group',
  '2',
  ':',
  'counties',
  'surrounding',
  'the',
  'counties',
  'with',
  'airports'],
 ['group',
  '3',
  ':',
  'counties',
  'with',
  'main',
  'highways',
  'crossing',
  'the',
  'county'],
 ['group',
  '4',
  ':',
  'counties',
  'not',
  'surrounding',
  'counties',
  'with',
  'airports',
  'or',
  'being',
  'crossed',
  'by',
  'main',
  'highways'],
 ['20', ',', '76', '##3', 'confirmed', 'cases'],
 ['4',
  ',',
  '318',
  '(',
  '95',
  '%',
  'cr',
  '##l',
  '3',
  ',',
  '80',
  '##7',
  '–',
  '5',
  ',',
  '93',
  '##7',
  ')',
  'cumulative',
  'hospital',
  '##izations'],
 ['1',
  ',',


In [31]:
frame['tokens'] = tokenized
frame['num_tokens'] = frame.tokens.apply(len)
frame

Unnamed: 0,ann_type,annotation,tokens,num_tokens
0,var desc,susceptibles (S),"[susceptible, ##s, (, s, )]",5
5,var desc,dead (D),"[dead, (, d, )]",4
10,var desc,"hospitalized and ICU admitted (H),","[hospitalized, and, ic, ##u, admitted, (, h, )...",9
15,var desc,recovered (R),"[recovered, (, r, )]",4
20,var desc,"confirmed infections (I),","[confirmed, infections, (, i, ), ,]",6
...,...,...,...,...
70495,var val,the effective daily reproduction ratio has fal...,"[the, effective, daily, reproduction, ratio, h...",9
71003,var val,effective daily reproduction ratio has already...,"[effective, daily, reproduction, ratio, has, a...",9
141744,var val,"R0, but still higher than 1","[r, ##0, ,, but, still, higher, than, 1]",8
167420,var val,(cid:31)R ¼ 0,"[(, cid, :, 31, ), r, ¼, 0]",8


In [32]:
frame.groupby('ann_type')['num_tokens'].mean()

ann_type
var desc    9.712012
var val     8.732653
Name: num_tokens, dtype: float64