In [38]:
import json
import random

input_file = "../dataset/ca_test_data_final_OFFICIAL.jsonl"
output_file = "../experiments/v2/dataset/ca_test_data_final_OFFICIAL_with_fkgl.jsonl"

# Load the JSONL file: one JSON object per line.
# - The context manager closes the file handle deterministically.
# - An explicit UTF-8 encoding keeps decoding independent of the platform locale.
# - Whitespace-only lines are skipped so a stray blank line cannot crash json.loads.
with open(input_file, encoding="utf-8") as fp:
    dataset = [json.loads(line) for line in fp if line.strip()]

# Sanity check: show the summary of the first record.
print(dataset[0]['summary'])

Existing property tax law establishes a veterans’ organization exemption under which property is exempt from taxation if, among other things, that property is used exclusively for charitable purposes and is owned by a veterans’ organization. This bill would provide that the veterans’ organization exemption shall not be denied to a property on the basis that the property is used for fraternal, lodge, or social club purposes, and would make specific findings and declarations in that regard. The bill would also provide that the exemption shall not apply to any portion of a property that consists of a bar where alcoholic beverages are served. Section 2229 of the Revenue and Taxation Code requires the Legislature to reimburse local agencies annually for certain property tax revenues lost as a result of any exemption or classification of property for purposes of ad valorem property taxation. This bill would provide that, notwithstanding Section 2229 of the Revenue and Taxation Code, no appro

In [39]:
## Add CEFR labels using an existing CEFR classifier

from transformers import pipeline
from tqdm import tqdm

# Step 1: build the text-classification pipeline around the CEFR model.
# top_k=None asks the pipeline for a score per label rather than only the best one.
classifier = pipeline(
    task="text-classification",
    model="AbdullahBarayan/ModernBERT-base-doc_en-Cefr",
    device=0,
    top_k=None,
)

# Step 2: collect one input text (the summary) per dataset record.
texts = [record['summary'] for record in dataset]

# Step 3: classify everything in batches to keep memory usage bounded.
results = classifier(texts, batch_size=16)

# The pipeline must return exactly one result per record.
assert len(results) == len(dataset)

# Step 4: attach each classifier result to its record.
for record, cefr_scores in tqdm(zip(dataset, results), total=len(dataset)):
    record['cefr_labels'] = cefr_scores
    

In [40]:
## Add FKGL readability levels to the dataset
import textstat
from collections import Counter
from tqdm import tqdm

def extract_label(score):
    """Map a numeric readability score to a coarse difficulty label.

    Args:
        score: Numeric grade-level score.

    Returns:
        "beginner" for scores below 6, "intermediate" for scores from 6
        through 12 inclusive, and "advanced" for everything else.
    """
    if score < 6:
        return "beginner"
    # Past the first guard, only the upper bound still needs checking.
    if score <= 12:
        return "intermediate"
    return "advanced"

# Score every summary with the Flesch-Kincaid grade level and bucket the score
# into a coarse label via extract_label.
levels = [
    extract_label(textstat.flesch_kincaid_grade(instance['summary']))
    for instance in dataset
]

# Exactly one label per record. The check is made against the dataset itself
# so this cell neither needs a 'text' field nor rebinds the `texts` variable.
assert len(levels) == len(dataset)

# Show the label distribution as a quick sanity check.
print(Counter(levels))

# Attach each label to its record under the 'level' key.
for instance, level in zip(dataset, levels):
    instance['level'] = level
    


Counter({'advanced': 1225, 'intermediate': 12})


100%|██████████| 1237/1237 [00:00<00:00, 1767752.66it/s]


In [41]:
import os

# Create the destination directory when it is missing, so the write below
# does not fail with FileNotFoundError on a fresh checkout.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the augmented records back out as JSONL: one JSON object per line.
with open(output_file, "w", encoding="utf-8") as fp:
    for instance in dataset:
        fp.write(json.dumps(instance) + "\n")