In [1]:
import pickle

import pandas as pd
import spacy
import torch
from gliner import GLiNER
from tqdm.notebook import tqdm

In [2]:
# Pick the inference device at runtime instead of hard-coding Apple's "mps"
# backend, so the notebook also runs on CUDA and CPU-only machines.
if torch.backends.mps.is_available():
    gliner_device = "mps"
elif torch.cuda.is_available():
    gliner_device = "cuda"
else:
    gliner_device = "cpu"

# Settings handed to the "gliner_spacy" pipeline component below.
custom_spacy_config = {
    "gliner_model": "urchade/gliner_small-v2.1",
    "chunk_size": 250,
    "labels": ["people", "company", "organization", "location", "date", "money"],
    # NOTE(review): the gliner-spacy README only mentions "ent" and "span" for
    # this option -- confirm that "dep" is intended and behaves as expected.
    "style": "dep",
    "threshold": 0.5,
    "map_location": gliner_device,
}

# Blank English pipeline whose only component is the GLiNER entity recogniser.
nlp = spacy.blank("en")
nlp.add_pipe("gliner_spacy", config=custom_spacy_config)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



<gliner_spacy.pipeline.GlinerSpacy at 0x11a734560>

In [3]:
# Load the exported cases and normalise the headers: lower-case, with spaces
# replaced by underscores.
df = pd.read_csv('Global-Cases-Export-2024-09-25.csv').rename(
    columns=lambda name: name.lower().replace(' ', '_')
)

# "jurisdictions" is a '>'-separated path such as
# 'Australia>New South Wales>Land and Environment Court'. Split it into at most
# three parts (country, region, type); rows with fewer parts are left with
# missing values in the trailing columns.
jurisdiction_parts = df['jurisdictions'].str.split('>', n=2, expand=True)
df[['jurisdictions_country', 'jurisdictions_region', 'jurisdictions_type']] = jurisdiction_parts
df.head()

Unnamed: 0,case_name,id,permalink,summary,case_categories,jurisdictions,principal_laws,status,reporter_info,filing_year_for_action,core_object,jurisdictions_country,jurisdictions_region,jurisdictions_type
0,Aldous v. Greater Taree City Council and Another,2204,https://climatecasechart.com/non-us-case/aldou...,An Australian court upheld approval of a devel...,Suits against governments>Environmental assess...,Australia>New South Wales>Land and Environment...,Australia,,Land and Environment Court of New South Wales ...,2009.0,,Australia,New South Wales,Land and Environment Court
1,Ground Crew at Turramurra v. Ku-ring-gai Council,2504,https://climatecasechart.com/non-us-case/groun...,Applicant appealed the denial of a permit for ...,Suits against governments>Environmental assess...,Australia>New South Wales>Land and Environment...,Australia>Environmental Planning and Assessmen...,Appeal dismissed,[2008] NSWLEC 86 (Australia),2008.0,Appeal of denial of a planning permit due to r...,Australia,New South Wales,Land and Environment Court
2,Haughton v. Minister for Department of Plannin...,2505,https://climatecasechart.com/non-us-case/haugh...,Ned Haughton challenged the approval granted b...,Suits against governments>Environmental assess...,Australia>New South Wales>Land and Environment...,Australia>Precautionary Principle|Australia>Pr...,Application dismissed,[2011] NSWLEC 217 (Australia),2011.0,Challenge to government approval of two coal-f...,Australia,New South Wales,Land and Environment Court
3,Able Lott Holdings Pty. Ltd. v. City of Fremantle,2508,https://climatecasechart.com/non-us-case/able-...,This case concerned a development application ...,Suits against governments>Environmental assess...,Australia>Western Australia>State Administrati...,Australia>State Coastal Planning Policy (Weste...,Application dismissed,[2010] WASAT 117 (Australia),2010.0,Challenge to approve development of site despi...,Australia,Western Australia,State Administrative Tribunal
4,Aldous v. Greater Taree City Council and Another,2510,https://climatecasechart.com/non-us-case/aldou...,An Australian court upheld approval of a deve...,,Australia>New South Wales>Land and Environment...,Australia,Application dismissed,[ 2009 ] NCWELC 17 (Australia),2009.0,Challenge to city council decision granting de...,Australia,New South Wales,Land and Environment Court


In [29]:
def split_text_to_chunks(text, chunk_size=250):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.

    The split is purely positional: it does not look for word or sentence
    boundaries, so a word may be divided between two neighbouring chunks.

    Args:
        text: The string to split. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as "no text".
        chunk_size: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of chunks in their original order. Every chunk except possibly
        the last has exactly ``chunk_size`` characters. Empty or missing text
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    # `text != text` is only true for NaN; catches missing values read from a CSV.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]


# Build one node per 250-character chunk of every case summary.
# Each node records a running chunk id ("id", starting at 1), the id of the
# case it came from ("doc_id") and the chunk text itself ("text").
chunk_nodes = []
chunk_id = 1
for index, row in df.iterrows():
    summary = row["summary"]
    # pandas reads empty CSV cells as NaN (a float); such rows have nothing to
    # chunk and would otherwise make the splitter fail on len().
    if not isinstance(summary, str):
        continue
    doc_id = row["id"]
    for chunk in split_text_to_chunks(summary, 250):
        chunk_nodes.append({"id": chunk_id, "doc_id": doc_id, "text": chunk})
        chunk_id += 1


def get_entities(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and list its entities.

    Args:
        text: The string handed to ``nlp``.

    Returns:
        A list with one dict per item in ``doc.ents``, in the same order, with
        the keys ``"text"``, ``"start_char"``, ``"end_char"`` and ``"label"``
        copied from the entity's ``text``, ``start_char``, ``end_char`` and
        ``label_`` attributes.
    """
    return [
        {
            "text": span.text,
            "start_char": span.start_char,
            "end_char": span.end_char,
            "label": span.label_,
        }
        for span in nlp(text).ents
    ]


# Extract entities from every chunk and store one node per entity.
# "start_char" / "end_char" are the values returned by get_entities for the
# chunk text, i.e. offsets within that chunk rather than within the full summary.
entity_nodes = []
entity_id = 1
for chunk in tqdm(chunk_nodes):
    entities = get_entities(chunk["text"])
    for entity in entities:
        entity_nodes.append(
            {
                "id": entity_id,
                "chunk_id": chunk["id"],
                "text": entity["text"],
                "start_char": entity["start_char"],
                "end_char": entity["end_char"],
                "label": entity["label"],
            }
        )
        # Advance the counter so every entity node gets its own id
        # (previously it stayed at 1 for all of them).
        entity_id += 1

  0%|          | 0/8694 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Save the chunk and entity nodes to pickle files.
# The `with` blocks make sure each file is flushed and closed once written.
with open("chunk_nodes.pkl", "wb") as chunk_file:
    pickle.dump(chunk_nodes, chunk_file)
with open("entity_nodes.pkl", "wb") as entity_file:
    pickle.dump(entity_nodes, entity_file)