In [2]:
from gliner import GLiNER


def merge_entities(entities):
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity["label"] == current["label"] and (
            next_entity["start"] == current["end"] + 1
            or next_entity["start"] == current["end"]
        ):
            current["text"] = text[current["start"] : next_entity["end"]].strip()
            current["end"] = next_entity["end"]
        else:
            merged.append(current)
            current = next_entity
    # Append the last entity
    merged.append(current)
    return merged


# model = GLiNER.from_pretrained("numind/NuNerZero")
model = GLiNER.from_pretrained("numind/NuZero_token")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

zero_shot_performance_unzero_token.png:   0%|          | 0.00/43.1k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

NuZero_token_token_metrics.txt:   0%|          | 0.00/961 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.80G [00:00<?, ?B/s]



In [3]:
# NuZero requires labels to be lower-cased!
labels = ["location", "date", "person", "event", "company", "organization", "position"]
labels = [l.lower() for l in labels]

text = """Fiat has completed its buyout of Chrysler, making the U.S. business a wholly-owned subsidiary of the Italian
carmaker as it gears up to use their combined resources to turn around its loss-making operations in
Europe. The company announced on January 1 that it had struck a $4.35 billion deal - cheaper than analysts
had expected - to gain full control of Chrysler, ending more than a year of tense talks that had obstructed Chief Executive Sergio Marchionne's efforts to create the
world's seventh-largest auto maker."""

entities = model.predict_entities(text, labels, threshold=0.4)

entities = merge_entities(entities)

for entity in entities:
    print(entity["text"], "=>", entity["label"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fiat => organization
Chrysler => company
U.S. => location
Italian => location
Europe => location
January 1 => date
Chrysler => company
Chief Executive => position
Sergio Marchionne => person


In [5]:
import wikipedia
from tqdm import tqdm

In [6]:
page = wikipedia.page(title="Tom Hanks", auto_suggest=False)
page.content[:1000]

"Thomas Jeffrey Hanks (born July 9, 1956) is an American actor and filmmaker. Known for both his comedic and dramatic roles, he is one of the most popular and recognizable film stars worldwide, and is regarded as an American cultural icon. Hanks's films have grossed more than $4.9 billion in North America and more than $9.96 billion worldwide, making him the fourth-highest-grossing actor in North America. He has received numerous honors including the AFI Life Achievement Award in 2002, the Kennedy Center Honor in 2014, the Presidential Medal of Freedom and the French Legion of Honor both in 2016, as well as the Golden Globe Cecil B. DeMille Award in 2020.\nHanks made his breakthrough with leading roles in a series of comedy films that received positive media attention, such as Splash (1984), The Money Pit (1986), Big (1988) and A League of Their Own (1992). He won two consecutive Academy Awards for Best Actor for starring as a gay lawyer suffering from AIDS in Philadelphia (1993) and t

In [11]:
len(page.content)

37877

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separators=["\n\n", "\n"]
)

chunks = text_splitter.split_text(page.content)
len(chunks)

97

In [12]:
len(chunks[0])

662

In [13]:
labels = ["award", "location", "organization", "person", "movie"]

In [14]:
chunks_entities = []
entity_list = []
duplicates = set()
for text in tqdm(chunks):
    entities = model.predict_entities(text, labels, threshold=0.7)
    entities = merge_entities(entities)
    chunk_entities = set()
    for entity in entities:
        # print(entity["text"], "=>", entity["label"])
        chunk_entities.add(entity["text"])
        if entity["text"] in duplicates:
            continue
        duplicates.add(entity["text"])
        entity_list.append((entity["text"], "=>", entity["label"]))

    chunks_entities.append(list(chunk_entities))

100%|██████████| 97/97 [00:25<00:00,  3.81it/s]


In [15]:
chunks_entities[:2]

[['Thomas Jeffrey Hanks',
  'AFI Life Achievement Award',
  'Golden Globe Cecil B. DeMille Award',
  'North America',
  'French Legion of Honor',
  'Presidential Medal of Freedom',
  'Kennedy Center Honor'],
 ['Robert Zemeckis',
  'Steven Spielberg',
  'Big',
  'The Pacific',
  'Hanks',
  'The Post',
  'Masters of the Air',
  'Philadelphia',
  'Saving Private Ryan',
  'Forrest Gump',
  'Catch Me If You Can',
  'Band of Brothers',
  'The Money Pit',
  'Bridge of Spies',
  'Academy Awards',
  'Ron Howard',
  'The Terminal',
  'Nora Ephron',
  'Splash',
  'A League of Their Own']]

In [16]:
chunks[9]

'\nHaving grown up in the Bay Area, Hanks says that some of his first movie memories were seeing movies in the Alameda Theatre in Alameda, California. Hanks studied theater at Chabot College in Hayward, California, and transferred to California State University, Sacramento after two years. During a 2001 interview with sportscaster Bob Costas, Hanks was asked whether he would rather have an Oscar or a Heisman Trophy. He replied that he would rather win a Heisman by playing halfback for the California Golden Bears. He told New York magazine in 1986, "Acting classes looked like the best place for a guy who liked to make a lot of noise and be rather flamboyant. I spent a lot of time going to plays. I wouldn\'t take dates with me. I\'d just drive to a theater, buy myself a ticket, sit in the seat and read the program, and then get into the play completely. I spent a lot of time like that, seeing Brecht, Tennessee Williams, Ibsen, and all that."'

In [17]:
entity_list[:4]

[('Thomas Jeffrey Hanks', '=>', 'person'),
 ('North America', '=>', 'location'),
 ('AFI Life Achievement Award', '=>', 'award'),
 ('Kennedy Center Honor', '=>', 'award')]

In [18]:
locs = []
orgs = []
persons = []
awards = []
movies = []
for e in entity_list:
    s, p, o = e
    if o == "person":
        persons.append(s.lower())
    elif o == "organization":
        orgs.append(s.lower())
    elif o == "location":
        locs.append(s.lower())
    elif o == "award":
        awards.append(s.lower())
    elif o == "movie":
        movies.append(s.lower())

In [19]:
len(movies)

88

In [20]:
locs

['north america',
 'philadelphia',
 'broadway',
 'concord',
 'california',
 'red bluff',
 'oakland',
 'bay area',
 'alameda',
 'hayward',
 'cleveland',
 'ohio',
 'new york city',
 'los angeles',
 'us',
 'hollywood',
 'wall street',
 'moon',
 'france',
 'u.s.',
 'texas',
 'soviet union',
 'neighborhood',
 'studio 8h',
 'queensland',
 'australia',
 'new orleans',
 'greece',
 'ketchum',
 'idaho',
 'las vegas',
 'schöneck',
 'hesse',
 'germany',
 'united states',
 'kentucky',
 'mati',
 'athens',
 'white house',
 'new york',
 'rock and roll hall of fame',
 'pittsburgh',
 'worldwide',
 'asteroid 12818 tomhanks',
 'world',
 'london',
 'secaucus',
 'new jersey',
 'boston',
 'edina',
 'minnesota']