# Assignment 4

In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from spacy.tokens import Doc
nlp_core = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
import json
from datasets import load_dataset

In [2]:
dataset = load_dataset("imvladikon/english_news_weak_ner", "entities")
print(dataset)

README.md:   0%|          | 0.00/39.4k [00:00<?, ?B/s]

(…)-00000-of-00008-9031817eeadfc5f6.parquet:   0%|          | 0.00/127M [00:00<?, ?B/s]

(…)-00001-of-00008-af757022b3465153.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

(…)-00002-of-00008-1c04f721edf1ff32.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00003-of-00008-df1f9a25e858e6b6.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00004-of-00008-cf4cd100553d61c0.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

(…)-00005-of-00008-d5b0ed5913b0d71b.parquet:   0%|          | 0.00/121M [00:00<?, ?B/s]

(…)-00006-of-00008-ed0ad4a0c990443d.parquet:   0%|          | 0.00/114M [00:00<?, ?B/s]

(…)-00007-of-00008-dac1325602e80645.parquet:   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3515149 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'sent_num', 'sentence', 'doc_title', 'score', 'entity_type', 'entity_text', 'start_char', 'end_char', 'tokens', 'raw_tags', 'ner_tags'],
        num_rows: 3515149
    })
})


In [3]:
# Retrieve the entity label names from the dataset
labels = dataset["train"].features["ner_tags"].feature.names

# Function to convert numerical NER tags into human-readable labels
def decode_ner_tags(example):
    example["ner_labels"] = [labels[tag] for tag in example["ner_tags"]]
    return example

# Apply the function to all data splits
dataset = dataset.map(decode_ner_tags)

Map:   0%|          | 0/3515149 [00:00<?, ? examples/s]

In [7]:
print("Entity Types in the Dataset:", labels)


Entity Types in the Dataset: ['B-DATE', 'I-DATE', 'L-DATE', 'U-DATE', 'B-DUC', 'I-DUC', 'L-DUC', 'U-DUC', 'B-EVE', 'I-EVE', 'L-EVE', 'U-EVE', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC', 'B-MISC', 'I-MISC', 'L-MISC', 'U-MISC', 'B-ORG', 'I-ORG', 'L-ORG', 'U-ORG', 'B-PER', 'I-PER', 'L-PER', 'U-PER', 'B-QTY', 'I-QTY', 'L-QTY', 'U-QTY', 'B-TTL', 'I-TTL', 'L-TTL', 'U-TTL', 'O']


In [4]:
train_dataset = dataset["train"]
df = train_dataset.to_pandas()

In [5]:
# Count unique document IDs
num_documents = df["doc_id"].nunique()
print(f"Number of unique documents: {num_documents}")

Number of unique documents: 309897


In [6]:
import spacy
from spacy.tokens import Doc
from spacy import displacy

# Load a blank spaCy model
nlp = spacy.blank("en")

# Function to visualize Named Entities
def visualize_ner(example):
    tokens = example["tokens"]
    labels = example["ner_labels"]
    start_chars = example["start_char"]
    end_chars = example["end_char"]

    # Convert entity positions into spaCy's expected format
    ents = []
    for start, end, label in zip(start_chars, end_chars, labels):
        if label != "O":  # Ignore "O" (Outside) labels
            ents.append(nlp.make_doc(example["sentence"]).char_span(start, end, label=label))

    # Create a spaCy Doc object
    doc = nlp.make_doc(example["sentence"])

    # Assign named entities (filter out None values)
    doc.ents = [ent for ent in ents if ent is not None]

    # Render visualization in Jupyter Notebook
    displacy.render(doc, style="ent", jupyter=True)

# Display the first example with highlighted named entities
visualize_ner(dataset["train"][1])

In [10]:
# Function to apply the pretrained model and display structured results
def evaluate_ner_predictions(example):
    text = example["sentence"]
    doc = nlp_core(text)  # Use pretrained model
    predictions = [(ent.text, ent.label_) for ent in doc.ents]  # Extract model-predicted entities

    # Compare with dataset labels
    expected_entities = list(zip(example["tokens"], example["ner_labels"]))

    # Convert to DataFrame for better readability
    df_results = pd.DataFrame(predictions, columns=["Text Span", "Model Prediction"])
    df_expected = pd.DataFrame(expected_entities, columns=["Text Span", "Expected Label"])

    # Merge for side-by-side comparison
    results_df = df_results.merge(df_expected, on="Text Span", how="outer").fillna("N/A")

    print(results_df)

# Test on a sample
evaluate_ner_predictions(dataset["train"][1])

             Text Span Model Prediction Expected Label
0                   \n              N/A          I-QTY
1                   \n              N/A              O
2                    $              N/A          B-QTY
3        $1.6\nbillion            MONEY            N/A
4                    (              N/A              O
5                    )              N/A              O
6                    ,              N/A              O
7                    .              N/A              O
8                 1.08              N/A          B-QTY
9   1.08 billion euros            MONEY            N/A
10                1.24              N/A          B-QTY
11        1.24 billion            MONEY            N/A
12                 1.6              N/A          I-QTY
13           Bloomberg              GPE          U-ORG
14                 Net              N/A              O
15                   a              N/A         B-DATE
16                   a              N/A              O
17      a 

In [16]:
# Function to process text and extract entities
def get_entities_from_text(text):
    # Process the text with spaCy model
    doc = nlp(text)
    # Return a list of tuples with entity text and label
    return [(ent.text, ent.label_) for ent in doc.ents]

# Manually annotated sentences
manual_annotations = [
    ("2011-05-04T07:24:07Z -- http://www.bloomberg.com/news/2011-05-04/bbva-is-said-to-announce-falling-profit-on-thursday.html", []),
    ("Net income probably declined to 1.08 billion euros ($1.6 billion) from 1.24 billion euros a year earlier, according to the average estimate in a Bloomberg survey of six analysts.", [("Net income", "MONEY"), ("euros", "MONEY"), ("Bloomberg", "ORG"), ("six analysts", "ORG")]),
    ("The Bilbao, Spain-based lender is scheduled to report first-quarter results tomorrow before the stock market opens in Spain.", [("Bilbao", "GPE"), ("Spain", "GPE")]),
    ("Spanish banks have been hurt by weakening credit demand and souring loans in a domestic economy that’s still struggling to emerge from recession.", [("Spanish banks", "ORG")]),
    ("BBVA, led by Chairman Francisco Gonzalez, completed its $5.8 billion purchase of a 24.9 percent stake in Turkiye Garanti Bankasi AS (GARAN) in March as it expands outside Spain, adding Turkey to other emerging markets such as Mexico.", [("BBVA", "ORG"), ("Francisco Gonzalez", "PERSON"), ("$5.8 billion", "MONEY"), ("Turkiye Garanti Bankasi AS (GARAN)", "ORG"), ("March", "TIME"), ("Spain", "GPE"), ("Turkey", "GPE"), ("Mexico", "GPE")]),
    ("“The news from Spain won’t be good, but there probably won’t be nasty surprises while Mexico is showing interesting growth,” said Pablo Garcia, head of equities at Oddo Sociedad de Valores in Madrid.", [("Spain", "GPE"), ("Mexico", "GPE"), ("Pablo Garcia", "PERSON"), ("Oddo Sociedad de Valores", "ORG"), ("Madrid", "GPE")]),
    ("“All the big Spanish companies, and that includes BBVA, are at pains now to show that Spain is a diminishing part of their business.” BBVA shares have gained 14 percent this year, compared with a 2.9 percent advance in the 48-member Bloomberg Europe Banks and Financial Services Index.", [("Spanish companies", "ORG"), ("BBVA", "ORG"), ("Spain", "GPE"), ("BBVA", "ORG"), ("Bloomberg", "ORG"), ("Europe", "GPE")]),
    ("Banco Santander SA (SAN), Spain’s largest bank, which on April 28 reported a 5 percent drop in first-quarter profit, has risen 5.3 percent in 2011.", [("Banco Santander SA (SAN)", "ORG"), ("Spain", "GPE"), ("April 28", "TIME"), ("2011", "TIME")]),
    ("Mexican Gains Profit from BBVA’s Spain-dominated Iberian business may have dropped 33 percent from a year earlier to 394 million euros, according to estimates by Banco BPI SA (BPI) analyst Carlos Joaquim Peixoto.", [("BBVA", "ORG"), ("Spain", "GPE"), ("Iberian business", "ORG"), ("Banco BPI SA (BPI)", "ORG"), ("Carlos Joaquim Peixoto", "PERSON")]),
    ("Spain and Portugal together accounted for 45 percent of group profit in 2010.", [("Spain", "GPE"), ("Portugal", "GPE"), ("2010", "TIME")])
]

# Retrieve 10 random samples from the dataset
sample_data = dataset["train"].select(range(10))

# Store annotations and predictions
annotations = []

for i, example in enumerate(sample_data):
    text = example["sentence"]
    
    # Get the manual annotations for the text
    manual_annotation = manual_annotations[i][1]
    
    # Get the model's predictions
    model_prediction = get_entities_from_text(text)
    
    # Store both manual annotations and model predictions
    annotations.append({
        "text": text,
        "manual_label": manual_annotation,
        "model_prediction": model_prediction
    })

# Convert to DataFrame for easy viewing
df_manual = pd.DataFrame(annotations)
print(df_manual)

                                                text  \
0  -- \n2011-05-04T07:24:07Z\n\n-- http://www.blo...   
1  Net income probably declined to 1.08 billion e...   
2  The\nBilbao, Spain-based lender is scheduled t...   
3  Spanish banks have been hurt by weakening cred...   
4  BBVA, led by Chairman  Francisco Gonzalez ,\nc...   
5  “The news from Spain won’t be good, but there ...   
6  “All the big Spanish companies, and that\nincl...   
7  Banco Santander SA (SAN) , Spain’s\nlargest ba...   
8  Mexican Gains  Profit from BBVA’s Spain-domina...   
9  Spain and Portugal together accounted for 45\n...   

                                        manual_label  \
0                                                 []   
1  [(Net income, MONEY), (euros, MONEY), (Bloombe...   
2                      [(Bilbao, GPE), (Spain, GPE)]   
3                             [(Spanish banks, ORG)]   
4  [(BBVA, ORG), (Francisco Gonzalez, PERSON), ($...   
5  [(Spain, GPE), (Mexico, GPE), (Pablo Garcia,