# Import Required Libraries
Import the necessary libraries, including spaCy, pandas, and scikit-learn.

In [1]:
# Import Required Libraries

import spacy  # spaCy for NLP tasks
import pandas as pd  # pandas for data manipulation
from sklearn.model_selection import train_test_split  # scikit-learn for splitting data into train and test sets

# Load Dataset
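Load a dataset containing text data with named entity annotations into a DataFrame named `df`. The later cells expect a `Text` column plus `Start`, `End`, and `Label` columns that each hold one list per row, giving the character offsets and label of every entity in that text.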

# Data Preprocessing
Preprocess the text data, including tokenization and annotation formatting.

In [None]:
# Data Preprocessing

# Tokenization and annotation formatting
def preprocess_data(df, nlp=None):
    """Add token lists and spaCy-style entity annotations to ``df``.

    Args:
        df: DataFrame with a ``Text`` column of strings and ``Start``,
            ``End`` and ``Label`` columns, each holding one sequence per row.
            The three sequences are walked in lockstep to build that row's
            ``(start, end, label)`` entity tuples.
        nlp: Callable that turns a text into an iterable of tokens exposing
            a ``.text`` attribute. Defaults to a blank English spaCy pipeline.

    Returns:
        The same DataFrame (modified in place) with two added columns:
        ``Tokens`` (list of token strings per row) and ``Annotations``
        (``{'entities': [(start, end, label), ...]}`` per row).
    """
    if nlp is None:
        # Do not depend on a module-level ``nlp`` existing; a blank pipeline
        # is sufficient because only tokenization is needed here.
        nlp = spacy.blank("en")

    # Tokenize the text data
    df['Tokens'] = [[token.text for token in nlp(text)] for text in df['Text']]

    # Format annotations: zip stops at the shortest of the three sequences,
    # exactly as a row-by-row loop over the same columns would.
    df['Annotations'] = [
        {'entities': list(zip(starts, ends, labels))}
        for starts, ends, labels in zip(df['Start'], df['End'], df['Label'])
    ]
    return df

# Run the preprocessing step; `df` gains the 'Tokens' and 'Annotations' columns
df = df.pipe(preprocess_data)

# Preview the leading rows of the preprocessed DataFrame
df.head(5)

# Train-Test Split
Split the dataset into training and testing sets.

In [None]:
# Train-Test Split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Display the number of samples in the training and testing sets
print(f"Number of training samples: {len(train_df)}")
print(f"Number of testing samples: {len(test_df)}")

# Print both previews explicitly: a notebook only renders a bare expression
# when it is the last statement of the cell, so a bare `train_df.head()`
# placed mid-cell would show nothing.
print("First rows of the training set:")
print(train_df.head())

print("First rows of the testing set:")
print(test_df.head())

# Build NER Model
Build and train a Named Entity Recognition model using spaCy or another NLP library.

In [None]:
# Build NER Model

import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding

# Create a blank English pipeline and register a new NER component in it.
# add_pipe returns the component instance that actually lives in the pipeline,
# so labels must be added to that object; a component built separately with
# create_pipe is never trained or saved.
import random

ner_model = spacy.blank("en")
ner = ner_model.add_pipe("ner")

# Register every entity label that occurs in the training annotations
for annotations in train_df['Annotations']:
    for _, _, label in annotations['entities']:
        ner.add_label(label)

# Convert the training data to spaCy's Example format
train_data = [
    Example.from_dict(ner_model.make_doc(text), {"entities": annotations['entities']})
    for text, annotations in zip(train_df['Text'], train_df['Annotations'])
]

# Initialize the model weights from the training examples and get the
# optimizer that is passed to every update below.
optimizer = ner_model.initialize(lambda: train_data)

# Train the NER model
for i in range(10):  # Number of training iterations
    # Reshuffle each epoch so batches are not always composed of the same examples
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        ner_model.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"Iteration {i+1}, Losses: {losses}")

# Save the trained model to disk
ner_model.to_disk("ner_model")

# Reload the pipeline from the directory it was saved to
trained_ner_model = spacy.load("ner_model")

# Run the reloaded model on the first test text and list each entity it finds
sample_text = test_df['Text'].iloc[0]
doc = trained_ner_model(sample_text)
for span in doc.ents:
    print(span.text, span.start_char, span.end_char, span.label_)

# Evaluate Model
Evaluate the performance of the NER model using appropriate metrics such as precision, recall, and F1-score.

In [None]:
# Evaluate Model

from sklearn.metrics import classification_report

# Function to convert spaCy doc to a list of tuples (start, end, label)
def get_entities(doc):
    """Return the entities in ``doc.ents`` as ``(start_char, end_char, label_)`` tuples."""
    spans = []
    for span in doc.ents:
        spans.append((span.start_char, span.end_char, span.label_))
    return spans

# Evaluate the model on the test set
from collections import Counter

true_entities = []
pred_entities = []

for text, annotations in zip(test_df['Text'], test_df['Annotations']):
    doc = trained_ner_model(text)
    true_entities.append(annotations['entities'])
    pred_entities.append(get_entities(doc))

# Gold and predicted entity lists usually differ in length and order, so they
# cannot be compared position by position. Score them per document instead:
# a prediction is correct only when its (start, end, label) matches a gold
# entity exactly.
true_pos, false_pos, false_neg = Counter(), Counter(), Counter()
for gold, pred in zip(true_entities, pred_entities):
    gold_set = {tuple(ent) for ent in gold}
    pred_set = {tuple(ent) for ent in pred}
    for _, _, label in gold_set & pred_set:
        true_pos[label] += 1
    for _, _, label in pred_set - gold_set:
        false_pos[label] += 1
    for _, _, label in gold_set - pred_set:
        false_neg[label] += 1


def _prf(tp, fp, fn):
    """Return (precision, recall, f1), using 0.0 wherever a ratio is undefined."""
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1


# One row per label seen in either gold or predictions, plus a micro average
labels = sorted(set(true_pos) | set(false_pos) | set(false_neg))
rows = [(label, true_pos[label], false_pos[label], false_neg[label]) for label in labels]
rows.append((
    "micro avg",
    sum(true_pos.values()),
    sum(false_pos.values()),
    sum(false_neg.values()),
))

# Generate the report; "support" is the number of gold entities (tp + fn)
lines = [f"{'':>12} {'precision':>9} {'recall':>9} {'f1-score':>9} {'support':>9}"]
for name, tp, fp, fn in rows:
    p, r, f = _prf(tp, fp, fn)
    lines.append(f"{name:>12} {p:>9.2f} {r:>9.2f} {f:>9.2f} {tp + fn:>9}")
report = "\n".join(lines)
print(report)

# Visualize Results
Visualize the results of the NER model on sample text data.

In [None]:
# Visualize Results

import matplotlib.pyplot as plt
from spacy import displacy

# Function to visualize named entities in text
def visualize_ner(text, model):
    """Run ``model`` on ``text`` and render the result with displaCy's ``ent`` style.

    ``jupyter=True`` is passed to ``displacy.render``, so this is intended to
    be called from a notebook.
    """
    parsed = model(text)
    displacy.render(parsed, jupyter=True, style="ent")

# Take the first five test texts and render the model's entities for each one
sample_texts = test_df['Text'].iloc[:5].tolist()

for sample in sample_texts:
    visualize_ner(sample, trained_ner_model)