In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline
import numpy as np
import evaluate
import torch

# Ensure MLflow directory exists
mlruns_dir = '/Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/mlruns'
if not os.path.exists(mlruns_dir):
    os.makedirs(mlruns_dir)

import mlflow

mlflow.set_tracking_uri('file:///Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/mlruns')

#experiment_id = mlflow.create_experiment('NER_Casestudy_Experiment')

# Create or get the experiment
experiment_name = "NER_Casestudy_Experiment2"
mlflow.set_experiment(experiment_name)


# Load the dataset with a specified encoding
file_path = '/Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/data/ner_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Drop rows with NaN values
data = data.dropna()

# Group the data by sentences
data['Sentence #'] = data['Sentence #'].ffill()  # Fill forward to propagate sentence IDs
sentences = data.groupby('Sentence #').apply(lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                                                      s['POS'].values.tolist(),
                                                                                      s['Tag'].values.tolist())])
# Convert the groupby object to a list of sentences
sentences = [s for s in sentences]

# Split the dataset into training, validation, and test sets (20% for test)
train_sentences, test_sentences = train_test_split(sentences, test_size=0.20, random_state=42)
train_sentences, val_sentences = train_test_split(train_sentences, test_size=0.25, random_state=42)  # 0.25 * 0.80 = 0.20

# Convert to Hugging Face Datasets format
def convert_to_dict(sentences):
    words = [[word for word, pos, tag in sentence] for sentence in sentences]
    pos_tags = [[pos for word, pos, tag in sentence] for sentence in sentences]
    ner_tags = [[tag for word, pos, tag in sentence] for sentence in sentences]
    return {"tokens": words, "pos_tags": pos_tags, "ner_tags": ner_tags}

train_data = convert_to_dict(train_sentences)
val_data = convert_to_dict(val_sentences)
test_data = convert_to_dict(test_sentences)

# Create a dataset dictionary
dataset_dict = DatasetDict({
    'train': Dataset.from_dict(train_data),
    'validation': Dataset.from_dict(val_data),
    'test': Dataset.from_dict(test_data)
})

# Define unique tags
unique_tags = list(set(tag for doc in dataset_dict['train']['ner_tags'] for tag in doc))
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# Tokenizer
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(tag2id[label[word_idx]] if True else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset_dict.map(tokenize_and_align_labels, batched=True)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(unique_tags), ignore_mismatched_sizes=True)
model.classifier = torch.nn.Linear(model.classifier.in_features, len(unique_tags))
model.num_labels = len(unique_tags)

# Metrics
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training arguments
args = TrainingArguments(
    "test-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    use_mps_device=True
)

# Trainer
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start MLflow run
with mlflow.start_run() as run:
    # Train the model
    trainer.train()

    # Evaluate the model
    results = trainer.evaluate(tokenized_datasets["test"])

    # Log metrics to MLflow
    mlflow.log_metrics(results)


2024/06/21 23:38:41 INFO mlflow.tracking.fluent: Experiment with name 'NER_Casestudy_Experiment2' does not exist. Creating a new experiment.




Map:   0%|          | 0/28769 [00:00<?, ? examples/s]

Map:   0%|          | 0/9590 [00:00<?, ? examples/s]

Map:   0%|          | 0/9590 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




  0%|          | 0/5397 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3846, 'learning_rate': 1.8147118769686866e-05, 'epoch': 0.28}
{'loss': 0.2865, 'learning_rate': 1.6294237539373727e-05, 'epoch': 0.56}
{'loss': 0.299, 'learning_rate': 1.4441356309060591e-05, 'epoch': 0.83}


  0%|          | 0/600 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.28697168827056885, 'eval_precision': 0.8121546961325967, 'eval_recall': 0.7859135285913529, 'eval_f1': 0.7988186650915534, 'eval_accuracy': 0.91721773205976, 'eval_runtime': 22.7655, 'eval_samples_per_second': 421.252, 'eval_steps_per_second': 26.356, 'epoch': 1.0}
{'loss': 0.2678, 'learning_rate': 1.2588475078747453e-05, 'epoch': 1.11}
{'loss': 0.2359, 'learning_rate': 1.0735593848434316e-05, 'epoch': 1.39}
{'loss': 0.2372, 'learning_rate': 8.882712618121179e-06, 'epoch': 1.67}
{'loss': 0.2322, 'learning_rate': 7.029831387808041e-06, 'epoch': 1.95}


  0%|          | 0/600 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.29561883211135864, 'eval_precision': 0.8181602655286866, 'eval_recall': 0.8021850302185031, 'eval_f1': 0.810093896713615, 'eval_accuracy': 0.9224426483794596, 'eval_runtime': 10.1567, 'eval_samples_per_second': 944.207, 'eval_steps_per_second': 59.074, 'epoch': 2.0}
{'loss': 0.1971, 'learning_rate': 5.176950157494905e-06, 'epoch': 2.22}
{'loss': 0.1837, 'learning_rate': 3.324068927181768e-06, 'epoch': 2.5}
{'loss': 0.1785, 'learning_rate': 1.4711876968686308e-06, 'epoch': 2.78}


  0%|          | 0/600 [00:00<?, ?it/s]

{'eval_loss': 0.2897579073905945, 'eval_precision': 0.8211227402473834, 'eval_recall': 0.802417480241748, 'eval_f1': 0.8116623559840113, 'eval_accuracy': 0.9233406808719079, 'eval_runtime': 10.4589, 'eval_samples_per_second': 916.926, 'eval_steps_per_second': 57.368, 'epoch': 3.0}
{'train_runtime': 724.6803, 'train_samples_per_second': 119.097, 'train_steps_per_second': 7.447, 'train_loss': 0.2458871781704535, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/600 [00:00<?, ?it/s]

[{'entity_group': 'MISC', 'score': 0.9437685, 'word': 'Hu', 'start': 0, 'end': 2}, {'entity_group': 'MISC', 'score': 0.93507373, 'word': '##gging', 'start': 2, 'end': 7}, {'entity_group': 'MISC', 'score': 0.90412134, 'word': 'Face', 'start': 8, 'end': 12}, {'entity_group': 'MISC', 'score': 0.968714, 'word': 'Inc', 'start': 13, 'end': 16}, {'entity_group': 'MISC', 'score': 0.9172592, 'word': '.', 'start': 16, 'end': 17}, {'entity_group': 'PER', 'score': 0.8958597, 'word': 'is a company based in', 'start': 18, 'end': 39}, {'entity_group': 'ORG', 'score': 0.7840915, 'word': 'New York City', 'start': 40, 'end': 53}, {'entity_group': 'PER', 'score': 0.86757696, 'word': '. Its headquarters are in', 'start': 53, 'end': 78}, {'entity_group': 'MISC', 'score': 0.4847457, 'word': 'D', 'start': 79, 'end': 80}, {'entity_group': 'MISC', 'score': 0.84472, 'word': '##UM', 'start': 80, 'end': 82}, {'entity_group': 'MISC', 'score': 0.5472175, 'word': '##BO', 'start': 82, 'end': 84}, {'entity_group': 'PE

In [5]:
# Predict with the model
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple",device=device)
sample_text = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge."
ner_results = ner_pipeline(sample_text)
print(ner_results)

[{'entity_group': 'MISC', 'score': 0.9437685, 'word': 'Hu', 'start': 0, 'end': 2}, {'entity_group': 'MISC', 'score': 0.93507373, 'word': '##gging', 'start': 2, 'end': 7}, {'entity_group': 'MISC', 'score': 0.90412134, 'word': 'Face', 'start': 8, 'end': 12}, {'entity_group': 'MISC', 'score': 0.968714, 'word': 'Inc', 'start': 13, 'end': 16}, {'entity_group': 'MISC', 'score': 0.9172592, 'word': '.', 'start': 16, 'end': 17}, {'entity_group': 'PER', 'score': 0.8958597, 'word': 'is a company based in', 'start': 18, 'end': 39}, {'entity_group': 'ORG', 'score': 0.7840915, 'word': 'New York City', 'start': 40, 'end': 53}, {'entity_group': 'PER', 'score': 0.86757696, 'word': '. Its headquarters are in', 'start': 53, 'end': 78}, {'entity_group': 'MISC', 'score': 0.4847457, 'word': 'D', 'start': 79, 'end': 80}, {'entity_group': 'MISC', 'score': 0.84472, 'word': '##UM', 'start': 80, 'end': 82}, {'entity_group': 'MISC', 'score': 0.5472175, 'word': '##BO', 'start': 82, 'end': 84}, {'entity_group': 'PE