In [50]:
#import evaluate
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline
import numpy as np
import evaluate
import torch
import mlflow

In [51]:

# Ensure MLflow directory exists
mlruns_dir = '/Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/mlruns'
if not os.path.exists(mlruns_dir):
    os.makedirs(mlruns_dir)

mlruns_trash_dir = '/Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/mlruns/.trash'
if not os.path.exists(mlruns_trash_dir):
    os.makedirs(mlruns_trash_dir)

mlflow.set_tracking_uri('file:///Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/mlruns')

#experiment_id = mlflow.create_experiment('NER_Casestudy_Experiment')

# Create or get the experiment
experiment_name = "NER_Casestudy_Experiment2"
mlflow.set_experiment(experiment_name)

#mlflow.transformers.autolog()

<Experiment: artifact_location='file:///Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/mlruns/871896506229020652', creation_time=1719024964319, experiment_id='871896506229020652', last_update_time=1719024964319, lifecycle_stage='active', name='NER_Casestudy_Experiment2', tags={}>

In [52]:

# Load the dataset with a specified encoding
file_path = '/Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/data/ner_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path, encoding='ISO-8859-1')


data=data.head(1000)

In [53]:

# Drop rows with NaN values
data = data.dropna(subset=['Word','POS','Tag'])


In [54]:

# Group the data by sentences
data['Sentence #'] = data['Sentence #'].ffill()  # Fill forward to propagate sentence IDs
sentences = data.groupby('Sentence #').apply(lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                                                      s['POS'].values.tolist(),
                                                                                      s['Tag'].values.tolist())])
# Convert the groupby object to a list of sentences
sentences = [s for s in sentences]

# Split the dataset into training, validation, and test sets (20% for test)
train_sentences, test_sentences = train_test_split(sentences, test_size=0.20, random_state=42)
train_sentences, val_sentences = train_test_split(train_sentences, test_size=0.25, random_state=42)  # 0.25 * 0.80 = 0.20


In [85]:
train_sentences[0][0:5]

[('Poor', 'JJ', 'O'),
 ('residents', 'NNS', 'O'),
 ('often', 'RB', 'O'),
 ('complain', 'VBP', 'O'),
 ('they', 'PRP', 'O')]

In [55]:

# Convert to Hugging Face Datasets format
def convert_to_dict(sentences):
    words = [[word for word, pos, tag in sentence] for sentence in sentences]
    pos_tags = [[pos for word, pos, tag in sentence] for sentence in sentences]
    ner_tags = [[tag for word, pos, tag in sentence] for sentence in sentences]
    return {"tokens": words, "pos_tags": pos_tags, "ner_tags": ner_tags}

train_data = convert_to_dict(train_sentences)
val_data = convert_to_dict(val_sentences)
test_data = convert_to_dict(test_sentences)

# Create a dataset dictionary
dataset_dict = DatasetDict({
    'train': Dataset.from_dict(train_data),
    'validation': Dataset.from_dict(val_data),
    'test': Dataset.from_dict(test_data)
})



In [86]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 25
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 9
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 9
    })
})

In [56]:
# Define unique tags
unique_tags = list(set(tag for doc in dataset_dict['train']['ner_tags'] for tag in doc))
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [87]:
#tag2id,id2tag

In [58]:
# Tokenizer
model_checkpoint = "dslim/bert-base-NER"
model_checkpoint="nlpso/m3_hierarchical_ner_ref_cmbert_iob2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



tokenizer_config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

In [59]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(tag2id[label[word_idx]] if True else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset_dict.map(tokenize_and_align_labels, batched=True)



Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [92]:
tokenized_inputs = tokenizer(dataset_dict['train']["tokens"], truncation=True, is_split_into_words=True)



In [95]:
tokenized_inputs[1]

Encoding(num_tokens=65, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [90]:
tokenized_datasets['train']

Dataset({
    features: ['tokens', 'pos_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 25
})

In [61]:
# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(unique_tags), ignore_mismatched_sizes=True)
model.classifier = torch.nn.Linear(model.classifier.in_features, len(unique_tags))
model.num_labels = len(unique_tags)

# Metrics
metric = evaluate.load("seqeval")



config.json:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at nlpso/m3_hierarchical_ner_ref_cmbert_iob2 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([29, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([29]) in the checkpoint and torch.Size([11]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



In [63]:

# Training arguments
args = TrainingArguments(
    output_dir='./results',
    #"test-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    use_mps_device=True,
    logging_dir='./logs',
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start MLflow run
with mlflow.start_run() as run:
    # Train the model
    trainer.train()





  0%|          | 0/2 [00:00<?, ?it/s]

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.3240535259246826, 'eval_precision': 0.023121387283236993, 'eval_recall': 0.0851063829787234, 'eval_f1': 0.03636363636363636, 'eval_accuracy': 0.18110236220472442, 'eval_runtime': 0.292, 'eval_samples_per_second': 30.826, 'eval_steps_per_second': 3.425, 'epoch': 1.0}
{'train_runtime': 3.5643, 'train_samples_per_second': 7.014, 'train_steps_per_second': 0.561, 'train_loss': 2.420711040496826, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [64]:


#     # Log metrics to MLflow
#     mlflow.log_metrics(results)

    


"""

    # Log the model artifact
    trainer.save_model(os.path.join("results", "model"))
    tokenizer.save_pretrained(os.path.join("results", "model"))

    mlflow.log_artifacts("results/model")


    # Log other artifacts if needed
    # For example, logging training args
    with open("results/training_args.bin", "wb") as f:
        torch.save(args, f)
    mlflow.log_artifact("results/training_args.bin")

"""


'\n\n    # Log the model artifact\n    trainer.save_model(os.path.join("results", "model"))\n    tokenizer.save_pretrained(os.path.join("results", "model"))\n\n    mlflow.log_artifacts("results/model")\n\n\n    # Log other artifacts if needed\n    # For example, logging training args\n    with open("results/training_args.bin", "wb") as f:\n        torch.save(args, f)\n    mlflow.log_artifact("results/training_args.bin")\n\n'

In [65]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Predict with the model
tuned_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple",device=device)
input_example = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge."
# ner_results = ner_pipeline(sample_text)
# print(ner_results)




# Infer the model signature, including a representative input, the expected output, and the parameters that we would like to be able to override at inference time.
signature = mlflow.models.infer_signature(
    ["This is a test!", "And this is also a test."],
    mlflow.transformers.generate_signature_output(
        tuned_pipeline, ["This is a test response!", "So is this."]
    ),
    #params=model_config,
)


In [66]:
#Log the pipeline to the existing training run
with mlflow.start_run(run_id=run.info.run_id):
    model_info = mlflow.transformers.log_model(
        transformers_model=tuned_pipeline,
        #artifact_path="fine_tuned",
        artifact_path="model",
        signature=signature,
        input_example=["Pass in a string", "And have it mark as spam or not."],
        #model_config=model_config,
    )

README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [67]:
# Load our saved model in the native transformers format
loaded = mlflow.transformers.load_model(model_uri=model_info.model_uri)

# Define a test example that we expect to be classified as spam
validation_text = (
    "Want to learn how to make MILLIONS with no effort? Click HERE now! See for yourself! Guaranteed to make you instantly rich! "
    "Don't miss out you could be a winner!"
)

validation_text=("Hello! there")

# validate the performance of our fine-tuning
loaded(validation_text)

2024/06/22 18:46:39 INFO mlflow.transformers: 'runs:/189c00d0f8db4c9c93061fc27564342d/model' resolved as 'file:///Users/lukishyadav/Desktop/engineering/case_studies/ner_casestudy/mlruns/871896506229020652/189c00d0f8db4c9c93061fc27564342d/artifacts/model'


[{'entity': 'LABEL_10',
  'score': 0.114152364,
  'index': 1,
  'word': '▁Hello',
  'start': 0,
  'end': 5},
 {'entity': 'LABEL_9',
  'score': 0.11880725,
  'index': 2,
  'word': '!',
  'start': 5,
  'end': 6},
 {'entity': 'LABEL_9',
  'score': 0.124284334,
  'index': 3,
  'word': '▁the',
  'start': 6,
  'end': 10},
 {'entity': 'LABEL_9',
  'score': 0.12509684,
  'index': 4,
  'word': 're',
  'start': 10,
  'end': 12}]

In [68]:
test_data['tokens']

[['Police',
  'said',
  'the',
  'British',
  'Home',
  'Office',
  'sought',
  'an',
  'investigation',
  'of',
  'Khayam',
  "'s",
  'behavior'],
 ['A',
  'spokesman',
  'for',
  'the',
  'Brotherhood',
  'said',
  'the',
  'arrests',
  'are',
  'an',
  'attempt',
  'to',
  'cut',
  'the',
  'Brotherhood',
  'off',
  'from',
  'its',
  'supporters',
  'and',
  'punishment',
  'for',
  'winning',
  'parliamentary',
  'seats',
  'in',
  'earlier',
  'elections',
  '.'],
 ['The',
  'Muslim',
  'Brotherhood',
  'has',
  'tripled',
  'its',
  'strength',
  'in',
  'parliament',
  'in',
  'recent',
  'elections',
  ',',
  'raising',
  'the',
  'party',
  "'s",
  'total',
  'to',
  '47',
  'seats',
  '.'],
 ['Bedfordshire',
  'police',
  'said',
  'Tuesday',
  'that',
  'Omar',
  'Khayam',
  'was',
  'arrested',
  'in',
  'Bedford',
  'for',
  'breaching',
  'the',
  'conditions',
  'of',
  'his',
  'parole',
  '.'],
 ['The',
  'opposition',
  'has',
  'denounced',
  'the',
  'measure',
  '

In [74]:
# Predict with the model
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple",device=device)
ner_pipeline = pipeline("ner", model=model_checkpoint, tokenizer=tokenizer, aggregation_strategy="simple",device=device)
sample_text = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge."
ner_results = ner_pipeline(sample_text)
print(ner_results)



[{'entity_group': 'b_PER+O', 'score': 0.9961592, 'word': 'Hugging', 'start': 0, 'end': 7}, {'entity_group': 'i_PER+O', 'score': 0.85518867, 'word': 'Face Inc. is', 'start': 7, 'end': 20}, {'entity_group': 'O+O', 'score': 0.42458165, 'word': 'a', 'start': 20, 'end': 22}, {'entity_group': 'i_DESC+O', 'score': 0.4448272, 'word': 'company based in New', 'start': 22, 'end': 43}, {'entity_group': 'i_SPAT+i_LOC', 'score': 0.6645448, 'word': 'York City', 'start': 43, 'end': 53}, {'entity_group': 'O+O', 'score': 0.8009725, 'word': '. Its', 'start': 53, 'end': 58}, {'entity_group': 'i_SPAT+i_LOC', 'score': 0.73955727, 'word': 'headquarters are in DUMBO, therefore very close to the Manhattan Bridge', 'start': 58, 'end': 130}, {'entity_group': 'O+O', 'score': 0.9961035, 'word': '.', 'start': 130, 'end': 131}]


In [69]:
# import requests
# import json
# host = 'localhost'
# port = '5000'
# url = f'http://{host}:{port}/invocations'
# headers = {'Content-Type': 'application/json',}
# #http_data = test_df.to_json(orient='split')
# #http_data={'inputs':[list(X_test[0])]}

# http_data={'signature_name':'a','inputs':test_data['tokens']}
# http_data={"inputs": ["Hello"]}

# r = requests.post(url=url, headers=headers, data=json.dumps(http_data))
# print(f'Predictions: {r.text}')

In [70]:
import requests

headers = {
    # Already added when you pass json=
    # 'Content-Type': 'application/json',
}

json_data = {
    'inputs': [
        'Hello',
    ],
}

response = requests.post('http://en.wikipedia.org/curl', headers=headers, json=json_data)
response = requests.post('http://127.0.0.1:5000/invocations', headers=headers, json=json_data)


In [71]:
response.text

'{"predictions": ["LABEL_2"]}'

In [72]:
json.dumps(http_data)

NameError: name 'json' is not defined

In [None]:
#Working Curl requests

curl http://127.0.0.1:5000/invocations -H "Content-Type:application/json"  --data '{"inputs": ["Hello"]}'

'{"inputs": "Hello! there"}'

In [None]:
# import docker
# client = docker.from_env()
# print(client.version())

In [None]:
# import mlflow
# import docker

# # Set Docker client configuration
# client = docker.DockerClient(base_url='unix://var/run/docker.sock')

# # Set the run ID
# run_id = "1d917a3a56d849c9bf2bf48afc287ba4"

# # Build the Docker image
# mlflow.models.build_docker(
#     model_uri=f"runs:/{run_id}/model",
#     name="mlflow_1",
#     enable_mlserver=True,
#     client=client
# )


In [None]:
# import mlflow

# # Set the run ID
# run_id = "1d917a3a56d849c9bf2bf48afc287ba4"

# # Build the Docker image
# mlflow.models.build_docker(
#     model_uri=f"runs:/{run_id}/model",
#     name="mlflow_1",
#     enable_mlserver=True,
# )


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

2024/06/22 16:59:04 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

2024/06/22 16:59:04 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
2024/06/22 16:59:04 INFO mlflow.models.docker_utils: Building docker image with name mlflow_1


DockerException: Error while fetching server API version: Not supported URL scheme http+docker