<a href="https://colab.research.google.com/github/nainakader/notebooks/blob/master/BERT_TESSERACT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
import pandas as pd
from spacy.matcher import Matcher

# Load the SpaCy model
nlp = spacy.load('en_core_web_sm')

# Initialize the Matcher
matcher = Matcher(nlp.vocab)

# Define patterns for CIVIL NUMBER and DATE OF BIRTH
pattern_civil_number = [{'LOWER': 'civil'}, {'LOWER': 'number'}, {'IS_DIGIT': True}]
pattern_dob = [{'LOWER': 'date'}, {'LOWER': 'of'}, {'LOWER': 'birth'}, {'IS_DIGIT': True}, {'IS_PUNCT': True}, {'IS_DIGIT': True}, {'IS_PUNCT': True}, {'IS_DIGIT': True}]

# Add patterns to the matcher
matcher.add('CIVIL_NUMBER', [pattern_civil_number])
matcher.add('DOB', [pattern_dob])

# Sample text
sample_text = """
8? fit ft eke \
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE-GF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

# Function to extract information
def extract_info(text):
    doc = nlp(text)
    matches = matcher(doc)

    civil_number = 'Not Found'
    date_of_birth = 'Not Found'

    for match_id, start, end in matches:
        span = doc[start:end]
        if doc.vocab.strings[match_id] == 'CIVIL_NUMBER':
            civil_number = span[-1].text  # Last token is the number
        elif doc.vocab.strings[match_id] == 'DOB':
            date_of_birth = span[-3:].text_with_ws.strip()  # Last three tokens for date

    return civil_number, date_of_birth

# Generate dataset
data = []
for _ in range(20000):
    civil_number, date_of_birth = extract_info(sample_text)
    data.append({'Sample Text': sample_text, 'Civil Number': civil_number, 'Date of Birth': date_of_birth})

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('extracted_data.csv', index=False)

print("Dataset generated and saved to 'extracted_data.csv'.")


Dataset generated and saved to 'extracted_data.csv'.


In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.23.0-

In [1]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
# Install the required libraries
#!pip install transformers datasets

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load the dataset
file_path = './resident_card_data_samples_with_junk.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Function to label the text
def label_text(row):
    text = row['Sample Data']
    civil_number = str(row['Civil Number'])
    date_of_birth = str(row['Date of Birth'])

    civil_number_start = text.find(civil_number)
    date_of_birth_start = text.find(date_of_birth)

    labels = ['O'] * len(text)

    if civil_number_start != -1:
        for i in range(civil_number_start, civil_number_start + len(civil_number)):
            if i == civil_number_start:
                labels[i] = 'B-CIVIL_NUMBER'
            else:
                labels[i] = 'I-CIVIL_NUMBER'

    if date_of_birth_start != -1:
        for i in range(date_of_birth_start, date_of_birth_start + len(date_of_birth)):
            if i == date_of_birth_start:
                labels[i] = 'B-DATE_OF_BIRTH'
            else:
                labels[i] = 'I-DATE_OF_BIRTH'

    return labels

# Apply the labeling function
df['labels'] = df.apply(label_text, axis=1)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForTokenClassification.from_pretrained('bert-base-cased', num_labels=5)  # Adjust num_labels as per your requirement

# Define label mapping
label_list = ['O', 'B-CIVIL_NUMBER', 'I-CIVIL_NUMBER', 'B-DATE_OF_BIRTH', 'I-DATE_OF_BIRTH']
label_map = {label: i for i, label in enumerate(label_list)}

# Tokenize the dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['Sample Data'], truncation=True, padding='max_length', max_length=512)
    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_map[label[word_idx]])
            else:
                label_ids.append(label_map[label[word_idx]] if label[word_idx] != 'O' else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Map the tokenization function to the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split the dataset into train and test
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
dataset_dict = DatasetDict({
    'train': split_dataset['train'],
    'test': split_dataset['test']
})

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the model
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,0.011,0.008559


Epoch,Training Loss,Validation Loss
1,0.011,0.008559
2,0.0055,0.005839


In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the trained model and tokenizer
model_path = './trained_model'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Function to get entities from text
def get_entities(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
    outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [label_list[p] for p in predictions[0].numpy()]

    entities = {"CIVIL_NUMBER": [], "DATE_OF_BIRTH": []}
    current_entity = None
    current_tokens = []

    for token, label in zip(tokens, labels):
        if label.startswith("B-"):
            if current_entity:
                entities[current_entity].append(" ".join(current_tokens))
            current_entity = label.split("-")[1]
            current_tokens = [token]
        elif label.startswith("I-") and current_entity == label.split("-")[1]:
            current_tokens.append(token)
        else:
            if current_entity:
                entities[current_entity].append(" ".join(current_tokens))
                current_entity = None
            current_tokens = []

    # Add last entity if exists
    if current_entity:
        entities[current_entity].append(" ".join(current_tokens))

    return entities

# Example usage
text = """
8? fit ft eke \\
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE-GF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

entities = get_entities(text)
print("Extracted Entities:", entities)


Extracted Entities: {'CIVIL_NUMBER': ['##SA', '##E', '##Z', '##Z', 'nation', '##aut', '##y PA ##K ##IS ##TA ##NI - ID ##OM ##N ##7 ##33 ##0 ##38 ##48 < 0 < <'], 'DATE_OF_BIRTH': []}


In [19]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=33ae64b84a8a9366c6e762781bfc2fc22330a9961976fd477b79c7351a9a6244
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [31]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding='max_length',  # Add padding
        max_length=512,        # Define a maximum length
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]] if label[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs


In [32]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# Load CSV
df = pd.read_csv('./resident_card_data_samples_with_junk.csv')

# Ensure columns are treated as strings
df['Sample Data'] = df['Sample Data'].astype(str)

# Tokenize the 'Sample Data' column
df['tokens'] = df['Sample Data'].apply(lambda x: x.split())

# Create NER tags column with placeholder 'O' tags (Modify based on your tagging needs)
df['ner_tags'] = df['tokens'].apply(lambda x: ['O'] * len(x))

# Define label list and label mapping
label_list = ['O', 'B-CIVIL_NUMBER', 'I-CIVIL_NUMBER', 'B-DATE_OF_BIRTH', 'I-DATE_OF_BIRTH']
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Load pre-trained model and tokenizer
model_name = 'bert-base-uncased'  # Replace with your pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding='max_length',  # Add padding
        max_length=512,        # Define a maximum length
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]] if label[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=['Sample Data', 'Date of Birth', 'Civil Number'])

# Define compute metrics function
import numpy as np
import torch

# Define compute metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions)  # Convert to tensor
    predictions = torch.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [[label_list[p] for p, l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]

    results = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        'precision': results['micro avg']['precision'],
        'recall': results['micro avg']['recall'],
        'f1': results['micro avg']['f1-score'],
        'accuracy': results['accuracy'],
    }


# Split dataset into train and eval
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss


TypeError: argmax(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [35]:
import numpy as np
import torch
from seqeval.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions)  # Convert to tensor
    predictions = torch.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [[label_list[p] for p, l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]

    results = classification_report(true_labels, true_predictions, output_dict=True)

    # Default metrics to handle empty results
    precision = results.get('micro avg', {}).get('precision', 0.0)
    recall = results.get('micro avg', {}).get('recall', 0.0)
    f1 = results.get('micro avg', {}).get('f1-score', 0.0)
    accuracy = results.get('accuracy', 0.0)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
    }


In [50]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from seqeval.metrics import classification_report

# Load CSV
df = pd.read_csv('./resident_card_data_full.csv')

# Ensure columns are treated as strings
df['Sample Data'] = df['Sample Data'].astype(str)

# Tokenize the 'Sample Data' column
df['tokens'] = df['Sample Data'].apply(lambda x: x.split())

# Create NER tags column with placeholder 'O' tags (Modify based on your tagging needs)
df['ner_tags'] = df['tokens'].apply(lambda x: ['O'] * len(x))

# Define label list and label mapping
label_list = ['O', 'B-CIVIL_NUMBER', 'I-CIVIL_NUMBER', 'B-DATE_OF_BIRTH', 'I-DATE_OF_BIRTH']
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Load pre-trained model and tokenizer
model_name = 'bert-base-uncased'  # Replace with your pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding='max_length',  # Add padding
        max_length=512,        # Define a maximum length
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]] if label[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=['Sample Data', 'Date of Birth', 'Civil Number'])

# Define compute metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions)  # Convert to tensor
    predictions = torch.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [[label_list[p] for p, l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]

    results = classification_report(true_labels, true_predictions, output_dict=True)

    # Default metrics to handle empty results
    precision = results.get('micro avg', {}).get('precision', 0.0)
    recall = results.get('micro avg', {}).get('recall', 0.0)
    f1 = results.get('micro avg', {}).get('f1-score', 0.0)
    accuracy = results.get('accuracy', 0.0)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
    }

# Split dataset into train and eval
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0001,4e-05,0.0,0.0,0.0,0.0
2,0.0,2.5e-05,0.0,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


KeyboardInterrupt: 

In [38]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the fine-tuned model and tokenizer
model_path = './trained_model'  # Path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Initialize the pipeline
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text for inference
example_text = """
8? fit ft eke \\
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE-GF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

# Tokenize and get predictions
tokens = tokenizer(example_text, return_tensors="pt", truncation=True, padding=True)
outputs = model(**tokens)

# Get predictions
predictions = torch.argmax(outputs.logits, dim=2)

# Decode predictions
label_list = model.config.id2label
predicted_labels = [label_list[pred.item()] for pred in predictions[0]]

# Display results
print("Token\t\tPrediction")
for token, label in zip(tokens['input_ids'][0], predicted_labels):
    print(f"{tokenizer.decode([token])}\t\t{label}")

# Alternatively, using the pipeline directly
result = nlp(example_text)
for entity in result:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")


Token		Prediction
[CLS]		LABEL_0
8		LABEL_0
?		LABEL_0
fit		LABEL_0
ft		LABEL_0
e		LABEL_0
##ke		LABEL_0
\		LABEL_0
S		LABEL_0
##U		LABEL_0
##LT		LABEL_0
##AN		LABEL_0
##AT		LABEL_0
##E		LABEL_0
OF		LABEL_0
O		LABEL_0
##MA		LABEL_0
##N		LABEL_0
R		LABEL_0
##ES		LABEL_0
##ID		LABEL_0
##EN		LABEL_0
##T		LABEL_0
CA		LABEL_0
##RD		LABEL_0
88		LABEL_0
##C		LABEL_0
##00		LABEL_0
##1		LABEL_0
##C		LABEL_0
##31		LABEL_0
##0		LABEL_0
##3		LABEL_0
124		LABEL_0
##7		LABEL_0
##8		LABEL_0
C		LABEL_0
##I		LABEL_0
##VI		LABEL_0
##L		LABEL_0
N		LABEL_0
##UM		LABEL_0
##BE		LABEL_0
##R		LABEL_0
o		LABEL_0
##ye		LABEL_0
e		LABEL_0
##e		LABEL_0
Kyle		LABEL_0
:		LABEL_0
73		LABEL_0
##30		LABEL_0
##38		LABEL_0
##48		LABEL_0
o		LABEL_0
##lla		LABEL_0
o		LABEL_0
##e		LABEL_0
E		LABEL_0
##E		LABEL_0
E		LABEL_0
##X		LABEL_0
##PI		LABEL_0
##R		LABEL_0
##Y		LABEL_0
.		LABEL_0
D		LABEL_0
##AT		LABEL_0
##E		LABEL_0
06		LABEL_0
/		LABEL_0
05		LABEL_0
/		LABEL_0
2004		LABEL_0
Ha		LABEL_0
##w		LABEL_0
g		LABEL_0
##at	

In [39]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the fine-tuned model and tokenizer
model_path = './trained_model'  # Path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Initialize the pipeline
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text for inference
example_text = """
8? fit ft eke \\
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE-GF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

# Tokenize and get predictions
tokens = tokenizer(example_text, return_tensors="pt", truncation=True, padding=True)
outputs = model(**tokens)

# Get predictions
predictions = torch.argmax(outputs.logits, dim=2)

# Decode predictions
label_list = model.config.id2label
predicted_labels = [label_list[pred.item()] for pred in predictions[0]]

# Display results
print("Token\t\tPrediction")
for token, label in zip(tokens['input_ids'][0], predicted_labels):
    print(f"{tokenizer.decode([token])}\t\t{label}")

# Alternatively, using the pipeline directly
result = nlp(example_text)
for entity in result:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")


Token		Prediction
[CLS]		LABEL_0
8		LABEL_0
?		LABEL_0
fit		LABEL_0
ft		LABEL_0
e		LABEL_0
##ke		LABEL_0
\		LABEL_0
S		LABEL_0
##U		LABEL_0
##LT		LABEL_0
##AN		LABEL_0
##AT		LABEL_0
##E		LABEL_0
OF		LABEL_0
O		LABEL_0
##MA		LABEL_0
##N		LABEL_0
R		LABEL_0
##ES		LABEL_0
##ID		LABEL_0
##EN		LABEL_0
##T		LABEL_0
CA		LABEL_0
##RD		LABEL_0
88		LABEL_0
##C		LABEL_0
##00		LABEL_0
##1		LABEL_0
##C		LABEL_0
##31		LABEL_0
##0		LABEL_0
##3		LABEL_0
124		LABEL_0
##7		LABEL_0
##8		LABEL_0
C		LABEL_0
##I		LABEL_0
##VI		LABEL_0
##L		LABEL_0
N		LABEL_0
##UM		LABEL_0
##BE		LABEL_0
##R		LABEL_0
o		LABEL_0
##ye		LABEL_0
e		LABEL_0
##e		LABEL_0
Kyle		LABEL_0
:		LABEL_0
73		LABEL_0
##30		LABEL_0
##38		LABEL_0
##48		LABEL_0
o		LABEL_0
##lla		LABEL_0
o		LABEL_0
##e		LABEL_0
E		LABEL_0
##E		LABEL_0
E		LABEL_0
##X		LABEL_0
##PI		LABEL_0
##R		LABEL_0
##Y		LABEL_0
.		LABEL_0
D		LABEL_0
##AT		LABEL_0
##E		LABEL_0
06		LABEL_0
/		LABEL_0
05		LABEL_0
/		LABEL_0
2004		LABEL_0
Ha		LABEL_0
##w		LABEL_0
g		LABEL_0
##at	

In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the fine-tuned model and tokenizer
#model_path = './results'  # Update this with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Initialize the pipeline
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text for inference
example_text = """
8? fit ft eke \\
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE-GF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

# Tokenize the text
tokens = tokenizer(example_text, return_tensors="pt", truncation=True, padding=True)

# Get model predictions
with torch.no_grad():
    outputs = model(**tokens)
    predictions = torch.argmax(outputs.logits, dim=2)

# Convert predictions to labels
label_list = model.config.id2label
predicted_labels = [label_list[pred.item()] for pred in predictions[0]]

# Display the tokens and their predicted labels
print("Token\t\tPrediction")
for token, label in zip(tokens['input_ids'][0], predicted_labels):
    print(f"{tokenizer.decode([token]).strip()}\t\t{label}")

# Alternatively, use the pipeline for entity extraction
result = nlp(example_text)
print("\nExtracted Entities:")
for entity in result:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")


Token		Prediction
[CLS]		LABEL_0
8		LABEL_0
?		LABEL_0
fit		LABEL_0
ft		LABEL_0
e		LABEL_0
##ke		LABEL_0
\		LABEL_0
S		LABEL_0
##U		LABEL_0
##LT		LABEL_0
##AN		LABEL_0
##AT		LABEL_0
##E		LABEL_0
OF		LABEL_0
O		LABEL_0
##MA		LABEL_0
##N		LABEL_0
R		LABEL_0
##ES		LABEL_0
##ID		LABEL_0
##EN		LABEL_0
##T		LABEL_0
CA		LABEL_0
##RD		LABEL_0
88		LABEL_0
##C		LABEL_0
##00		LABEL_0
##1		LABEL_0
##C		LABEL_0
##31		LABEL_0
##0		LABEL_0
##3		LABEL_0
124		LABEL_0
##7		LABEL_0
##8		LABEL_0
C		LABEL_0
##I		LABEL_0
##VI		LABEL_0
##L		LABEL_0
N		LABEL_0
##UM		LABEL_0
##BE		LABEL_0
##R		LABEL_0
o		LABEL_0
##ye		LABEL_0
e		LABEL_0
##e		LABEL_0
Kyle		LABEL_0
:		LABEL_0
73		LABEL_0
##30		LABEL_0
##38		LABEL_0
##48		LABEL_0
o		LABEL_0
##lla		LABEL_0
o		LABEL_0
##e		LABEL_0
E		LABEL_0
##E		LABEL_0
E		LABEL_0
##X		LABEL_0
##PI		LABEL_0
##R		LABEL_0
##Y		LABEL_0
.		LABEL_0
D		LABEL_0
##AT		LABEL_0
##E		LABEL_0
06		LABEL_0
/		LABEL_0
05		LABEL_0
/		LABEL_0
2004		LABEL_0
Ha		LABEL_0
##w		LABEL_0
g		LABEL_0
##at	

In [41]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the fine-tuned model and tokenizer
#model_path = './results'  # Update this with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Initialize the pipeline
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text for inference
example_text = """
8? fit ft eke \\
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE-GF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

# Tokenize the text
tokens = tokenizer(example_text, return_tensors="pt", truncation=True, padding=True)

# Get model predictions
with torch.no_grad():
    outputs = model(**tokens)
    predictions = torch.argmax(outputs.logits, dim=2)

# Convert predictions to labels
label_list = model.config.id2label
predicted_labels = [label_list[pred.item()] for pred in predictions[0]]

# Display the tokens and their predicted labels
print("Token\t\tPrediction")
for token, label in zip(tokens['input_ids'][0], predicted_labels):
    print(f"{tokenizer.decode([token]).strip()}\t\t{label}")

# Alternatively, use the pipeline for entity extraction
result = nlp(example_text)
print("\nExtracted Entities:")
for entity in result:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")


Token		Prediction
[CLS]		LABEL_0
8		LABEL_0
?		LABEL_0
fit		LABEL_0
ft		LABEL_0
e		LABEL_0
##ke		LABEL_0
\		LABEL_0
S		LABEL_0
##U		LABEL_0
##LT		LABEL_0
##AN		LABEL_0
##AT		LABEL_0
##E		LABEL_0
OF		LABEL_0
O		LABEL_0
##MA		LABEL_0
##N		LABEL_0
R		LABEL_0
##ES		LABEL_0
##ID		LABEL_0
##EN		LABEL_0
##T		LABEL_0
CA		LABEL_0
##RD		LABEL_0
88		LABEL_0
##C		LABEL_0
##00		LABEL_0
##1		LABEL_0
##C		LABEL_0
##31		LABEL_0
##0		LABEL_0
##3		LABEL_0
124		LABEL_0
##7		LABEL_0
##8		LABEL_0
C		LABEL_0
##I		LABEL_0
##VI		LABEL_0
##L		LABEL_0
N		LABEL_0
##UM		LABEL_0
##BE		LABEL_0
##R		LABEL_0
o		LABEL_0
##ye		LABEL_0
e		LABEL_0
##e		LABEL_0
Kyle		LABEL_0
:		LABEL_0
73		LABEL_0
##30		LABEL_0
##38		LABEL_0
##48		LABEL_0
o		LABEL_0
##lla		LABEL_0
o		LABEL_0
##e		LABEL_0
E		LABEL_0
##E		LABEL_0
E		LABEL_0
##X		LABEL_0
##PI		LABEL_0
##R		LABEL_0
##Y		LABEL_0
.		LABEL_0
D		LABEL_0
##AT		LABEL_0
##E		LABEL_0
06		LABEL_0
/		LABEL_0
05		LABEL_0
/		LABEL_0
2004		LABEL_0
Ha		LABEL_0
##w		LABEL_0
g		LABEL_0
##at	

In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the fine-tuned model and tokenizer
model_path = './trained_model'  # Update this with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Initialize the pipeline
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text for inference
example_text = """
8? fit ft eke \\
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE-GF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

# Tokenize the text
inputs = tokenizer(example_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

# Convert predictions to labels
label_list = model.config.id2label
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_labels = [label_list[pred.item()] for pred in predictions[0]]

# Function to merge tokens
def merge_tokens(tokens, labels):
    words, merged_labels = [], []
    current_word, current_label = '', ''

    for token, label in zip(tokens, labels):
        if token.startswith('##'):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
                merged_labels.append(current_label)
            current_word = token
            current_label = label

    if current_word:  # Append the last token
        words.append(current_word)
        merged_labels.append(current_label)

    return words, merged_labels

# Merge tokens and labels
words, labels = merge_tokens(tokens, predicted_labels)

# Display the tokens and their predicted labels
print("Token\t\tPrediction")
for token, label in zip(tokens, predicted_labels):
    if token not in tokenizer.all_special_tokens:
        print(f"{token}\t\t{label}")

# Display the merged entities
print("\nMerged Entities:")
for word, label in zip(words, labels):
    print(f"Word: {word}, Label: {label}")

# Extract entities using the pipeline
result = nlp(example_text)
print("\nExtracted Entities:")
for entity in result:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")

# Extract specific entities
civil_number = []
date_of_birth = []

for word, label in zip(words, labels):
    if 'CIVIL_NUMBER' in label:
        civil_number.append(word)
    elif 'DATE_OF_BIRTH' in label:
        date_of_birth.append(word)

# Print extracted CIVIL_NUMBER and DATE_OF_BIRTH
print("\nExtracted CIVIL_NUMBER:")
print(" ".join(civil_number))

print("\nExtracted DATE_OF_BIRTH:")
print(" ".join(date_of_birth))


Token		Prediction
8		LABEL_0
?		LABEL_0
fit		LABEL_0
ft		LABEL_0
e		LABEL_0
##ke		LABEL_0
\		LABEL_0
S		LABEL_0
##U		LABEL_0
##LT		LABEL_0
##AN		LABEL_0
##AT		LABEL_0
##E		LABEL_0
OF		LABEL_0
O		LABEL_0
##MA		LABEL_0
##N		LABEL_0
R		LABEL_0
##ES		LABEL_0
##ID		LABEL_0
##EN		LABEL_0
##T		LABEL_0
CA		LABEL_0
##RD		LABEL_0
88		LABEL_0
##C		LABEL_0
##00		LABEL_0
##1		LABEL_0
##C		LABEL_0
##31		LABEL_0
##0		LABEL_0
##3		LABEL_0
124		LABEL_0
##7		LABEL_0
##8		LABEL_0
C		LABEL_0
##I		LABEL_0
##VI		LABEL_0
##L		LABEL_0
N		LABEL_0
##UM		LABEL_0
##BE		LABEL_0
##R		LABEL_0
o		LABEL_0
##ye		LABEL_0
e		LABEL_0
##e		LABEL_0
Kyle		LABEL_0
:		LABEL_0
73		LABEL_0
##30		LABEL_0
##38		LABEL_0
##48		LABEL_0
o		LABEL_0
##lla		LABEL_0
o		LABEL_0
##e		LABEL_0
E		LABEL_0
##E		LABEL_0
E		LABEL_0
##X		LABEL_0
##PI		LABEL_0
##R		LABEL_0
##Y		LABEL_0
.		LABEL_0
D		LABEL_0
##AT		LABEL_0
##E		LABEL_0
06		LABEL_0
/		LABEL_0
05		LABEL_0
/		LABEL_0
2004		LABEL_0
Ha		LABEL_0
##w		LABEL_0
g		LABEL_0
##at		LABEL_0
sign		

In [48]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the fine-tuned model and tokenizer
#model_path = './results'  # Update this with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Initialize the pipeline
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text for inference
example_text = """
8? fit ft eke \\
SULTANATE OF OMAN

RESIDENT
CARD

88C001C3103 12478

CIVIL NUMBER oye
ee Kyle : 73303848 olla
oe EE EXPIRY. DATE 06/05/2004 Haw gat
signavune yx DATE OF BIRTH 01/05/1982 shyt
' ~ . “Ae
hen SLU GIS
HE Sno shar kat OI yal Yl
Osh OU ope gi gl
steel yy a eagle Spite Oya at

VEHICLE ORIVING LICENCE

ROYAL OMAR POLICE CLASS
D.C. OF CLVIL STATUS

NOTE
ZLETSS Name IMRAN SAJID HUSSAIN SABIR

ja Pen Y ttt
SSAEZZ” nationauty PAKISTANI -

IDOMN73303848<0<<<<<<K<cKcccsee<
8205122M2405067PAK<<<<<<<<Kccc2
IMRAN<SAJ ID<HUSSAINK<<<<<<<ccc<
"""

# Tokenize the text
inputs = tokenizer(example_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

# Convert predictions to labels
label_list = model.config.id2label
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_labels = [label_list[pred.item()] for pred in predictions[0]]

# Function to merge tokens
def merge_tokens(tokens, labels):
    words, merged_labels = [], []
    current_word, current_label = '', ''

    for token, label in zip(tokens, labels):
        if token.startswith('##'):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
                merged_labels.append(current_label)
            current_word = token
            current_label = label

    if current_word:  # Append the last token
        words.append(current_word)
        merged_labels.append(current_label)

    return words, merged_labels

# Merge tokens and labels
words, labels = merge_tokens(tokens, predicted_labels)

# Display the tokens and their predicted labels
print("Token\t\tPrediction")
for token, label in zip(tokens, predicted_labels):
    if token not in tokenizer.all_special_tokens:
        print(f"{token}\t\t{label}")

# Display the merged entities
print("\nMerged Entities:")
for word, label in zip(words, labels):
    print(f"Word: {word}, Label: {label}")

# Extract entities using the pipeline
result = nlp(example_text)
print("\nExtracted Entities:")
for entity in result:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")

# Function to clean entity tokens
def clean_entity(tokens):
    return " ".join(tokens).replace(" ##", "")

# Extract specific entities
civil_number = []
date_of_birth = []

for word, label in zip(words, labels):
    if 'CIVIL_NUMBER' in label:
        civil_number.append(word)
    elif 'DATE_OF_BIRTH' in label:
        date_of_birth.append(word)

# Clean and print extracted CIVIL_NUMBER and DATE_OF_BIRTH
cleaned_civil_number = clean_entity(civil_number)
cleaned_date_of_birth = clean_entity(date_of_birth)

print("\nExtracted CIVIL_NUMBER:")
print(cleaned_civil_number)

print("\nExtracted DATE_OF_BIRTH:")
print(cleaned_date_of_birth)
#

Token		Prediction
8		LABEL_0
?		LABEL_0
fit		LABEL_0
ft		LABEL_0
e		LABEL_0
##ke		LABEL_0
\		LABEL_0
S		LABEL_0
##U		LABEL_0
##LT		LABEL_0
##AN		LABEL_0
##AT		LABEL_0
##E		LABEL_0
OF		LABEL_0
O		LABEL_0
##MA		LABEL_0
##N		LABEL_0
R		LABEL_0
##ES		LABEL_0
##ID		LABEL_0
##EN		LABEL_0
##T		LABEL_0
CA		LABEL_0
##RD		LABEL_0
88		LABEL_0
##C		LABEL_0
##00		LABEL_0
##1		LABEL_0
##C		LABEL_0
##31		LABEL_0
##0		LABEL_0
##3		LABEL_0
124		LABEL_0
##7		LABEL_0
##8		LABEL_0
C		LABEL_0
##I		LABEL_0
##VI		LABEL_0
##L		LABEL_0
N		LABEL_0
##UM		LABEL_0
##BE		LABEL_0
##R		LABEL_0
o		LABEL_0
##ye		LABEL_0
e		LABEL_0
##e		LABEL_0
Kyle		LABEL_0
:		LABEL_0
73		LABEL_0
##30		LABEL_0
##38		LABEL_0
##48		LABEL_0
o		LABEL_0
##lla		LABEL_0
o		LABEL_0
##e		LABEL_0
E		LABEL_0
##E		LABEL_0
E		LABEL_0
##X		LABEL_0
##PI		LABEL_0
##R		LABEL_1
##Y		LABEL_0
.		LABEL_0
D		LABEL_0
##AT		LABEL_0
##E		LABEL_0
06		LABEL_0
/		LABEL_0
05		LABEL_0
/		LABEL_0
2004		LABEL_0
Ha		LABEL_0
##w		LABEL_0
g		LABEL_0
##at		LABEL_0
sign		

In [43]:
df

Unnamed: 0,Sample Data,Date of Birth,Civil Number,tokens,ner_tags
0,XDpsFRDjkr \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,26/07/1985,83814105,"[XDpsFRDjkr, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,bKURhFJnWI \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,04/03/1951,79787460,"[bKURhFJnWI, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,hOJNkhdSCj \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,07/01/1962,94486471,"[hOJNkhdSCj, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,zGyBHwCFoi \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,04/05/1953,92850968,"[zGyBHwCFoi, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,jJJMpNUhWo \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,20/12/1973,92951996,"[jJJMpNUhWo, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...
19995,noXlvfOroG \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,07/07/1996,13930207,"[noXlvfOroG, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
19996,f zKgKlEWi \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,05/02/1963,432697,"[f, zKgKlEWi, \, SULTANATE, OF, OMAN, RESIDENT...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
19997,iYfvVNMdho \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,24/01/1989,1219671,"[iYfvVNMdho, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
19998,EqzxJjIEdg \\nSULTANATE OF OMAN\n\nRESIDENT\nC...,21/02/1964,45463890,"[EqzxJjIEdg, \, SULTANATE, OF, OMAN, RESIDENT,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [51]:
import pandas as pd

# Load data
df = pd.read_csv('resident_card_data_full.csv')

# Inspect data
print(df.head())


                                         Sample Data Date of Birth  \
0  8? fit ft eke \\nSULTANATE OF OMAN\n\nRESIDENT...    14/02/1968   
1  8? fit ft eke \\nSULTANATE OF OMAN\n\nRESIDENT...    06/08/2000   
2  8? fit ft eke \\nSULTANATE OF OMAN\n\nRESIDENT...    28/09/1992   
3  8? fit ft eke \\nSULTANATE OF OMAN\n\nRESIDENT...    12/09/1977   
4  8? fit ft eke \\nSULTANATE OF OMAN\n\nRESIDENT...    21/08/1991   

   Civil Number  
0      53134749  
1      34770676  
2      39148038  
3      39699161  
4      70722907  


In [52]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize sample_data
df['input_ids'] = df['Sample Data'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True))

# Create target sequences
df['target'] = df.apply(lambda x: f"DOB: {x['Date of Birth']} CIVIL: {x['Civil Number']}", axis=1)

# Tokenize target sequences
df['target_ids'] = df['target'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=50, truncation=True))

# Split the data
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)




In [53]:
from transformers import EncoderDecoderModel

# Load a pre-trained BERT model
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

# Set special tokens
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Set training parameters
model.config.vocab_size = model.config.encoder.vocab_size
model.config.max_length = 50
model.config.no_repeat_ngram_size = 2
model.config.early_stopping = True


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [103]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load and inspect data
df = pd.read_csv('corrected_complex_sample_data_20000_samples.csv')
print(df.head())

# Ensure there are no missing values
df.dropna(inplace=True)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize sample_data with padding and truncation
df['input_ids'] = df['sample_data'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True, padding='max_length'))

# Create target sequences
df['target'] = df.apply(lambda x: f"DOB: {x['CIVIL NUMBER']} CIVIL: {x['CIVIL NUMBER']}", axis=1)

# Tokenize target sequences with padding and truncation
df['target_ids'] = df['target'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=50, truncation=True, padding='max_length'))

# Convert to dataset format suitable for Seq2SeqTrainer
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Convert to torch dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.input_ids = data['input_ids'].tolist()
        self.target_ids = data['target_ids'].tolist()

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'labels': torch.tensor(self.target_ids[idx], dtype=torch.long)
        }

train_dataset = CustomDataset(train_data)
val_dataset = CustomDataset(val_data)

# Load a pre-trained BERT model
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

# Set special tokens
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Set training parameters
model.config.vocab_size = model.config.encoder.vocab_size
model.config.max_length = 50
model.config.no_repeat_ngram_size = 2
model.config.early_stopping = True

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)

# Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(f"Validation Loss: {results['eval_loss']}")

# Function to predict
#def predict(sample_data):
#    inputs = tokenizer.encode(sample_data, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
#    outputs = model.generate(inputs, max_length=50, num_beams=4, early_stopping=True)
##    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
 #   return prediction

# Example prediction


   CIVIL NUMBER DATE OF BIRTH  \
0    8478170552    29/10/1995   
1    8642905433    28/07/2001   
2    3687239949    24/05/1986   
3    4556950879    09/05/1993   
4    8814111616    22/09/1987   

                                         sample_data  
0  3pfe1L3JVY \\nSULTANATE OF OMAN\n\nRESIDENT\nC...  
1  3pfe1L3JVY \\nSULTANATE OF OMAN\n\nRESIDENT\nC...  
2  3pfe1L3JVY \\nSULTANATE OF OMAN\n\nRESIDENT\nC...  
3  3pfe1L3JVY \\nSULTANATE OF OMAN\n\nRESIDENT\nC...  
4  3pfe1L3JVY \\nSULTANATE OF OMAN\n\nRESIDENT\nC...  


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

Epoch,Training Loss,Validation Loss
1,0.3834,0.706379
2,0.0109,0.400964
3,0.0022,0.496927


Non-default generation parameters: {'max_length': 50, 'early_stopping': True, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 50, 'early_stopping': True, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 50, 'early_stopping': True, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 50, 'early_stopping': True, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 50, 'early_stopping': True, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder

Validation Loss: 0.4969269633293152


In [79]:
from transformers import EncoderDecoderModel, BertTokenizer

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_pretrained('./results/checkpoint-12000')  # Adjust path as needed

# Ensure model is in evaluation mode
model.eval()




EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [1]:
def generate_prediction(inputs):
    # Generate prediction
    outputs = model.generate(inputs, max_length=50, num_beams=4, early_stopping=True)
    return outputs


In [2]:
def generate_prediction(inputs):
    # Generate prediction
    outputs = model.generate(inputs, max_length=50, num_beams=4, early_stopping=True)
    return outputs


In [3]:
def decode_output(outputs):
    # Decode the output tokens to text
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction


In [4]:
def predict(sample_data):
    inputs = prepare_input(sample_data)
    outputs = generate_prediction(inputs)
    prediction = decode_output(outputs)
    return prediction


In [5]:
df['sample_data'].iloc[5]


NameError: name 'df' is not defined

In [92]:
# Example input text
sample_data = df['sample_data'].iloc[188]

# Make prediction
prediction = predict(sample_data)

print(f"Sample Data: {sample_data}")
print(f"Prediction: {prediction}")


Sample Data: 7uRFLTnhA  \
SULTANATE OF OMAN

RESIDENT
CARD

2DSSWWHSEAHU zWsBV



CIVIL NUMBER L83
ee Kyle :  4278059 Myjf
oe EE EXPIRY. DATE 25/08/2030 J2AWYm
signavune yx DATE OF  BIRTH 06/02/1963 A5zN
' ~ . ML
hen kml
HE QXA1g
Osh lwly6
steel yy a eagle 1h4QX





VEHICLE YsDBBIm

ROYAL DZFEu CLASS
D.C. OF iaxm STATUS


NOTE
ZLETSS Name Omar Yasir Hussain

ja Pen Y ttt
SSAEZZ” nationauty Sri Lankan -


IDOMN4278059<<<<<K<cKcccsee<
0219632M0320257SRI<<<<<<<<Kccc2
Omar<Yasir<Hussain<<<<<<<<<ccc<
Prediction: dob : 26 / 02 / 1963 civil : 4278059


In [95]:
print(df['sample_data'].iloc[1800])

K0I6r8YF6M \
SULTANATE OF OMAN

RESIDENT
CARD

BYKR8PUZVI1R II9k5



CIVIL NUMBER jxD
ee Kyle :  192394 Y4oo
oe EE EXPIRY. DATE 20/05/2025 6fPe2j
signavune yx DATE OF  BIRTH 09/09/1978 IK9v
' ~ . Ac
hen i7k
HE ScKlI
Osh zqMTW
steel yy a eagle LfhEs





VEHICLE LdgNbjQ

ROYAL 9waKF CLASS
D.C. OF TLMi STATUS


NOTE
ZLETSS Name Khalid Rashid Malik

ja Pen Y ttt
SSAEZZ” nationauty Nepali -


IDOMN192394<<<<<K<cKcccsee<
0919782M0620307NEP<<<<<<<<Kccc2
Khalid<Rashid<Malik<<<<<<<<<ccc<


In [63]:
def prepare_input(text):
    # Tokenize the input text
    inputs = tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    return inputs

In [98]:
text="""

pariver \
SULTANATE OF OMAN

RESIDENT
CARD

ksdjflsnknkdf


CIVIL NUMBER jxD
dfknd  12837434 Y4oo
oe EE EXPIRY. DATE 20/05/2025 6fPe2j
signavune yx DATE OF  BIRTH 10/10/1956 IK9v
kdfnn 42
3kdnkdf
dfdfjdf
dslksdf



"""

In [100]:
#sample_data = df['sample_data'].iloc[188]

# Make prediction
prediction = predict(text)

print(f"Sample Data: {text}")
print(f"Prediction: {prediction}")


Sample Data: 

pariver SULTANATE OF OMAN

RESIDENT
CARD

ksdjflsnknkdf


CIVIL NUMBER jxD
dfknd  12837434 Y4oo
oe EE EXPIRY. DATE 20/05/2025 6fPe2j
signavune yx DATE OF  BIRTH 10/10/1956 IK9v
kdfnn 42
3kdnkdf
dfdfjdf
dslksdf




Prediction: dob : 30 / 10 / 1960 civil :37434
