In [1]:
tags = ["O", "CIT-NUM", "SEX", "B-NAME", "I-NAME", "YEAR", "MONTH", "DAY", "B-DISTRICT", "B-WARD", "B-NO", "I-DISTRICT", "I-WARD", "I-NO"]
id2label = {i: label for i, label in enumerate(tags)}
label2id = {v: k for k, v in id2label.items()}

In [2]:
import torch
from collections import Counter
from transformers import AutoModelForTokenClassification, AutoTokenizer


model = AutoModelForTokenClassification.from_pretrained("../model/V3/saved_model")
tokenizer = AutoTokenizer.from_pretrained("../model/V3/tokenizer_saved_model")



  from .autonotebook import tqdm as notebook_tqdm


In [14]:


text = "details citicensaip Certificate No 43017700315 Sex Male full Name NABIN RANABHAT Date of Birth AD Ye ar2002 MonthJUL Day24 © oirth Place District Tanahun R M Rishing Ward No6 © ermanent Address District Tanahun” R M Rishing Ward No6 Taawiea> awa dane wake! !!!”S witha aaa Fest."


inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

predicted_label_ids = outputs.logits.argmax(-1).squeeze()
predicted_labels = [id2label[id.item()] for id in predicted_label_ids]

tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids.squeeze())
word_ids = inputs.word_ids()


words = []
word_labels = []
current_word = ""
current_labels = []

for token, word_id, label in zip(tokens, word_ids, predicted_labels):
    if word_id is None:
        continue

    if word_id != len(words):

        if current_word:
            label_counts = Counter(current_labels)
            most_common_label = max(label_counts, key=label_counts.get)

            if "O" in label_counts and len(label_counts) > 1:
                del label_counts["O"]
                most_common_label = max(label_counts, key=label_counts.get)

            words.append((current_word, most_common_label))

        current_word = token.replace("##", "")
        current_labels = [label]
    else:

        current_word += token.replace("##", "")
        current_labels.append(label)


if current_word:
    label_counts = Counter(current_labels)
    most_common_label = max(label_counts, key=label_counts.get)

    if "O" in label_counts and len(label_counts) > 1:
        del label_counts["O"]
        most_common_label = max(label_counts, key=label_counts.get)

    words.append((current_word, most_common_label))

print(words)


[('details', 'O'), ('citicensaip', 'O'), ('certificate', 'O'), ('no', 'O'), ('43017700315', 'CIT-NUM'), ('sex', 'O'), ('male', 'SEX'), ('full', 'O'), ('name', 'O'), ('nabin', 'B-NAME'), ('ranabhat', 'I-NAME'), ('date', 'O'), ('of', 'O'), ('birth', 'O'), ('ad', 'O'), ('ye', 'O'), ('ar2002', 'YEAR'), ('monthjul', 'O'), ('day24', 'DAY'), ('©', 'O'), ('oirth', 'O'), ('place', 'O'), ('district', 'O'), ('tanahun', 'B-DISTRICT'), ('r', 'O'), ('m', 'O'), ('rishing', 'B-WARD'), ('ward', 'O'), ('no6', 'B-NO'), ('©', 'O'), ('ermanent', 'O'), ('address', 'O'), ('district', 'O'), ('tanahun', 'B-DISTRICT'), ('”', 'O'), ('r', 'O'), ('m', 'O'), ('rishing', 'B-WARD'), ('ward', 'O'), ('no6', 'B-NO'), ('taawiea', 'B-WARD'), ('>', 'O'), ('awa', 'O'), ('dane', 'O'), ('wake', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('”', 'O'), ('s', 'O'), ('witha', 'O'), ('aaa', 'O'), ('fest', 'O'), ('.', 'O')]


In [15]:
#this correxts the error if th I-district is before B-district then change it. so for ward
# Define our target labels for processing.
target_labels = {
    "DISTRICT": {"B-DISTRICT", "I-DISTRICT"},
    "WARD": {"B-WARD", "I-WARD"},
    "NO": {"B-NO", "I-NO"}
    
}

# Helper: Return the group if the label exactly matches one of our target labels.
def get_group(label):
    for group, labels in target_labels.items():
        if label in labels:
            return group
    return None

# Flag to indicate if we've reached the permanent section.
in_permanent = False
modified_words = []

for word, label in words:
    # Check if this token marks the start of permanent address.
    if word.lower() == "permanent":
        in_permanent = True
        modified_words.append((word, label))
        continue

    group = get_group(label)
    if group is not None:
        # If we're in the permanent section, force I-; otherwise, force B-
        if in_permanent:
            new_label = "I-" + group
        else:
            new_label = "B-" + group
        modified_words.append((word, new_label))
    else:
        modified_words.append((word, label))

# Replace your original words list with the modified one.
words = modified_words

print(words)


[('details', 'O'), ('citicensaip', 'O'), ('certificate', 'O'), ('no', 'O'), ('43017700315', 'CIT-NUM'), ('sex', 'O'), ('male', 'SEX'), ('full', 'O'), ('name', 'O'), ('nabin', 'B-NAME'), ('ranabhat', 'I-NAME'), ('date', 'O'), ('of', 'O'), ('birth', 'O'), ('ad', 'O'), ('ye', 'O'), ('ar2002', 'YEAR'), ('monthjul', 'O'), ('day24', 'DAY'), ('©', 'O'), ('oirth', 'O'), ('place', 'O'), ('district', 'O'), ('tanahun', 'B-DISTRICT'), ('r', 'O'), ('m', 'O'), ('rishing', 'B-WARD'), ('ward', 'O'), ('no6', 'B-NO'), ('©', 'O'), ('ermanent', 'O'), ('address', 'O'), ('district', 'O'), ('tanahun', 'B-DISTRICT'), ('”', 'O'), ('r', 'O'), ('m', 'O'), ('rishing', 'B-WARD'), ('ward', 'O'), ('no6', 'B-NO'), ('taawiea', 'B-WARD'), ('>', 'O'), ('awa', 'O'), ('dane', 'O'), ('wake', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('”', 'O'), ('s', 'O'), ('witha', 'O'), ('aaa', 'O'), ('fest', 'O'), ('.', 'O')]


In [16]:
new_words = []
tagss=[]
for idx , (word, label) in enumerate(words):
    tagss.append(label)
    if label not in ['B-NO', 'I-NO','O']:
        if tagss.count(label) > 1:
            continue
    new_words.append((word, label))

print(new_words)

    

[('details', 'O'), ('citicensaip', 'O'), ('certificate', 'O'), ('no', 'O'), ('43017700315', 'CIT-NUM'), ('sex', 'O'), ('male', 'SEX'), ('full', 'O'), ('name', 'O'), ('nabin', 'B-NAME'), ('ranabhat', 'I-NAME'), ('date', 'O'), ('of', 'O'), ('birth', 'O'), ('ad', 'O'), ('ye', 'O'), ('ar2002', 'YEAR'), ('monthjul', 'O'), ('day24', 'DAY'), ('©', 'O'), ('oirth', 'O'), ('place', 'O'), ('district', 'O'), ('tanahun', 'B-DISTRICT'), ('r', 'O'), ('m', 'O'), ('rishing', 'B-WARD'), ('ward', 'O'), ('no6', 'B-NO'), ('©', 'O'), ('ermanent', 'O'), ('address', 'O'), ('district', 'O'), ('”', 'O'), ('r', 'O'), ('m', 'O'), ('ward', 'O'), ('no6', 'B-NO'), ('>', 'O'), ('awa', 'O'), ('dane', 'O'), ('wake', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('!', 'O'), ('”', 'O'), ('s', 'O'), ('witha', 'O'), ('aaa', 'O'), ('fest', 'O'), ('.', 'O')]


In [None]:
#remove if the second tag is same tag is repeating two or more times.. but this is  also removing no:B-NO , 6:B-NO
# First, record the first index where each target group appears.
# first_occurrence = {}
# for idx, (word, label) in enumerate(words):
#     group = get_group(label)  # get_group returns "DISTRICT", "WARD", or "NO" if label is in target_labels.
#     if group is not None and group not in first_occurrence:
#         first_occurrence[group] = idx

# # Now, build a new list that only keeps the original label for the first occurrence of each group.
# final_words = []
# for idx, (word, label) in enumerate(words):
#     group = get_group(label)
#     if group is not None:
#         # Only the very first token with that target group keeps its label.
#         if first_occurrence[group] == idx:
#             final_words.append((word, label))
#         else:
#             final_words.append((word, "O"))
#     else:
#         final_words.append((word, label))

# print(final_words)



[('details', 'O'), ('citizenship', 'O'), ('certificate', 'O'), ('no', 'O'), ('43017700315', 'CIT-NUM'), ('sex', 'O'), ('male', 'SEX'), ('full', 'O'), ('name', 'O'), ('nabin', 'B-NAME'), ('ranabhat', 'I-NAME'), ('date', 'O'), ('of', 'O'), ('birth', 'O'), ('ad', 'O'), ('year', 'O'), ('2002', 'YEAR'), ('month', 'O'), ('jul', 'MONTH'), ('day', 'O'), ('24', 'DAY'), ('birth', 'O'), ('place', 'O'), ('district', 'O'), ('tanahun', 'B-DISTRICT'), ('municipality', 'O'), ('r', 'O'), ('m', 'O'), ('rishing', 'B-WARD'), ('ward', 'O'), ('no', 'B-NO'), ('6', 'O'), ('permanent', 'O'), ('address', 'O'), ('district', 'O'), ('tanahun', 'O'), ('r', 'O'), ('m', 'O'), ('rishing', 'O'), ('ward', 'O'), ('no', 'O'), ('6', 'O'), ('taawiea', 'O'), ('away', 'O'), ('dane', 'O'), ('wake', 'O'), ('s', 'O'), ('with', 'O'), ('aka', 'O'), ('fest', 'O'), ('.', 'O')]


In [17]:
import re

result = {}
full_name = []
citizenship_no = ""
gender = ""
year = ""
month = ""
day = ""
B_dist = ""
B_ward = ""
B_no = ""
P_dist = ""
P_ward = ""
P_no = ""

def extract_num(text):
    match = re.search(r'\d+', text)
    return match.group() if match else None
for word, label in new_words:
    if label == "CIT-NUM":
        citizenship_no += word
    elif label == "B-NAME":
        full_name.append(word)
    elif label == "I-NAME":
        full_name.append(word)
    elif label == "SEX":
      gender +=word
    elif label == "YEAR":
      year += word
    elif label == "MONTH":
      month += word
    elif label == "DAY":
      day += word
    elif label == "B-DISTRICT":
      B_dist += word
    elif label == "B-WARD":
      B_ward += word
    elif label == "B-NO":
      B_no += word
    elif label == "I-DISTRICT":
      P_dist += word
    elif label == "I-WARD":
      P_ward += word
    elif label == "I-NO":
      P_no += word

if citizenship_no:
    citizenship_number = citizenship_no[:2] + "-" + citizenship_no[2:4] + "-" + citizenship_no[4:6] + "-" + citizenship_no[6:]
    result["citizenship_num"] = citizenship_number
else :
  result["citizenship_num"] = None

if len(full_name) > 0:
    result["first_name"] = full_name[0]
else:
   result["first_name"] = None

if len(full_name) > 1:
    result["last_name"] = full_name[-1]
else:
  result["last_name"] = None
if len(gender)>1:
    result["gender"] = gender
else:
  result["gender"] = None
if year:
    result["Birth_year"] = year
    result["Birth_year"] = extract_num(result["Birth_year"])
else:
  result["Birth_year"] = None

if len(month) >1:
    result["Birth_month"] = month
else:
  result["Birth_month"] = None
if day:
    result["Birth_day"] = day
else:
  result["Birth_day"] = None
if B_dist:
    result["Birth_district"] = B_dist
else:
  result["Birth_district"] = None
if B_ward:
    result["Birth_ward"] = B_ward
else:
  result["Birth_ward"] = None
if B_no:
    result["Birth_wardno"] = B_no
    result["Birth_wardno"] = extract_num(result["Birth_wardno"])
else:
  result["Birth_wardno"] = None
if P_dist:
    result["Permanent_dist"] = P_dist
else:
  result["Permanent_dist"] = None
if P_ward:
    result["Permanent_ward"] = P_ward
else:
  result["Permanent_ward"] = None
if P_no:
    result["Permanent_wardno"] = P_no
    result["Permanent_wardno"] = extract_num(result["Permanent_wardno"])
else:
  result["Permanent_wardno"] = None


print(result)

{'citizenship_num': '43-01-77-00315', 'first_name': 'nabin', 'last_name': 'ranabhat', 'gender': 'male', 'Birth_year': '2002', 'Birth_month': None, 'Birth_day': 'day24', 'Birth_district': 'tanahun', 'Birth_ward': 'rishing', 'Birth_wardno': '6', 'Permanent_dist': None, 'Permanent_ward': None, 'Permanent_wardno': None}
