In [None]:
!pip install -q transformers datasets rouge_score evaluate accelerate scikit-learn pycountry

In [None]:
# Use a checkpoint that was fine-tuned on the train + validation splits.
fine_tune_model_dir = "/mnt/nas/HYZ/AICUP/save_finetune_model/"

# Alternative checkpoints kept for reference. Each note records the run whose
# hyperparameters were reused and the Exact Match score it reached.
# NOTE(review): the trailing "a/b/c/d" shorthand is not explained anywhere in this
# notebook - presumably batch size / learning rate / weight decay / epochs; confirm.

# # based on 1701326984 hyperparameters, Exact Match 94.286300, 16/2e-5/0.01/15
# fine_tune_model_name = "1701406936-t5-efficient-base-dl2-finetuned-extracted-PHI"

# # based on 1700994394 hyperparameters, Exact Match 94.431500, 4/2e-5/0.01/10
# fine_tune_model_name = "1701407311-t5-efficient-base-dl2-finetuned-extracted-PHI"

# # based on 1700994394 hyperparameters, Exact Match 94.567100, 4/2e-5/0.05/10
fine_tune_model_name = "1701489459-t5-efficient-base-dl2-finetuned-extracted-PHI"
# Settings read by the tokenization, prediction and file-naming cells below.
config = {
    "model_checkpoint": f"{fine_tune_model_dir}{fine_tune_model_name}",
    "max_input_length": 512,
    "max_target_length": 100,
    "batch_size": 16,
    "generation_max_length": 100
}

In [None]:
import os
import torch
import torch.nn as nn

# Restrict this process to the GPU with index 5.
# NOTE(review): the variable is set after `import torch`; the recorded output of 1
# below suggests it still took effect, but setting it before the import is safer.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Sanity check: number of CUDA devices visible to torch.
print(torch.cuda.device_count())

1


In [None]:
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig
)
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk, concatenate_datasets
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import numpy as np
import pycountry
import re
import json
import zipfile

[2023-12-03 04:07:02,524] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# Load the DatasetDict that was previously saved to disk.
file_path = '/mnt/nas/HYZ/AICUP/'
dataset = load_from_disk(f"{file_path}dataset_dict_v2")

# Display the splits and their columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 92933
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 10326
    })
    test: Dataset({
        features: ['prompt'],
        num_rows: 47084
    })
})

In [None]:
# Load the tokenizer and the seq2seq model from the fine-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(config["model_checkpoint"])
model = AutoModelForSeq2SeqLM.from_pretrained(config["model_checkpoint"])

In [None]:
def preprocess_function(examples):
    """
    Tokenize one batch of examples for the seq2seq model.

    Relies on the notebook-level `tokenizer` and `config`.

    Parameters:
    examples (dict): Batch with a 'prompt' entry and, optionally, a 'completion' entry.

    Returns:
    The tokenizer output for the prompts (padded/truncated to config["max_input_length"]).
    When 'completion' is present, a 'labels' entry is added holding the completion token
    ids (padded/truncated to config["max_target_length"]) with every padding id replaced
    by -100 so that padded positions are ignored by the loss.
    """
    encoded = tokenizer(
        examples["prompt"],
        padding="max_length",
        max_length=config["max_input_length"],
        truncation=True,
    )

    # Splits without a 'completion' entry get inputs only, no labels.
    if "completion" not in examples:
        return encoded

    target_ids = tokenizer(
        examples["completion"],
        padding="max_length",
        max_length=config["max_target_length"],
        truncation=True,
    )["input_ids"]

    # Mask the padding positions of every target sequence.
    pad_id = tokenizer.pad_token_id
    encoded["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in target_ids
    ]
    return encoded

# Tokenize every split; batched=True hands preprocess_function a batch of examples at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# Show which columns the tokenized train split ended up with.
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/92933 [00:00<?, ? examples/s]

Map:   0%|          | 0/10326 [00:00<?, ? examples/s]

Map:   0%|          | 0/47084 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['prompt', 'input_ids', 'attention_mask']


In [None]:
# Derive a short run identifier from the checkpoint directory name: the directory name
# is split on '-' and its first token is used to name the output files written below.
model_list = config["model_checkpoint"].split("/")[-1].split("-")
model_name = model_list[0]
print(f"Model List: {model_list}")
print(f"Model Name: {model_name}")

Model List: ['1701489459', 't5', 'efficient', 'base', 'dl2', 'finetuned', 'extracted', 'PHI']
Model Name: 1701489459


In [None]:
# Build a trainer that is used for generation-based prediction only (no training call here).
trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(
        # output_dir="finetune-model-predicted",
        output_dir="Result",
        # NOTE(review): this is batch_size squared (16 * 16 = 256 per device) - confirm
        # that the square is intentional and not a typo for a plain multiple.
        per_device_eval_batch_size=config["batch_size"]*config["batch_size"],
        # Predictions are produced with generate(), capped at generation_max_length tokens.
        predict_with_generate=True,
        generation_max_length=config["generation_max_length"],
    ),
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model, pad_to_multiple_of=8),
    tokenizer=tokenizer,
)

# Generate predictions for the test split; the result is decoded in the next cells.
test_predictions = trainer.predict(tokenized_dataset["test"])

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
test_predictions[0][0]

array([    0,  6632,    10,    27, 12145,  6122,     6,  7185,    10,
         335,   434,  2445,  3959,  2884,   329,  1820,  6632,    10,
          27, 12145,  6122,     6,  7185,    10,   335,   434,  2445,
        3959,   357,     3, 14920,     1,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0])

In [None]:
# Decode every generated id sequence to text. Ids equal to -100 or outside
# [0, tokenizer.vocab_size) are dropped before decoding, and special tokens are skipped.
decoded_predictions = [
    tokenizer.decode(
        [token_id for token_id in pred if token_id != -100 and 0 <= token_id < tokenizer.vocab_size],
        skip_special_tokens=True,
    )
    for pred in test_predictions.predictions
]

print(decoded_predictions[0])

Type: IDNUM, Content: 10L407622M | Type: IDNUM, Content: 10L40762 END


## Save Result

In [None]:
# Pair every test prompt with its decoded model output (same order as the test split).
test_df = pd.DataFrame({
    "prompt": tokenized_dataset["test"]["prompt"],
    "completion": decoded_predictions
})

# test_df

In [None]:
# Save the raw model predictions as JSON Lines (one record per line).
output_file = f"./Result/{model_name}-{config['generation_max_length']}-original_predicted.jsonl"
# Create the target directory if needed so a fresh run does not depend on it already existing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
test_df.to_json(output_file, orient="records", lines=True)

# Rule-based post-processing

In [None]:
# Sub-function: Extract Phone Number
def extracted_phone(row):
    """
    Overwrite the row's 'completion' with a PHONE entity when the prompt contains a
    number written as two groups of four digits separated by one space ('dddd dddd').

    Only the first such number in row['prompt'] is used. The row is modified in place
    and returned; rows without a match are returned untouched.
    """
    first_hit = re.search(r'\b\d{4} \d{4}\b', row['prompt'])
    if first_hit is not None:
        row['completion'] = f"Type: PHONE, Content: {first_hit.group(0)} END"
    return row

# Sub-function: Extract Country Name
def extracted_country(row, countries, exceptions):
    """
    Evaluate the country-name rules against row['prompt'] and return the row.

    The rules checked, in order:
      1. 'Dr', 'DR' or 'PRO' followed by a word that is in `countries`;
      2. 'with <country>' followed by a parenthesised word;
      3. any phrase from `exceptions` occurring in the prompt;
      4. 'with <country>' not followed by an opening parenthesis.

    NOTE(review): every path through this function returns `row` unchanged - no
    'completion' is ever written here, so the step is currently a pass-through.
    Confirm whether a country entity was meant to be emitted.
    """
    text = row['prompt']

    # Rule 1: a title/abbreviation directly followed by a country-named word.
    titled = re.search(r'\b(Dr|DR|PRO)\s+(\w+)', text)
    if re.search(r'\b(Dr|DR|PRO)\b', text) and titled and titled.group(2) in countries:
        return row

    # Rule 2: "with <country> (<word>)".
    after_with = re.search(r'\bwith\s+(\w+)', text)
    if after_with is not None and after_with.group(1) in countries:
        if re.search(r'\bwith\s+\w+\s+\(\w+\)', text):
            return row

    # Rule 3: known phrases that merely contain a country name.
    for phrase in exceptions:
        if phrase in text:
            return row

    # Rule 4: "with <country>" without a following parenthesis.
    bare_with = re.search(r'\bwith\s+(\w+)(?!\s+\()', text)
    if bare_with is not None and bare_with.group(1) in countries:
        return row
    return row

# Sub-function: Extract Other Locations
def extracted_location_other(row):
    """
    Overwrite the row's 'completion' with LOCATION-OTHER entities found in the prompt.

    Two case-insensitive patterns are searched in row['prompt']: P.O. BOX numbers
    (e.g. 'P.O. BOX 123', 'PO Box 5') and the word 'JANBORWILL'. All P.O. BOX hits are
    listed first, followed by all JANBORWILL hits.

    Each hit is emitted as its own entity, separated by ' | ', with a single trailing
    ' END' - the same multi-answer layout that split_completions later splits on.
    Joining several hits into one comma-separated Content would produce a string that
    does not occur in the prompt, so its character offsets could not be located.

    The row is modified in place and returned; rows without a hit are left untouched.
    """
    po_box_pattern = r'\bP\.?\s*O\.?\s*BOX\s+\d+\b'
    janborwill_pattern = r'\bJANBORWILL\b'
    po_box_matches = re.findall(po_box_pattern, row['prompt'], re.IGNORECASE)
    janborwill_matches = re.findall(janborwill_pattern, row['prompt'], re.IGNORECASE)

    entities = [
        f"Type: LOCATION-OTHER, Content: {content}"
        for content in po_box_matches + janborwill_matches
    ]
    if entities:
        row['completion'] = " | ".join(entities) + " END"

    return row

# Sub-function: Extract and Normalize Duration
def extract_and_normalized_duration(row):
    """
    Overwrite the row's 'completion' with DURATION entities found in the prompt.

    A duration is either '<number|range|number word> <unit>' (units: month(s), year(s),
    yr(s), week(s), wk(s), day(s)) or a number glued to 'yr'/'year' (e.g. '5yr'),
    provided it is not followed by 'old', '-' or 'cycle(s)'. Matching is case-insensitive.

    For every match the Content is the text exactly as it appears in the prompt, and the
    normalized value is 'P' + number + first letter of the unit in upper case
    (e.g. 'two weeks' -> 'P2W', '5yr' -> 'P5Y'). Number words are mapped to digits.

    Several matches are emitted as separate entities joined by ' | ' with one trailing
    ' END', which is the layout split_completions expects. Keeping the Content verbatim
    matters because the submission step locates it in the prompt with str.find.

    The row is modified in place and returned; rows without a match are left untouched.
    """
    # NOTE(review): 'forty' is mapped here but is not one of the words the pattern captures.
    word_to_number = {'one': '1', 'two': '2', 'three': '3', 'four': '4',
                      'twenty': '20', 'forty': '40'}

    duration_pattern = r'\b((\d+(-\d+)?|one|two|three|twenty|four)\s+(months?|years?|yrs?|weeks?|wks?|days?)|(\d+)(?<!\s)(yr|year))(?!\s*(old|\s*-|\s*cycle|\s*cycles))\b'

    entities = []
    for match in re.findall(duration_pattern, row['prompt'], re.IGNORECASE):
        # match[0] is the whole duration as written; match[1]/match[3] are number and unit
        # of the spaced form, match[4] is the number of the glued 'NNyr' form.
        matched_text = match[0]
        text_num = match[1] if match[1] else match[4]
        unit_text = match[3] if match[3] else 'year'

        # Convert textual number descriptions to numeric format
        num = word_to_number.get(text_num.lower(), text_num)
        normalized = "P" + num + unit_text[0].upper()
        entities.append(f"Type: DURATION, Content: {matched_text} (Normalized: {normalized})")

    if entities:
        row['completion'] = " | ".join(entities) + " END"

    return row


# Integrated Function: Extract Information
def extract_info(df):
    """
    Run the rule-based extraction steps over every row of `df` and return the result.

    The steps are applied row-wise, one full pass per step, in this order: phone numbers,
    country names, other locations, durations. Each pass shows a tqdm progress bar.
    The country step receives the pycountry country names (longest first) plus
    "Vietnam" and "USA", and a list of phrases to skip.
    """
    country_names = sorted(
        (entry.name for entry in pycountry.countries), key=len, reverse=True
    )
    country_names += ["Vietnam", "USA"]  # pycountry spells the former "Viet Nam"
    skip_phrases = ["Western Australia", "South Australia", "Congo red", "Congo Red"]

    # Registers DataFrame.progress_apply.
    tqdm.pandas()

    def apply_country_rule(row):
        return extracted_country(row, country_names, skip_phrases)

    rule_steps = (
        extracted_phone,
        apply_country_rule,
        extracted_location_other,
        extract_and_normalized_duration,
    )
    for step in rule_steps:
        df = df.progress_apply(step, axis=1)
    return df

# Apply the rule-based extraction to the predictions (test_df is replaced by the result).
test_df = extract_info(test_df)
test_df

100%|██████████| 47084/47084 [00:02<00:00, 21619.40it/s]
100%|██████████| 47084/47084 [00:03<00:00, 12598.18it/s]
100%|██████████| 47084/47084 [00:02<00:00, 20216.91it/s]
100%|██████████| 47084/47084 [00:02<00:00, 18242.52it/s]


Unnamed: 0,prompt,completion
0,Please extract HIPAA related information from ...,"Type: IDNUM, Content: 10L407622M | Type: IDNUM..."
1,Please extract HIPAA related information from ...,"Type: MEDICALRECORD, Content: 10195149 END"
2,Please extract HIPAA related information from ...,"Type: HOSPITAL, Content: COLAC AREA HEALTH END"
3,Please extract HIPAA related information from ...,PHI: NULL
4,Please extract HIPAA related information from ...,PHI: NULL
...,...,...
47079,Please extract HIPAA related information from ...,PHI: NULL
47080,Please extract HIPAA related information from ...,PHI: NULL
47081,Please extract HIPAA related information from ...,PHI: NULL
47082,Please extract HIPAA related information from ...,PHI: NULL


In [None]:
# Save the predictions after the rule-based pass as JSON Lines (one record per line).
output_file = f"./Result/{model_name}-{config['generation_max_length']}-after_rule_predicted.jsonl"
test_df.to_json(output_file, orient="records", lines=True)

In [None]:
# Keep only rows whose completion is something other than the literal 'PHI: NULL',
# then renumber the index from 0.
test_df = test_df[test_df['completion'] != 'PHI: NULL']
test_df = test_df.reset_index(drop=True)
# test_df

In [None]:
# Function to Split Multiple Answers in the 'Completion' Column into Separate Rows
def split_completions(row):
    """
    Split a row whose 'completion' holds several ' | '-separated answers.

    Returns a list with one copy of `row` per answer; each copy's 'completion' is that
    answer terminated by exactly one ' END'. The input row is not modified.

    Only a trailing ' END' marker is removed before the marker is re-appended. Removing
    every ' END' substring would also eat into content that merely contains it
    (e.g. 'WEST ENDERBY' would become 'WESTERBY').
    """
    end_marker = ' END'
    new_rows = []
    for comp in row['completion'].split(' | '):
        # Strip the terminator only when it really is at the end of this answer.
        trimmed = comp.rstrip()
        if trimmed.endswith(end_marker):
            comp = trimmed[:-len(end_marker)]
        new_row = row.copy()
        new_row['completion'] = comp + end_marker
        new_rows.append(new_row)
    return new_rows

# Split every row into one row per answer (a list of rows per input row),
# flatten the nested lists, and build a new DataFrame from the flattened rows.
split_rows = [split_completions(row) for index, row in test_df.iterrows()]
flat_list = [item for sublist in split_rows for item in sublist]
new_df = pd.DataFrame(flat_list)

In [None]:
# Convert the DataFrame to a list of {"prompt", "completion"} dicts for the
# normalization and submission-formatting steps below.
predictions_json = new_df.to_dict(orient='records')
predictions_json = [{"prompt": record["prompt"], "completion": record["completion"]} for record in predictions_json]

# Show the first two items as a spot check
predictions_json[0:2]

[{'prompt': 'Please extract HIPAA related information from the given text: file ID:file35016, start:0, content:SPR no: 10L407622M\n\n###\n\n',
  'completion': 'Type: IDNUM, Content: 10L407622M END'},
 {'prompt': 'Please extract HIPAA related information from the given text: file ID:file35016, start:0, content:SPR no: 10L407622M\n\n###\n\n',
  'completion': 'Type: IDNUM, Content: 10L40762 END'}]

In [None]:
def normalize_set(data):
    """
    Fix the normalized value of SET entities for 'once', 'twice' and 'three times'.

    For every 'Type: SET, Content: <text> (Normalized: <value>)' occurrence in an item's
    'completion', the normalized value is replaced by R1/R2/R3 when <text> is one of the
    three known phrases (compared case-insensitively, ignoring surrounding whitespace).
    Any other SET entity keeps the normalized value it already had.

    Items are modified in place; the same list is returned. One line is printed per
    SET entity examined.
    """
    normalization_map = {
        'once': 'R1',
        'twice': 'R2',
        'three times': 'R3',
    }
    set_pattern = r'Type: SET, Content: (.*?) \(Normalized: (.*?)\)'

    for item in data:
        for set_str, old_normalized in re.findall(set_pattern, item["completion"]):
            # Fall back to the existing value: substituting the raw content text would
            # destroy a normalization the model already produced for other phrases.
            normalized_str = normalization_map.get(set_str.strip().lower(), old_normalized)
            new_completion = f"Type: SET, Content: {set_str} (Normalized: {normalized_str})"
            item["completion"] = item["completion"].replace(
                f"Type: SET, Content: {set_str} (Normalized: {old_normalized})",
                new_completion
            )
            print("Original:", set_str, "| Old normalized:", old_normalized, "| New:", new_completion)

    return data

normalized_set = normalize_set(predictions_json)

Original: twice | Old normalized: R2 | New: Type: SET, Content: twice (Normalized: R2)
Original: twice | Old normalized: R2 | New: Type: SET, Content: twice (Normalized: R2)
Original: twice | Old normalized: R2 | New: Type: SET, Content: twice (Normalized: R2)


In [None]:
# Candidate strptime layouts for TIME entities. They are tried in this order and the
# first one that parses wins, so the relative order of ambiguous layouts (day/month vs
# month/day) is significant. The list contains repeated entries; dict.fromkeys drops
# the repeats while keeping the first occurrence, which leaves the outcome unchanged.
_TIME_FORMATS = tuple(dict.fromkeys((
    "%d/%m/%Y at %H:%Mhr", "%Y-%m-%d %H:%M:%S", "%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M", "%I:%M%p on %d/%m/%Y",
    "%I%M%p on %d.%m.%y", "%I.%M%p on %d.%m.%y", "%d/%m/%y on %I%p", "%I:%M%p on the %d.%m.%Y", "%I:%M%p on %d/%m/%y",
    "%d/%m/%Y at %H:%M", "%d.%m.%y at %I:%M %p",  "%I.%M%p on %d.%m.%y", "%d.%m.%y on %I:%M %p", "%I:%M%p on %d.%m.%y",
    "%I:%M%p on %d/%m/%y", "%d/%m/%Y %I:%M %p", "%H:%M on %d/%m/%y", "%I:%M%p at %d/%m/%Y", "%H:%M on %m/%d/%Y",
    "%I:%M%p on %d.%m.%Y", "%I:%M%p on %d/%m/%y", "%d/%m/%y", "%H%M%hrs on %d.%m.%y", "%H:%M on %d/%m/%y",
    "%I:%M%p on %d/%m/%y", "%I:%M%p on %m/%d/%Y", "%H:%M:%S on %d/%m/%Y", "%H:%M:%S on %m/%d/%Y", "%I:%M%p on %m/%d/%Y",
    "%H:%M on %m/%d/%Y", "%H:%M:%S on %d/%m/%Y", "%H:%M:%S on %m/%d/%Y", "%H:%M:%S on %d.%m.%Y", "%H:%M:%S on %m.%d.%Y",
    "%I:%M%p on %m.%d.%Y", "%H:%M:%S on %d/%m/%y", "%H:%M:%S on %m/%d/%y", "%I:%M%p on %m/%d/%y", "%H:%M:%S %p on %d/%m/%Y",
    "%H:%M:%S %p on %m/%d/%Y", "%I:%M:%S %p on %d.%m.%Y", "%H:%M:%S %p on %m.%d.%Y",  "%I:%M:%S %p on %d/%m/%y", "%H:%M:%S %p on %m/%d/%y",
    "%I:%M:%S %p on %m/%d/%y", "%H:%M:%S %p on %d/%m/%Y", "%H:%M:%S %p on %m/%d/%Y", "%I:%M:%S %p on %d.%m.%Y",
    "%I:%M:%S %p on %m.%d.%Y", "%I:%M:%S %p on %d/%m/%y", "%I:%M:%S %p on %m/%d/%y", "%I:%M:%S %p on %d.%m.%y",
    "%I:%M:%S %p on %m.%d.%y", "%I:%M %p on %d/%m/%Y", "%I:%M %p on %m/%d/%Y", "%I:%M %p on %d.%m.%Y", "%I:%M %p on %m.%d.%Y",
    "%I:%M %p on %d/%m/%y", "%I:%M %p on %m/%d/%y", "%I:%M %p on %d.%m.%y", "%I:%M %p on %m.%d.%y",
    "%H:%M on %d.%m.%y", "%I:%M %p on %d/%m/%Y", "%I:%M %p on %m/%d/%Y", "%I:%M %p on %d.%m.%Y", "%I:%M %p on %m.%d.%Y",
    "%I:%M %p on %d/%m/%y", "%I:%M %p on %m/%d/%y", "%I:%M %p on %d.%m.%y", "%I:%M %p on %m.%d.%y", "%H:%M:%S on %d.%m.%y",
    "%H:%M:%S %p on %d.%m.%y", "%H:%M:%S on %m.%d.%y", "%H:%M:%S %p on %m.%d.%y", "%I:%M %p on %m/%d/%Y",
    "%H:%M:%S %p on %m/%d/%Y", "%I:%M %p on %d.%m.%Y", "%H:%M:%S %p on %d.%m.%Y", "%I:%M %p on %m/%d/%y",
    "%H:%M:%S %p on %m/%d/%y", "%I:%M %p on %d.%m.%y", "%H:%M:%S %p on %d.%m.%y", "%I:%M %p on %m.%d.%y",
    "%H:%M:%S %p on %m.%d.%y", "%I:%M %p on %d.%m.%Y", "%H:%M:%S %p on %d.%m.%Y", "%I:%M %p on %m.%d.%Y",
    "%H:%M:%S %p on %m.%d.%Y", "%H:%M:%S %p on %d.%m.%Y", "%H:%M:%S %p on %m.%d.%Y", "%H:%M:%S %p on %d.%m.%y",
    "%H:%M:%S %p on %m.%d.%y", "%H:%M:%S %p on %d.%m.%Y", "%H:%M:%S %p on %m.%d.%Y", "%I:%M %p on %d.%m.%y",
    "%H:%M:%S %p on %d.%m.%y", "%I:%M %p on %m.%d.%y", "%H:%M:%S %p on %m.%d.%y", "%I:%M %p on %d.%m.%Y",
    "%H:%M:%S %p on %d.%m.%Y", "%I:%M %p on %m.%d.%Y", "%H:%M:%S %p on %m.%d.%Y", "%I:%M %p on %d.%m.%Y",
    "%H:%M:%S %p on %d.%m.%Y", "%I:%M %p on %m.%d.%Y", "%H:%M:%S %p on %m.%d.%Y", "%H:%M %p on %d.%m.%y",
    "%H:%M:%S %p on %d.%m.%y", "%H:%M:%S %p on %m.%d.%y", "%H:%M:%S %p on %d.%m.%Y", "%H:%M:%S %p on %m.%d.%Y",
    "%H:%M %p on %d.%m.%Y", "%H:%M:%S %p on %d.%m.%Y", "%H:%M %p on %m.%d.%Y", "%H:%M:%S %p on %m.%d.%Y",
    "%I:%M %p on %d.%m.%Y", "%H:%M:%S %p on %d.%m.%Y", "%I:%M %p on %m.%d.%Y", "%H:%M:%S %p on %m.%d.%Y",
    "%I:%M %p on %d.%m.%y", "%H:%M:%S %p on %d.%m.%y", "%I:%M %p on %m.%d.%y", "%H:%M:%S %p on %m.%d.%y",
    "%H%Mhrs on %d.%m.%y", "%H%Mhrs on %d.%m.%Y", "%H:%Mhrs on %d.%m.%Y", "%I:%M %p on the %d/%m/%y",
    "%d/%m/%Y", "%H.%M on %d/.%m.%y", "%d/%m/%Yat %H:%M", "%I.%M%p on %d/%m/%y", "%H.%M on %d.%m.%y",
    "%H%Mhr on %d/%m/%y", "%H.%M%p on %d.%m.%y", "%H.%M%p on %d/%m/%y", "%H.%M on %d/%m/%y",
    "%I%p on %d.%m.%y", "%H.%M o %d.%m.%y", "%I:%M%p on %d.%m.%y", "%H:%M%p on %d/%m/%y",
    "%I:%M%p on %d/%m/%y", "%H:%M%p on %d.%m.%y", "%H:%M on the %dth of %B %Y", "%H%M on the %d/%m/%y",
    "%H:%M on %d.%m.%Y", "%d.%m.%Y at %I:%M%p", "%d/%m/%y at %I:%M%p", "%H%M%hrs on %D.%M.%Y", "%d.%m.%y at %I:%M%p",
    "%H%M on the %d of %B %Y", "%I:%M%p on the %d of %B %Y", "%d/%m/%y %I:%M %p", "%dth of %B %Y at %I:%M%p",
    "%H:%M on the %d/%m/%y", "%d/%m/%y at %I%p", "%H:%Mhrs on %d/%m/%y", "%I%M on the %dth of %B %Y",
    "%I:%M%p on the %dth of %B %Y", "%I:%M%p on the %dst of %B %Y", "%I:%M%p on the %dnd of %B %Y", "%I:%M%p on the %drd of %B %Y",
    "%H%M on the %dth of %B %Y", "%d.%m.%y", "%I:%M%p on %d/%m/%Y", "%I:%M%p on %d/%m/%Y", "%I.%M%p on %d/%m/%Y", "%I:%M%p on %d/%m/%y",
    "%I.%M%p on %d/%m/%y", "%H:%M%p on %d/%m/%Y", "%H.%M%p on %d/%m/%Y", "%H:%M%p on %d/%m/%y", "%H.%M%p on %d/%m/%y",
    "%I:%M%p on %m/%d/%Y", "%I.%M%p on %m/%d/%Y", "%I:%M%p on %m/%d/%y", "%I.%M%p on %m/%d/%y", "%H:%M%p on %m/%d/%Y",
    "%H.%M%p on %m/%d/%Y", "%H:%M%p on %m/%d/%y", "%H.%M%p on %m/%d/%y", "%I:%M%p on %d/%m/%Y", "%I.%M%p on %d/%m/%Y",
    "%I:%M%p on %d/%m/%y", "%I.%M%p on %d/%m/%y", "%H:%M%p on %d/%m/%Y", "%H.%M%p on %d/%m/%Y", "%H:%M%p on %d/%m/%y",
    "%H.%M%p on %d/%m/%y", "%d.%m.%y at %I:%M %p","%I.%M%p, %d/%m/%y", "%H:%M%p on %d.%m.%Y", "%I.%M %p on %d/%m/%y",
    "%d.%m.%Y at %I:%M%p", "%d/%m/%Y at %I:%M%p", "%d.%m.%Y at %I;%M %p", "%m/%d/%Y at %H:%M", "%I%M%p on %d/%m/%Y",
    "%I%p on %d/%m/%y", "%I%M%p on %d.%m.%y", "%I:%Mhrs %d/%m/%y", "%I%Mh on %d/%m/%Y", "%H:%M hrs on %d/%m/%y", "%H:%Mhr on %d/%m/%Y",
    "%I%Mhrs on %d/%m/%Y", "%HM on %d.%m.%y", "%H%M on %d.%m.%y", "%H:%M Hrs on %d.%m.%y", "%I%p on %d.%m.%Y",
    "%d/%m/%Y at %I:%M%p", "%I:%M hours on %d/%m/%y", "%I:%M%Hrs on %d.%m.%y", "%I.%M%p, %d/%m/%y", "%d.%m.%y %I%p", "%I.%Mhrs on %d.%m.%y",
    "%I.%Mhrs on %d.%m.%y", "%I:%Mhrs, %d/%m/%y", "%d/%m/%Y at%H:%M", "%H:%M %d/%m/%y", "%H:%M %d/%m/%y", "%I:%M%p on %d-%b-%Y",
    "%I:%Mhrs on %d/%m/%Y", "%I:%Mhrs on %d/%m/%Y", "%I:%Mhrs on %d-%b-%Y", "%I%Mhrs on %d/%m/%Y", "%I%p on %d/%m/%y", "%d.%m.%y %I.%Mhrs",
    "%d/%m/%y %I:%M%p", "%H%M%p on %d.%m.%y", "%I%M%p on %d.%m.%y","%I%p on %d.%m.%y", "%I.%M%p on %d.%m.%y",
    "%H:%M %p on %d.%m.%y", "%d.%m.%Y at %I:%M %p",
    "%H%MH on %d.%m.%y", "%I%M%p on %d/%m/%Y", "%I%M%p on %d.%m.%y", "%H%Mh on %d/%m/%Y", "%H%Mhrs on %d/%m/%Y", "%H:%MH on %d/%m/%y",
    "%H:%MHrs on %d.%m.%y", "%H.%Mhrs on %d.%m.%y", "%H:%Mhr on %d/%m/%y", "%H%Mhrs on %d/%m/%Y", "%H.%M hrs on %d.%m.%y",
    "%d.%m.%Y at %I:%M %p", "%d.%m.%Y at %I:%M%p", "%d/%m/%Y at%H:%M",
    "%d.%m.%Y at %I.%M%p", "%I%M%p on %d/%m/%Y", "%m/%d/%Y at%H:%M", "%I:%M%p on the %d-%b-%Y", "%H:%Mhrs on the %d-%b-%Y"
)))


def convert_to_iso(datetime_str):
    """
    Convert a free-text date/time string to an ISO-8601 style string.

    The string is parsed with the first layout in _TIME_FORMATS that accepts it.
    Layouts that include seconds yield 'YYYY-MM-DDTHH:MM:SS'; all others yield
    'YYYY-MM-DDTHH:MM' (date-only layouts therefore end in 'T00:00').

    Parameters:
    datetime_str: The text to convert.

    Returns:
    The converted string, or `datetime_str` itself when it is missing (None/NaN) or
    when no layout parses it.
    """
    # Missing values are passed through rather than silently turned into None.
    if not pd.notna(datetime_str):
        return datetime_str

    for fmt in _TIME_FORMATS:
        try:
            parsed_date_time = datetime.strptime(datetime_str, fmt)
        except (ValueError, re.error):
            # Either the text does not fit this layout or strptime cannot compile the
            # layout at all; in both cases move on to the next candidate.
            continue
        if "%S" in fmt:
            return parsed_date_time.isoformat()
        return parsed_date_time.strftime("%Y-%m-%dT%H:%M")

    return datetime_str

def convert_to_desired_format(date_str):
    """
    Rewrite the first recognisable date in `date_str` as 'YYYY-MM-DD' (or 'YYYY').

    The patterns below are tried in order and the first one found anywhere in the
    string decides the result. For the separator-based layouts the first number is
    written as the day and the second as the month. Years are left-padded with zeros
    to four characters, so a two-digit year 'YY' comes out as '00YY'.
    Strings that match none of the patterns are returned unchanged.
    """
    def day_first(parts):
        # parts = (day, month, year)
        return f"{parts[2]:0>4}-{parts[1]:0>2}-{parts[0]:0>2}"

    rules = [
        (r"\b(\d{1,2})[./](\d{1,2})[./](\d{2})\b", day_first),      # DD.MM.YY or MM/DD/YY
        (r"\b(\d{1,2})[./](\d{1,2})[./](\d{4})\b", day_first),      # DD.MM.YYYY or MM/DD/YYYY
        (r"\b(\d{4})\b", lambda parts: f"{parts[0]:0>4}"),          # YYYY
        (r"\b(\d{4})(\d{2})(\d{2})\b",                              # YYYYMMDD
         lambda parts: f"{parts[0]}-{parts[1]:0>2}-{parts[2]:0>2}"),
        (r"\b(\d{1,2})[./](\d{1,2})(\d{4})\b", day_first),          # DD.MMYYYY (second separator missing)
        (r"\b(\d{4})(\d{2})(\d{3})\b",                              # YYYYMMDDD - extra trailing digit dropped
         lambda parts: f"{parts[0]}-{parts[1]:0>2}-{parts[2][:2]}"),
    ]

    for pattern, render in rules:
        found = re.search(pattern, date_str)
        if found:
            return render(found.groups())
    return date_str

def replace_leading_zero(date_str):
    """
    Return `date_str` with a leading '0' character replaced by '2'.

    Strings that do not start with '0' (including the empty string) are returned as is.
    Example: '0019-12-25' -> '2019-12-25'.
    """
    return "2" + date_str[1:] if date_str.startswith("0") else date_str

def process_date_time(data):
    """
    Recompute the normalized value of every DATE and TIME entity in the dataset.

    Each 'Type: DATE|TIME, Content: <text> (<value>)' occurrence in an item's
    'completion' - with or without a 'Normalized: ' prefix inside the parentheses - is
    rewritten as 'Type: <type>, Content: <text> (Normalized: <new value>)'.
    TIME values come from convert_to_iso(<text>); DATE values come from
    convert_to_desired_format(<text>) followed by replace_leading_zero.

    Items are modified in place; the same list is returned.
    """
    entity_pattern = re.compile(r'Type: (DATE|TIME), Content: (.*?) \((Normalized: )?(.*?)\)')

    def _renormalize(match):
        # Rebuild one entity from its type and content; the old value is discarded.
        dtype, dt_str = match.group(1), match.group(2)
        if dtype == "TIME":
            normalized_str = convert_to_iso(dt_str)
        else:
            normalized_str = replace_leading_zero(convert_to_desired_format(dt_str))
        return f"Type: {dtype}, Content: {dt_str} (Normalized: {normalized_str})"

    for item in data:
        # Substituting on the matched span itself means entities written without the
        # 'Normalized: ' prefix are rewritten too, instead of being silently skipped.
        item["completion"] = entity_pattern.sub(_renormalize, item["completion"])

    return data

normalized_date_time = process_date_time(normalized_set)
print('Done')

Done


In [None]:
def clean_and_reformat_content(predictions_json):
    """
    Turn prediction records into tab-separated submission lines.

    For each record the file id, start offset and content text are read from the
    prompt; records whose prompt does not have the expected layout are skipped.
    Every 'Type: ..., Content: ... [(Normalized: ...)] END' entity in the completion is
    then located in the content text (first occurrence). Entities that cannot be found
    are dropped, except that a TIME entity falls back to the first 'H:MM..m on D.M.YY'
    looking span in the content.

    Returns a list of lines of the form
    'file_id<TAB>type<TAB>start<TAB>end<TAB>text[<TAB>normalized]' each ending in a
    newline, where start/end are the record's start offset plus the position of the
    text inside the content.
    """
    prompt_layout = r'file ID:([a-zA-Z]*\d+), start:(\d+), content:(.+)\n\n###\n\n'
    entity_layout = r'Type: ([\w-]+), Content: (.*?)(?:\s+\(Normalized: ([^\)]+)\))? END'
    time_layout = r'\d{1,2}:\d{2}\w*m on \d{1,2}\.\d{1,2}\.\d{2}'

    lines = []
    for entry in predictions_json:
        header = re.search(prompt_layout, entry['prompt'])
        if header is None:
            continue
        file_id, start_field, content_text = header.groups()
        base_offset = int(start_field)

        for tag_type, text_info, normalized in re.findall(entity_layout, entry['completion']):
            offset = content_text.find(text_info.strip())

            # TIME text may differ slightly from the source; retry with the span
            # that looks like a time in the content itself.
            if offset == -1 and tag_type == "TIME":
                fallback = re.search(time_layout, content_text)
                if fallback:
                    text_info = fallback.group()
                    offset = content_text.find(text_info.strip())

            if offset == -1:
                continue

            surface = text_info.strip()
            begin = base_offset + offset
            fields = [file_id, tag_type, str(begin), str(begin + len(surface)), surface]
            if normalized:
                fields.append(normalized)
            lines.append("\t".join(fields) + "\n")

    return lines

reformatted_content = clean_and_reformat_content(normalized_date_time)

In [None]:
# Write the reformatted lines to the answer file. The encoding is given explicitly so
# the output does not depend on the platform's default text encoding.
txt_file_path = f"./Result/{model_name}-{config['generation_max_length']}-answer.txt"

with open(txt_file_path, 'w', encoding='utf-8') as file:
    # Every line already ends with a newline.
    file.writelines(reformatted_content)

In [None]:
# Package the answer file for submission; inside the archive it is named 'answer.txt'
# regardless of the run-specific file name on disk.
zip_file_path = f"./Result/{model_name}-{config['generation_max_length']}-answer.txt.zip"

# Save zip file
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
    zipf.write(txt_file_path, arcname='answer.txt')
