In [None]:
!pip install git+https://github.com/huggingface/peft
!pip install accelerate transformers einops datasets peft bitsandbytes
from kaggle_secrets import UserSecretsClient
import os
user_secrets = UserSecretsClient()
githubkey = user_secrets.get_secret("GITHUB")

!rm -rf AIC2024_Phi_version
command = f'''git clone https://{githubkey}@github.com/phuvinhnguyen/AIC2024_Phi_version.git'''
os.system(command)
import sys
sys.path.append("AIC2024_Phi_version")

In [2]:
import torch
from dataset.dataset import AICDataset, CombineAICDataset
from dataset.collator import VidLangCollator
from transformers import AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from model.phiver_configuration import *
from model.modeling_phiver import *
from peft import LoraConfig, get_peft_model

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("CUDA is available. Using GPU." if use_cuda else "CUDA is not available. Using CPU.")

# Load the tokenizer published under the `microsoft/phi-1_5` checkpoint name.
# The same tokenizer object is handed to the datasets and to the collator
# further down, and is used to decode the generated token ids.
check_point = 'microsoft/phi-1_5'
tokenizer = AutoTokenizer.from_pretrained(check_point)

def count_parameters(model):
    """Print the number of trainable parameters and the total parameter count.

    Args:
        model: Object whose ``parameters()`` yields items exposing ``numel()``
            and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The two counts are written to stdout.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    print(f'trainable_params: {trainable}')

    total = 0
    for param in model.parameters():
        total += param.numel()
    print(f'total_params: {total}')

# Input locations -- adjust these to wherever the data is mounted in your environment.
# Caption annotations: full public test set, its trimmed subset, and the external (BDD) set.
TEST_CAPTIONS_DIR = '/kaggle/input/aic24-test/WTS_DATASET_PUBLIC_TEST/WTS_DATASET_PUBLIC_TEST/annotations/caption/test/public_challenge'
TEST_CAPTIONS_TRIM_DIR = '/kaggle/input/aic24-test/WTS_DATASET_PUBLIC_TEST/WTS_DATASET_PUBLIC_TEST/annotations/caption/test/public_challenge/normal_trimmed'
TEST_CAPTIONS_EXTERNAL_DIR = '/kaggle/input/aic24-test/WTS_DATASET_PUBLIC_TEST/WTS_DATASET_PUBLIC_TEST/external/BDD_PC_5K/annotations/caption/test/public_challenge'

# Pre-extracted video features.
# NOTE(review): TEST_VIDEOS_DIR ends in `matrix/train` although it feeds the test
# datasets -- presumably just how the feature dump was named; confirm.
TEST_VIDEOS_DIR = '/kaggle/input/aic24-videofeatures-test/matrix/train'
TEST_VIDEO_EXTERNAL_DIR = '/kaggle/input/aic24vidtestexternal/matrix/testexternal'

# Arguments every test dataset shares.
shared_dataset_kwargs = dict(tokenizer=tokenizer, is_test=True, device=device)

test_dataset = AICDataset(
    captions_path=TEST_CAPTIONS_DIR,
    videos_path=TEST_VIDEOS_DIR,
    **shared_dataset_kwargs,
)
test_trim_dataset = AICDataset(
    captions_path=TEST_CAPTIONS_TRIM_DIR,
    videos_path=TEST_VIDEOS_DIR,
    **shared_dataset_kwargs,
)
test_exte_dataset = AICDataset(
    captions_path=TEST_CAPTIONS_EXTERNAL_DIR,
    videos_path=TEST_VIDEO_EXTERNAL_DIR,
    dataset_type=1,
    **shared_dataset_kwargs,
)

# Single dataset that the inference loop indexes from 0 to len - 1.
test_full_dataset = CombineAICDataset([test_dataset, test_trim_dataset, test_exte_dataset])

# Adapter configuration for the language model.
# NOTE(review): the previous comment said "DoRA", but `use_dora` is not passed
# here, so this looks like a plain LoRA configuration -- confirm the intent.
# Do not flip it without re-checking the checkpoint loaded below, whose keys
# must match the adapter layout created here.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'dense'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Build the model from a default PhiverConfig and attach the adapter configured
# above, so the module tree exists before the checkpoint is loaded into it.
config = PhiverConfig()
model = VideoPhiForCausalLM(config)
model.add_adapter(lora_config)

# Load checkpoint of the model (torch checkpoint).
# map_location='cpu' keeps the load working when the checkpoint was saved from a
# GPU but this session fell back to CPU, and avoids holding a second copy of the
# weights on the GPU; the model itself is moved to `device` before inference.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load('/kaggle/input/aic24-phiver-checkpoint/original', map_location='cpu')
model.load_state_dict(state_dict)

# Sanity check: report trainable vs. total parameter counts.
count_parameters(model)

2024-03-22 16:10:20.624885: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-22 16:10:20.624984: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-22 16:10:20.749189: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


CUDA is available. Using GPU.


  output_pedestrian['inputs_embeds'] = torch.tensor(output_pedestrian['event_embeds']).unsqueeze(0).to(self.device)
  output_vehicle['inputs_embeds'] = torch.tensor(output_vehicle['event_embeds']).unsqueeze(0).to(self.device)


config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

trainable_params: 6754304
total_params: 1568489728


In [3]:
import json
from tqdm import tqdm

import os  # needed for the atomic rename of the checkpoint file below

# Move the model to the selected device and switch to evaluation mode.
model.to(device).eval()

file_path = "./predict.json"
json_objects = []

# Collator in inference mode (flag read by VidLangCollator).
collator = VidLangCollator(tokenizer, mlm=False)
collator.inference = True

batch_size = 8
start = 0
num_samples = len(test_full_dataset)

for index in tqdm(range(start, num_samples, batch_size)):
    # The last batch may be shorter than batch_size.
    final_index = min(index + batch_size, num_samples)

    datas = [test_full_dataset[i] for i in range(index, final_index)]
    # 'id' is removed from each sample before collation; phase/time stay in place.
    __ids = [data.pop('id') for data in datas]
    phases = [data['phase'] for data in datas]
    times = [data['time'] for data in datas]

    inputs = collator(datas)

    # Inference only: make sure no autograd graph is built, whatever the
    # model's own generate() does internally.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=200)

    texts = tokenizer.batch_decode(output)

    json_objects.extend(
        {
            'id': __id,
            'phase': phase.tolist(),
            'text': text,
            'time': time_span.tolist()
        }
        for __id, phase, text, time_span in zip(__ids, phases, texts, times)
    )

    # Checkpoint all predictions so far after every batch. Write to a temporary
    # file and rename it so an interruption mid-write cannot leave a truncated
    # predict.json behind.
    tmp_path = file_path + '.tmp'
    with open(tmp_path, 'w') as json_file:
        json.dump(json_objects, json_file, indent=4)
    os.replace(tmp_path, file_path)

  pad_raw = (torch.tensor(max_shape) - torch.tensor(tensor_shape)).tolist()
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/747 [00:09<1:56:30,  9.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/747 [00:16<1:38:09,  7.91s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/747 [00:23<1:32:05,  7.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 4/747 [00:30<1:29:39,  7.24s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 5/747 [00:36<1:27:55,  7.11s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 6/747 [00:43<1:27:11,  7.06s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 7/747 [00:50<1:26:29,  7.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 8/747 [00:57<1:26:38

# Final output is saved to submit.json

In [4]:
import json
from typing import Dict
from typing_extensions import Tuple

# Read back the per-sample predictions written to predict.json by the inference loop.
with open('./predict.json', 'r') as rf:
    datas = json.load(rf)

# Phase name or digit string -> phase index.
# NOTE(review): this mapping is not referenced by any code visible in this notebook.
mapping_phases = {'prerecognition': 0, 'recognition': 1, 'judgment': 2, 'action': 3, 'avoidance': 4, "0": 0, "1": 1, "2": 2, "3": 3, "4": 4}

def phase2token(phase: int):
    """Decode a combined phase token into ``(phase_label, object_name)``.

    Tokens 0..9 are laid out as ``phase_label * 2 + object_index`` where
    object index 0 is ``'pedestrian'`` and 1 is ``'vehicle'``.

    Args:
        phase: Combined token in the range 0..9.

    Returns:
        Tuple ``(phase_label, object_name)`` with ``phase_label`` in 0..4.

    Raises:
        KeyError: If ``phase`` is not one of the ten known tokens.
    """
    object_names = ('pedestrian', 'vehicle')

    decode_table = {}
    for label in range(5):
        for object_index in range(2):
            decode_table[2 * label + object_index] = (label, object_names[object_index])

    return decode_table[phase]

def get_correct_json(ids: Tuple, values: Dict):
    """Find the annotated event phase for ``ids`` and merge ``values`` into it.

    Args:
        ids: Tuple whose first element is the grouping key returned to the
            caller, whose second element is the integer phase label, and whose
            last element is the path of the caption JSON file.
        values: Key/value pairs merged into the matching phase record.

    Returns:
        ``(ids[0], record)`` where ``record`` is the first entry of the file's
        ``event_phase`` list whose first label equals the phase, updated with
        ``values``.

    Raises:
        ValueError: If no entry of ``event_phase`` carries the requested phase.
            Previously the function fell through and returned None, which only
            failed later as an opaque unpacking TypeError at the call site.
    """
    group_key, phase, json_path = ids[0], ids[1], ids[-1]

    with open(json_path, 'r') as rf:
        phases = json.load(rf)['event_phase']

    for record in phases:
        # Labels are stored as strings in the annotation file; compare as ints.
        if phase == int(record['labels'][0]):
            record.update(values)
            return group_key, record

    raise ValueError(f'phase {phase!r} not found in event_phase of {json_path}')

# (video key, phase, caption file) -> {'caption_pedestrian': ..., 'caption_vehicle': ...}
reformed_data = {}
# video key -> list of phase records; filled by the loop that follows this one.
final_data = {}

for data in datas:
    caption_path = data.pop('id')
    phase, _object = phase2token(data.pop('phase')[0][0])
    text = data.pop('text')

    # The caption file tells us which video this prediction belongs to.
    with open(caption_path, 'r') as rf:
        json_data = json.load(rf)

    if 'video_name' in json_data:  # External
        video_key = json_data['video_name'].split('.')[0]
    elif 'vehicle_view' in json_data:  # Vehicle view
        video_key = os.path.splitext(json_data['vehicle_view'])[0]
    else:
        # Otherwise use the directory two levels above the caption file.
        video_key = caption_path.split('/')[-3]
    _id = (video_key, phase, caption_path)

    # Pedestrian and vehicle captions of the same phase land in the same dict.
    reformed_data.setdefault(_id, {})[f'caption_{_object}'] = text

# Attach each caption pair to its annotated phase record and group the records by video key.
for id_tuple, captions in reformed_data.items():
    _id, value = get_correct_json(ids=id_tuple, values=captions)
    final_data.setdefault(_id, []).append(value)

# Persist the grouped result.
with open('submit.json', 'w') as wf:
    json.dump(final_data, wf, indent=4)

In [None]:
import json
from typing import Dict
from typing_extensions import Tuple

# NOTE(review): everything in this cell up to the first json.dump to submit.json
# repeats the previous cell's post-processing line for line; consider dropping
# the repetition and keeping only the caption-cleaning code that follows it.
# Read back the per-sample predictions written to predict.json by the inference loop.
with open('./predict.json', 'r') as rf:
    datas = json.load(rf)

# Phase name or digit string -> phase index.
# NOTE(review): this mapping is not referenced by any code visible in this notebook.
mapping_phases = {'prerecognition': 0, 'recognition': 1, 'judgment': 2, 'action': 3, 'avoidance': 4, "0": 0, "1": 1, "2": 2, "3": 3, "4": 4}

def phase2token(phase: int):
    """Decode a combined phase token into ``(phase_label, object_name)``.

    Tokens 0..9 are laid out as ``phase_label * 2 + object_index`` where
    object index 0 is ``'pedestrian'`` and 1 is ``'vehicle'``.

    Args:
        phase: Combined token in the range 0..9.

    Returns:
        Tuple ``(phase_label, object_name)`` with ``phase_label`` in 0..4.

    Raises:
        KeyError: If ``phase`` is not one of the ten known tokens.
    """
    object_names = ('pedestrian', 'vehicle')

    decode_table = {}
    for label in range(5):
        for object_index in range(2):
            decode_table[2 * label + object_index] = (label, object_names[object_index])

    return decode_table[phase]

def get_correct_json(ids: Tuple, values: Dict):
    """Find the annotated event phase for ``ids`` and merge ``values`` into it.

    Args:
        ids: Tuple whose first element is the grouping key returned to the
            caller, whose second element is the integer phase label, and whose
            last element is the path of the caption JSON file.
        values: Key/value pairs merged into the matching phase record.

    Returns:
        ``(ids[0], record)`` where ``record`` is the first entry of the file's
        ``event_phase`` list whose first label equals the phase, updated with
        ``values``.

    Raises:
        ValueError: If no entry of ``event_phase`` carries the requested phase.
            Previously the function fell through and returned None, which only
            failed later as an opaque unpacking TypeError at the call site.
    """
    group_key, phase, json_path = ids[0], ids[1], ids[-1]

    with open(json_path, 'r') as rf:
        phases = json.load(rf)['event_phase']

    for record in phases:
        # Labels are stored as strings in the annotation file; compare as ints.
        if phase == int(record['labels'][0]):
            record.update(values)
            return group_key, record

    raise ValueError(f'phase {phase!r} not found in event_phase of {json_path}')

# (video key, phase, caption file) -> {'caption_pedestrian': ..., 'caption_vehicle': ...}
reformed_data = {}
# video key -> list of phase records; filled by the loop that follows this one.
final_data = {}

for data in datas:
    caption_path = data.pop('id')
    phase, _object = phase2token(data.pop('phase')[0][0])
    text = data.pop('text')

    # The caption file tells us which video this prediction belongs to.
    with open(caption_path, 'r') as rf:
        json_data = json.load(rf)

    if 'video_name' in json_data:  # External
        video_key = json_data['video_name'].split('.')[0]
    elif 'vehicle_view' in json_data:  # Vehicle view
        video_key = os.path.splitext(json_data['vehicle_view'])[0]
    else:
        # Otherwise use the directory two levels above the caption file.
        video_key = caption_path.split('/')[-3]
    _id = (video_key, phase, caption_path)

    # Pedestrian and vehicle captions of the same phase land in the same dict.
    reformed_data.setdefault(_id, {})[f'caption_{_object}'] = text

# Attach each caption pair to its annotated phase record and group the records by video key.
for id_tuple, captions in reformed_data.items():
    _id, value = get_correct_json(ids=id_tuple, values=captions)
    final_data.setdefault(_id, []).append(value)

# Persist the grouped result.
with open('submit.json', 'w') as wf:
    json.dump(final_data, wf, indent=4)

import re
from typing import Optional, Tuple

def extract_main_text(raw_text: str) -> Optional[Tuple[float, float, str]]:
    """Parse a caption of the form ``time [<start> - <end>]: event: <text>``.

    Matching is case-insensitive. The text runs up to the first ``<|eos|>``
    marker, or to the end of the string when there is none. Surrounding
    whitespace and ``!`` characters are trimmed from the text.

    Args:
        raw_text (str): Input caption line.

    Returns:
        Optional[Tuple[float, float, str]]: ``(start_time, end_time, main_text)``,
        or None when ``raw_text`` does not contain the expected structure.
    """
    caption_re = re.compile(
        r"time\s*\[\s*([\d.]+)\s*-\s*([\d.]+)\s*\]\s*:\s*event\s*:\s*(.*?)(?:<\|eos\|>|$)",
        re.IGNORECASE,
    )

    found = caption_re.search(raw_text)
    if found is None:
        return None

    begin, finish, body = found.groups()
    start_time, end_time = float(begin), float(finish)

    # Trim spaces, newlines, tabs and '!' from both ends ...
    body = body.strip("! \n\t")
    # ... then drop any run of '!' still at either edge and remaining whitespace.
    body = re.sub(r"^!+|!+$", "", body).strip()

    return (start_time, end_time, body)


def clean_captions(data: dict) -> dict:
    """Return a copy of ``data`` with the vehicle/pedestrian captions cleaned.

    For every entry that has ``caption_vehicle`` and/or ``caption_pedestrian``,
    the caption is parsed with ``extract_main_text``; on success the entry gains
    ``start_<role>`` and ``end_<role>`` and the caption is replaced by the
    extracted text.

    A caption that ``extract_main_text`` cannot parse (it returns None) is left
    unchanged and a warning is emitted. Previously the None was unpacked
    directly, so one malformed caption aborted the whole run with a TypeError.

    Args:
        data: Mapping of video id -> list of entry dicts.

    Returns:
        New mapping with the same keys; each entry is a shallow copy of the
        original, which is not modified.
    """
    import warnings

    cleaned_data = {}
    for video_id, entries in data.items():
        cleaned_entries = []
        for entry in entries:
            cleaned_entry = entry.copy()
            for role in ('vehicle', 'pedestrian'):
                caption_key = f'caption_{role}'
                if caption_key not in entry:
                    continue

                parsed = extract_main_text(entry[caption_key])
                if parsed is None:
                    # Keep the raw caption instead of crashing on the unpack.
                    warnings.warn(
                        f'{caption_key} of video {video_id!r} does not match the '
                        f'expected format; leaving it unchanged'
                    )
                    continue

                start, end, text = parsed
                cleaned_entry[f'start_{role}'] = start
                cleaned_entry[f'end_{role}'] = end
                cleaned_entry[caption_key] = text
            cleaned_entries.append(cleaned_entry)
        cleaned_data[video_id] = cleaned_entries
    return cleaned_data


# Read the grouped submission produced earlier in the notebook.
with open('submit.json') as rf:
    data = json.load(rf)

# Clean first, then open the output: if cleaning raises, submit_clean.json is
# not left behind as an empty, truncated file.
cleaned_submission = clean_captions(data)

with open('submit_clean.json', 'w') as wf:
    json.dump(cleaned_submission, wf, indent=4)