# Data filter

In [None]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import logging as lg
import datetime as dt
import cv2 as cv
import tensorflow as tf

# Moment the script started; embedded in log file names and log messages.
GLOBAL_TIMESTAMP = dt.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

# Suffix that replaces the image extension to locate the lane-annotation file.
LINES_FILE_EXTENSIONS_FORMAT = ".lines.txt"

# Prefix prepended to every relative dataset / description path below.
BASE_PATH = "../../"

# Folder receiving the logs, the statistics CSV and the TFRecord shards.
OUTPUT_CSV_PATH = BASE_PATH + "dataset-description"

# Image root of each split, relative to BASE_PATH.
# Train and validation share the same folder.
data_split_path = {
    "train": "datasets/CuLane/train-validation",
    "validation": "datasets/CuLane/train-validation",
    "test": "datasets/CuLane/test",
}

# File listing the samples of each split, relative to BASE_PATH.
data_split_labels_path = {
    "train": "dataset-description/train.txt",
    "validation": "dataset-description/val.txt",
    "test": "dataset-description/test.txt",
}

# Per-split counters of rows removed by data_frames_filter.
TRAIN_LOSS = 0
VALIDATION_LOSS = 0
TEST_LOSS = 0

def data_frame_log(error_line, data_label):
    """Write one message to the log file that belongs to a data split.

    Every split gets its own file,
    ``<OUTPUT_CSV_PATH>/data-filter-logging/<data_label>_<GLOBAL_TIMESTAMP>_.log``.

    Args:
        error_line: Message text to record.
        data_label: Split name; selects the log file the message goes to.
    """
    log_dir = Path(f"{OUTPUT_CSV_PATH}/data-filter-logging")

    # The function may be called before anything else created the folder.
    log_dir.mkdir(parents=True, exist_ok=True)

    logging_path = log_dir / f"{data_label}_{GLOBAL_TIMESTAMP}_.log"

    # lg.basicConfig() only configures the root logger on its first effective
    # call, so relying on it sent every split's messages to whichever file was
    # configured first. A dedicated logger per split (and per run timestamp)
    # keeps the files separate.
    logger = lg.getLogger(f"data-filter.{data_label}.{GLOBAL_TIMESTAMP}")

    if not logger.handlers:
        handler = lg.FileHandler(logging_path, mode="w")
        handler.setFormatter(lg.Formatter(lg.BASIC_FORMAT))

        logger.addHandler(handler)
        logger.setLevel(lg.INFO)

        # Keep the messages out of any handler attached to the root logger.
        logger.propagate = False

    logger.info(f"[{GLOBAL_TIMESTAMP}] | " + error_line)

def data_frames_obtain(train_frame, validation_frame, test_frame):
    """Load the train, validation and test description files.

    The three arguments are not read; they are only kept so existing callers
    keep working. Each file is parsed as space-separated text without a
    header row.

    Returns:
        Tuple ``(train_frame, validation_frame, test_frame)`` of DataFrames.
    """
    def _load(split_name):
        # Description files are addressed relative to BASE_PATH.
        description_file = f"{BASE_PATH}{data_split_labels_path[split_name]}"

        return pd.read_csv(description_file, sep=" ", header=None)

    return _load('train'), _load('validation'), _load('test')

def data_frames_filter(data_base_path, data_frame, data_label):
    """Remove rows whose image or lane-annotation file cannot be used.

    A row is dropped (and the reason logged through ``data_frame_log``) when:

    * the image file does not exist,
    * the ``.lines.txt`` file next to it does not exist or is empty,
    * the lines file contains a token that is not a number,
    * the lines file holds fewer than two non-blank lines.

    ``data_frame`` is modified in place and its index is reset afterwards.
    The number of dropped rows is added to ``TRAIN_LOSS``,
    ``VALIDATION_LOSS`` or ``TEST_LOSS`` depending on ``data_label``.

    Args:
        data_base_path: Image root of the split, relative to ``BASE_PATH``.
        data_frame: Frame whose column ``0`` holds the image paths.
        data_label: Split name; selects the log file and the counter.
    """
    global TRAIN_LOSS, VALIDATION_LOSS, TEST_LOSS

    Path(f"{OUTPUT_CSV_PATH}/data-filter-logging").mkdir(parents=True, exist_ok=True)

    # Iterate over a snapshot so the frame itself is only touched once, below.
    image_paths = data_frame[0].copy()

    rejected_rows = []

    for row_label, path_frame in tqdm(image_paths.items(), total=len(image_paths), desc=f"{data_label} data filtering", colour="GREEN"):
        path = Path(f"{BASE_PATH}{data_base_path}{path_frame}")

        path_lane = path.with_suffix(LINES_FILE_EXTENSIONS_FORMAT)

        file_id = path.stem

        problem = None

        if not path.exists():
            problem = f"Image file index: [{file_id}] | According to path [{path}] | Does not exist"

        elif not path_lane.exists():
            problem = f"Line file index: [{file_id}] | According to path [{path_lane}] | Does not exist"

        elif path_lane.stat().st_size == 0:
            problem = f"Line file index: [{file_id}] | According to path [{path_lane}] | Is empty"

        else:
            try:
                with open(path_lane, 'r') as file:
                    # Blank lines carry no coordinates and must not count as lane lines.
                    lines = [list(map(int, map(float, line.split()))) for line in file if line.strip()]
            except ValueError:
                # A malformed annotation invalidates one sample, not the whole run.
                problem = f"Line file index: [{file_id}] | According to path [{path_lane}] | Contains non-numeric values"
            else:
                if len(lines) < 2:
                    problem = f"Line file index: [{file_id}] | According to path [{path_lane}] | Line quantity below minimum [2]"

        if problem is not None:
            data_frame_log(problem, data_label)

            rejected_rows.append(row_label)

    # One drop by index label instead of one per bad row inside the loop.
    data_frame.drop(index=rejected_rows, inplace=True)

    data_frame.reset_index(drop=True, inplace=True)

    split_names = list(data_split_path)

    if split_names[0] == data_label:
        TRAIN_LOSS += len(rejected_rows)
    elif split_names[1] == data_label:
        VALIDATION_LOSS += len(rejected_rows)
    else:
        TEST_LOSS += len(rejected_rows)

def data_process_mask(lines_file, image_file):
    """Rasterise the areas between neighbouring lane lines into a label mask.

    Every non-blank row of ``lines_file`` is read as a flat
    ``x y x y ...`` list of numbers. For each pair of consecutive rows the
    area between them is filled with quadrilaterals built from one segment
    of each row; when one row has fewer segments, its last segment is reused
    until the other row is exhausted. The first pair is filled with value 1,
    the second with 2, and so on. A stray trailing value (odd count) in a
    row is ignored.

    Args:
        lines_file: Path of the lane-annotation text file.
        image_file: Path of the image; only its height and width are used.

    Returns:
        Tuple ``(mask, lane_quantity)``: ``mask`` is a ``uint8`` array of
        shape ``(height, width, 3)`` with the same labels in all three
        channels, ``lane_quantity`` is the number of filled row pairs.

    Raises:
        OSError: The image could not be read.
        ValueError: A row taking part in a pair has fewer than two points,
            or the file holds a non-numeric token.
    """
    # str(): not every OpenCV build accepts pathlib.Path objects.
    image = cv.imread(str(image_file))

    # imread reports failure by returning None instead of raising.
    if image is None:
        raise OSError(f"Image could not be read: [{image_file}]")

    mask_height, mask_width = image.shape[:2]

    mask = np.zeros((mask_height, mask_width), dtype=np.uint8)

    with open(lines_file, 'r') as file:
        lines = [list(map(int, map(float, line.split()))) for line in file if line.strip()]

    lane_id = 0

    for first_line, second_line in zip(lines, lines[1:]):
        if len(first_line) < 4 or len(second_line) < 4:
            raise ValueError(f"Lane line with fewer than two points in [{lines_file}]")

        # Offset of the last complete segment (two points = four values) of each row.
        first_last = (len(first_line) // 2 - 2) * 2
        second_last = (len(second_line) // 2 - 2) * 2

        for offset in range(0, max(first_last, second_last) + 2, 2):
            # The shorter row stays on its final segment while the longer one advances.
            first_id = min(offset, first_last)
            second_id = min(offset, second_last)

            x1, y1 = first_line[first_id], first_line[first_id + 1]
            x2, y2 = first_line[first_id + 2], first_line[first_id + 3]
            x3, y3 = second_line[second_id], second_line[second_id + 1]
            x4, y4 = second_line[second_id + 2], second_line[second_id + 3]

            # fillPoly needs 32-bit integer points; the default integer dtype
            # of np.array is platform dependent.
            quad = np.array([[x1, y1], [x2, y2], [x4, y4], [x3, y3]], dtype=np.int32)

            # LINE_8 keeps every pixel at an exact label value; anti-aliasing
            # would blend edge pixels into values of other labels.
            cv.fillPoly(mask, [quad], lane_id + 1, lineType=cv.LINE_8)

        lane_id += 1

    return np.stack([mask] * 3, axis=-1), lane_id

def resize_and_pad_mask(mask, target_size=512, pad_value=255):
    """Scale a mask to fit a square canvas and pad the remainder.

    Only channel 0 of ``mask`` is used. It is resized with nearest-neighbour
    interpolation so that its longer side becomes ``target_size`` while the
    aspect ratio is kept, then centred on a ``target_size`` x ``target_size``
    canvas whose border is filled with ``pad_value``.

    Args:
        mask: Array indexed as ``[row, column, channel]``.
        target_size: Side length of the square output.
        pad_value: Constant written into the padded border.

    Returns:
        The padded single-channel mask as ``uint8``.
    """
    channel = mask[:, :, 0]

    src_height, src_width = channel.shape

    # One common factor for both axes preserves the aspect ratio.
    ratio = min(target_size / src_height, target_size / src_width)
    dst_height = int(round(src_height * ratio))
    dst_width = int(round(src_width * ratio))

    # cv.resize takes the size as (width, height).
    scaled = cv.resize(channel, (dst_width, dst_height), interpolation=cv.INTER_NEAREST)

    spare_rows = target_size - dst_height
    spare_cols = target_size - dst_width

    # Any odd leftover pixel goes to the bottom / right side.
    top = spare_rows // 2
    bottom = spare_rows - top
    left = spare_cols // 2
    right = spare_cols - left

    canvas = cv.copyMakeBorder(scaled, top, bottom, left, right, borderType=cv.BORDER_CONSTANT, value=pad_value)

    return canvas.astype(np.uint8)

def _bytes_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` with a one-element BytesList."""
    bytes_list = tf.train.BytesList(value=[value])

    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Wrap one integer in a ``tf.train.Feature`` with a one-element Int64List."""
    int64_list = tf.train.Int64List(value=[value])

    return tf.train.Feature(int64_list=int64_list)

def serialize_example(image_path, mask_tensor, lane_quantity):
    """Serialise one sample into a ``tf.train.Example`` byte string.

    Args:
        image_path: Image path as text; stored UTF-8 encoded.
        mask_tensor: Array whose raw buffer (``tobytes()``) and first two
            dimensions are stored.
        lane_quantity: Integer stored under ``lane_quantity``.

    Returns:
        The serialised ``tf.train.Example``.
    """
    mask_height = mask_tensor.shape[0]
    mask_width = mask_tensor.shape[1]

    features = tf.train.Features(feature={
        'image_path': _bytes_feature(image_path.encode('utf-8')),
        'mask_raw': _bytes_feature(mask_tensor.tobytes()),
        'height': _int64_feature(mask_height),
        'width': _int64_feature(mask_width),
        'lane_quantity': _int64_feature(lane_quantity),
    })

    return tf.train.Example(features=features).SerializeToString()

def tensors_formation(data_base_path, data_frame, data_label, shard_size=1000):
    """Write the samples of one split into ZLIB-compressed TFRecord shards.

    Shards are stored in ``<OUTPUT_CSV_PATH>/<data_label>/`` and named
    ``<data_label>_<shard>-of-<total>.tfrecord``. For every row the lane mask
    is built, resized and padded, then serialised together with the image
    path and the lane count. A sample that fails is logged through
    ``data_frame_log`` and skipped; the remaining samples are still written.

    Args:
        data_base_path: Image root of the split, relative to ``BASE_PATH``.
        data_frame: Frame whose column ``0`` holds the image paths.
        data_label: Split name; used for the folder, file names and logs.
        shard_size: Maximum number of samples per shard; must be positive.

    Raises:
        ValueError: ``shard_size`` is not positive.
    """
    if shard_size <= 0:
        raise ValueError(f"shard_size must be positive, got {shard_size}")

    # parents=True also creates OUTPUT_CSV_PATH itself when it is missing.
    tfrecord_dir = Path(f"{OUTPUT_CSV_PATH}/{data_label}")
    tfrecord_dir.mkdir(parents=True, exist_ok=True)

    options = tf.io.TFRecordOptions(compression_type="ZLIB")
    total_samples = len(data_frame)
    num_shards = (total_samples + shard_size - 1) // shard_size

    for shard_id in range(num_shards):
        start_idx = shard_id * shard_size
        end_idx = min((shard_id + 1) * shard_size, total_samples)

        shard_filename = tfrecord_dir / f"{data_label}_{shard_id:05d}-of-{num_shards:05d}.tfrecord"
        with tf.io.TFRecordWriter(str(shard_filename), options=options) as writer:
            for ind in tqdm(range(start_idx, end_idx), desc=f"{data_label} shard {shard_id+1}/{num_shards}", colour="BLUE"):
                try:
                    # ind is a row position, so read it positionally; this does
                    # not depend on the frame's index labels.
                    general_image_file_path = Path(data_base_path + data_frame[0].iloc[ind])
                    image_file_path = Path(f"{BASE_PATH}{general_image_file_path}")
                    line_file_path = Path(f"{BASE_PATH}{general_image_file_path.with_suffix(LINES_FILE_EXTENSIONS_FORMAT)}")

                    mask, lanes = data_process_mask(line_file_path, image_file_path)
                    mask = resize_and_pad_mask(mask)
                    serialized_example = serialize_example(str(general_image_file_path), mask, lanes)
                    writer.write(serialized_example)
                except Exception as e:
                    # Deliberate best effort: one broken sample must not abort the export.
                    data_frame_log(f"Serialization error at index {ind}: {str(e)}", data_label)

# Placeholders only: data_frames_obtain does not read the values passed in.
train_frame = validation_frame = test_frame = None

train_frame, validation_frame, test_frame = data_frames_obtain(train_frame, validation_frame, test_frame)

# Step 1: drop rows with missing or unusable files (frames are modified in place).
data_frames_filter(data_split_path['train'], train_frame, list(data_split_path.keys())[0])

data_frames_filter(data_split_path['validation'], validation_frame, list(data_split_path.keys())[1])

data_frames_filter(data_split_path['test'], test_frame, list(data_split_path.keys())[2])

# Step 2: store how many rows each split lost. The explicit one-row index is
# required because pandas cannot build a DataFrame from scalar values alone.
statistics_df = pd.DataFrame({"train_loss": TRAIN_LOSS, "validation_loss": VALIDATION_LOSS, "test_loss": TEST_LOSS}, index=[0])

statistics_df.to_csv(f"{OUTPUT_CSV_PATH}/loss_statistics.csv")

# Step 3: export the remaining samples of every split as TFRecord shards.
tensors_formation(data_split_path['train'], train_frame, list(data_split_path.keys())[0])

tensors_formation(data_split_path['validation'], validation_frame, list(data_split_path.keys())[1])

tensors_formation(data_split_path['test'], test_frame, list(data_split_path.keys())[2])