# Data filter

In [None]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import logging as lg
import datetime as dt
import cv2 as cv

# Timestamp captured once when this cell runs; it is embedded in log file
# names and in every log line.
GLOBAL_TIMESTAMP = dt.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# Suffix that replaces an image's extension to locate its lane annotation file.
LINES_FILE_EXTENSIONS_FORMAT = ".lines.txt"

# Prefix prepended to every dataset-relative path used below.
BASE_PATH = "../../"

# Directory receiving the filter logs, the saved masks and the per-split CSVs.
OUTPUT_CSV_PATH = BASE_PATH + "dataset-description"

# Image root of each split, relative to BASE_PATH. Train and validation share
# the same root. Key order matters: other code indexes the keys by position.
data_split_path = dict(
    train="datasets/CuLane/train-validation",
    validation="datasets/CuLane/train-validation",
    test="datasets/CuLane/test",
)

# Listing file of each split (one sample path per row), relative to BASE_PATH.
data_split_labels_path = dict(
    train="dataset-description/train.txt",
    validation="dataset-description/val.txt",
    test="dataset-description/test.txt",
)

# Number of samples removed from each split by the filter.
TRAIN_LOSS = 0
VALIDATION_LOSS = 0
TEST_LOSS = 0

def data_frame_log(error_line, data_label):
    """Append one message to the filter log of the given data split.

    Each split writes to its own file,
    ``<OUTPUT_CSV_PATH>/data-filter-logging/<data_label>_<GLOBAL_TIMESTAMP>_.log``.

    A dedicated logger with its own file handler is used per split.
    ``logging.basicConfig`` only configures the root logger on its first call,
    so relying on it would send the messages of every later split to the log
    file of the first split that happened to log something.

    Args:
        error_line: Text of the message; it is prefixed with the run timestamp.
        data_label: Name of the split the message belongs to.
    """
    logging_path = f"{OUTPUT_CSV_PATH}/data-filter-logging/{data_label}_{GLOBAL_TIMESTAMP}_.log"

    # The timestamp is part of the logger name so that re-running the cell
    # (which produces a new timestamp) gets a fresh logger and a fresh file.
    logger = lg.getLogger(f"data-filter.{data_label}.{GLOBAL_TIMESTAMP}")

    if not logger.handlers:
        # Make the function usable on its own: the directory may not exist yet.
        Path(logging_path).parent.mkdir(parents=True, exist_ok=True)

        handler = lg.FileHandler(logging_path, mode="w")
        # Same layout as the default format basicConfig would have applied.
        handler.setFormatter(lg.Formatter("%(levelname)s:%(name)s:%(message)s"))

        logger.addHandler(handler)
        logger.setLevel(lg.INFO)
        # Keep the records out of whatever handlers the root logger may have.
        logger.propagate = False

    logger.info(f"[{GLOBAL_TIMESTAMP}] | " + error_line)

def data_frames_obtain(train_frame, validation_frame, test_frame):
    """Load the train, validation and test listing files as data frames.

    Each listing is read as a space-separated file without a header row, from
    the location given by ``BASE_PATH`` and ``data_split_labels_path``.

    Args:
        train_frame: Ignored; kept only so existing callers keep working.
        validation_frame: Ignored; kept only so existing callers keep working.
        test_frame: Ignored; kept only so existing callers keep working.

    Returns:
        A ``(train, validation, test)`` tuple of data frames.
    """
    def _read_listing(split_name):
        # One sample per row, columns separated by single spaces.
        listing_path = f"{BASE_PATH}{data_split_labels_path[split_name]}"
        return pd.read_csv(listing_path, sep=" ", header=None)

    return tuple(_read_listing(split_name) for split_name in ("train", "validation", "test"))

def _sample_rejection_reason(path, path_lane):
    """Return the log message explaining why a sample is unusable, or None if it is usable."""
    file_id = path.stem

    if not path.exists():
        return f"Image file index: [{file_id}] | According to path [{path}] | Does not exist"

    line_prefix = f"Line file index: [{file_id}] | According to path [{path_lane}] | "

    if not path_lane.exists():
        return line_prefix + "Does not exist"

    if path_lane.stat().st_size == 0:
        return line_prefix + "Is empty"

    try:
        with open(path_lane, 'r') as file:
            # Blank lines carry no coordinates and must not count as lane lines.
            lines = [list(map(int, map(float, tokens))) for tokens in map(str.split, file) if tokens]
    except (ValueError, OverflowError):
        # A token that is not a finite number: reject the sample instead of
        # aborting the whole filtering run.
        return line_prefix + "Contains malformed coordinates"

    if len(lines) < 2:
        return line_prefix + "Line quantity below minimum [2]"

    return None


def data_frames_filter(data_base_path, data_frame, data_label):
    """Drop, in place, every sample whose image or lane annotation is unusable.

    A sample is rejected when its image file is missing, or when its lane
    annotation file is missing, empty, unparsable or holds fewer than two
    non-blank lines. Every rejection is written to the split's log through
    ``data_frame_log`` and counted in the matching global loss counter.

    Args:
        data_base_path: Image root of the split, relative to ``BASE_PATH``.
        data_frame: Listing of the split; column ``0`` holds the image path
            appended to ``data_base_path``. Modified in place: rejected rows
            are removed and the index is reset.
        data_label: Name of the split. The first key of ``data_split_path``
            counts towards ``TRAIN_LOSS``, the second towards
            ``VALIDATION_LOSS``, anything else towards ``TEST_LOSS``.
    """
    global TRAIN_LOSS, VALIDATION_LOSS, TEST_LOSS

    Path(f"{OUTPUT_CSV_PATH}/data-filter-logging").mkdir(parents=True, exist_ok=True)

    rejected_labels = []

    # Iterate over (index label, value) pairs: DataFrame.drop works on labels,
    # which only coincide with enumerate() positions for a default RangeIndex.
    samples = tqdm(data_frame[0].items(), total=len(data_frame[0]), desc=f"{data_label} data filtering", colour="GREEN")

    for row_label, path_frame in samples:
        path = Path(f"{BASE_PATH}{data_base_path}{path_frame}")

        path_lane = path.with_suffix(LINES_FILE_EXTENSIONS_FORMAT)

        reason = _sample_rejection_reason(path, path_lane)

        if reason is not None:
            data_frame_log(reason, data_label)

            rejected_labels.append(row_label)

    # A single drop for all rejected rows instead of one full pass per row.
    if rejected_labels:
        data_frame.drop(index=rejected_labels, inplace=True)

    split_names = list(data_split_path)

    if split_names[0] == data_label:
        TRAIN_LOSS += len(rejected_labels)
    elif split_names[1] == data_label:
        VALIDATION_LOSS += len(rejected_labels)
    else:
        TEST_LOSS += len(rejected_labels)

    data_frame.reset_index(drop=True, inplace=True)

def data_process_mask(lines_file, image_file):
    """Rasterise the regions between consecutive annotated lines into a label mask.

    Every non-blank line of ``lines_file`` is read as a flat sequence
    ``x0 y0 x1 y1 ...``; each value is parsed as a float and truncated to an
    integer. For each pair of consecutive lines the area between them is
    filled quad by quad with the 1-based number of that pair, walking both
    polylines in step and holding the shorter one on its last segment once it
    is exhausted.

    Args:
        lines_file: Path of the annotation file.
        image_file: Path of the matching image; it is read only to obtain the
            height and width of the mask.

    Returns:
        A ``(mask, lane_count)`` tuple. ``mask`` is a ``uint8`` array of shape
        ``(height, width, 3)`` holding the same label plane in every channel
        (0 where nothing was filled); ``lane_count`` is the number of line
        pairs that were processed.

    Raises:
        OSError: If the image cannot be read.
        ValueError: If a non-blank line has an odd number of values or fewer
            than two points.
    """
    # str(): older OpenCV Python bindings reject pathlib.Path file names.
    image = cv.imread(str(image_file))

    if image is None:
        # cv.imread reports failure by returning None instead of raising.
        raise OSError(f"Unable to read image file: {image_file}")

    # NumPy image shape is (rows, columns, channels) = (height, width, ...).
    mask_height, mask_width = image.shape[:2]

    mask = np.zeros((mask_height, mask_width), dtype=np.uint8)

    boundaries = []

    with open(lines_file, 'r') as file:
        for raw_line in file:
            tokens = raw_line.split()

            if not tokens:
                # Blank lines (e.g. a trailing newline) hold no boundary.
                continue

            values = [int(float(token)) for token in tokens]

            if len(values) % 2 or len(values) < 4:
                raise ValueError(
                    f"Line with {len(values)} values in [{lines_file}]: "
                    "expected at least two complete x/y points"
                )

            # int32 is the point type cv.fillPoly expects.
            boundaries.append(np.array(values, dtype=np.int32).reshape(-1, 2))

    lane_count = max(len(boundaries) - 1, 0)

    for lane_label, (first, second) in enumerate(zip(boundaries, boundaries[1:]), start=1):
        first_last_segment = len(first) - 2
        second_last_segment = len(second) - 2

        # One quad per segment of the longer polyline; the shorter polyline
        # stays on its final segment after it runs out of points.
        for step in range(max(first_last_segment, second_last_segment) + 1):
            i = min(step, first_last_segment)
            j = min(step, second_last_segment)

            quad = np.stack([first[i], first[i + 1], second[j + 1], second[j]])

            # No anti-aliasing: on a label mask it would blend the label with
            # the pixels underneath and produce values that are other labels.
            cv.fillPoly(mask, [quad], lane_label, lineType=cv.LINE_8)

    return np.stack([mask] * 3, axis=-1), lane_count

def tensors_formation(data_base_path, data_frame, data_label):
    """Build and save the label mask of every sample of a split, plus an index CSV.

    For each path in column ``0`` of ``data_frame`` the mask is produced by
    ``data_process_mask`` and stored compressed under
    ``<OUTPUT_CSV_PATH>/<data_label>`` at ``<sample path>.npz`` (NumPy appends
    the ``.npz`` suffix). A summary with one row per sample is written to
    ``<OUTPUT_CSV_PATH>/<data_label>.csv``.

    Args:
        data_base_path: Image root of the split, relative to ``BASE_PATH``.
        data_frame: Listing of the split; column ``0`` holds the sample path
            appended to ``data_base_path``.
        data_label: Name of the split; used for the output directory, the CSV
            file name and the progress bar.
    """
    tensors_storage = f"{OUTPUT_CSV_PATH}/{data_label}"

    Path(f"{tensors_storage}").mkdir(parents=True, exist_ok=True)

    # Rows are collected and turned into a frame once at the end; growing a
    # frame with pd.concat on every iteration copies all previous rows each time.
    records = []

    for elem in tqdm(data_frame[0], total=len(data_frame[0]), desc=f"{data_label} tensor formation", colour="BLUE"):
        general_image_file_path = Path(data_base_path + elem)

        image_file_path = Path(f"{BASE_PATH}{general_image_file_path}")

        line_file_path = Path(f"{BASE_PATH}{general_image_file_path.with_suffix(LINES_FILE_EXTENSIONS_FORMAT)}")

        mask, lanes = data_process_mask(line_file_path, image_file_path)

        # The sample path may contain sub-directories that do not exist yet.
        Path(f"{tensors_storage}{elem}").parent.mkdir(parents=True, exist_ok=True)

        np.savez_compressed(f"{tensors_storage}{elem}", mask)

        records.append({
            "image_path": str(general_image_file_path),
            # Relative to the split's storage directory.
            "tensor_path": f"{elem}.npz",
            "lane_quantity": lanes,
        })

    # The default 0..n-1 index matches the position of each sample in the listing.
    df = pd.DataFrame(records, columns=["image_path", "tensor_path", "lane_quantity"])

    df.to_csv(f"{OUTPUT_CSV_PATH}/{data_label}.csv", index=True)

# Load the three split listings. The arguments are placeholders only:
# data_frames_obtain replaces them with freshly read frames.
train_frame, validation_frame, test_frame = data_frames_obtain(None, None, None)

# Frames keyed by split name, in the same order as data_split_path.
split_frames = dict(zip(data_split_path, (train_frame, validation_frame, test_frame)))

# Pass 1: remove unusable samples from every split (frames are edited in place).
for split_label, split_frame in split_frames.items():
    data_frames_filter(data_split_path[split_label], split_frame, split_label)

# Disabled statistics export, kept for reference.
# NOTE(review): presumably disabled because building a DataFrame from scalar
# values alone needs an explicit index - confirm before re-enabling.
# statistics_df = pd.DataFrame({"train_loss": TRAIN_LOSS, "validation_loss": VALIDATION_LOSS, "test_loss": TEST_LOSS})

# statistics_df.to_csv(f"{OUTPUT_CSV_PATH}/loss_statistics.csv")

# Pass 2: build and save the masks of the samples that survived the filter.
for split_label, split_frame in split_frames.items():
    tensors_formation(data_split_path[split_label], split_frame, split_label)