In [None]:
import os

# Move from the notebook's folder up to the project root exactly once.
# The guard keeps the cell idempotent: an unconditional chdir climbs one more
# directory every time the cell is re-run. `src` is the directory the next
# cell puts on sys.path, so its presence marks the project root.
if not os.path.isdir("src"):
    os.chdir(os.path.pardir)

In [None]:
import sys

# Make the project's `src` directory importable. The membership check stops
# re-runs of this cell from stacking duplicate entries on sys.path.
if "src" not in sys.path:
    sys.path.append("src")

from crash_detection.config import ConfigurationManager
from crash_detection.components.data.ingestion import DataIngestionComponent

config = ConfigurationManager()

# Run the ingestion component and keep its artifact for the cells below.
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion_artifact = DataIngestionComponent(data_ingestion_config)()
data_ingestion_artifact

In [None]:
# Build the model-training configuration from the ingestion artifact created
# in the previous cell. A later cell iterates this object with `.items()`, so
# it is presumably a mapping of model name -> per-model config — TODO confirm.
model_training_config = config.model_training_config(data_ingestion_artifact=data_ingestion_artifact)
model_training_config

In [None]:
# Scratch check: with a trailing slash os.path.split returns an empty tail,
# i.e. ('/hello/world', '').
os.path.split("/hello/world/")

In [None]:
# Data Splitter 
from pandas import DataFrame
from crash_detection.core.config_entity import DataTransformationConfig
from crash_detection.core import Directory
from crash_detection.errors import TransformationError
from sklearn.model_selection import KFold, StratifiedKFold
from crash_detection.utils.common import save_csv
from pandas.api.types import is_integer_dtype
from sklearn.preprocessing import LabelEncoder
from typeguard import typechecked
from crash_detection.core.config_entity import DataSchema
from crash_detection import logger
import pandas as pd
import os


# @typechecked
def split_dataset(
    config: DataTransformationConfig,
    data: pd.DataFrame,
    schema: DataSchema,
    filename: str,
    outdir: Directory | None = None,
) -> DataFrame:
    """Assign every row of ``data`` to a cross-validation fold.

    The split type comes from ``config.split.type`` (``"kfold"`` or
    ``"skfold"``) and is then adjusted by the shape of ``schema.target``:
    a one-element list forces stratified folding on that column, a list with
    several columns forces plain K-fold (multi-label stratification is not
    implemented). If stratified folding is requested but no label is given,
    or the stratified split fails, plain K-fold is used instead.

    Folds are shuffled only when the ``PYTHONHASHSEED`` environment variable
    is set; its integer value is then used as the random seed.

    Args:
        config: Transformation config; only ``config.split.n_splits`` and
            ``config.split.type`` are read. It is not modified.
        data: Frame to fold. It is copied, so the caller's frame is untouched.
        schema: Dataset schema providing ``name`` and ``target``.
        filename: Name of the file ``data`` was loaded from (logs/output path).
        outdir: When given, the folded frame is also written to
            ``outdir // f"folded_{schema.name}" / filename``.

    Returns:
        A copy of ``data`` with an integer ``fold`` column in
        ``0 .. n_splits - 1``.

    Raises:
        TransformationError: If ``schema.target`` is neither ``str`` nor
            ``list``, or the resolved split type is unknown.
    """
    split_config = config.split
    dataname = schema.name
    n_splits = split_config.n_splits

    labels = schema.target
    # Resolved into a local so the shared config object is never mutated.
    split_type = split_config.type
    if isinstance(labels, list):
        if len(labels) > 1:
            logger.error("Multi Label Stratified K-Fold is not implemented yet.")
            split_type = "kfold"
        elif labels:
            labels = labels[0]
            split_type = "skfold"
        else:
            # An empty list means "no label column".
            labels = None

    if labels is not None and not isinstance(labels, (str, list)):
        e = TransformationError(
            "Labels are neither str or list[str]",
            dataname=dataname,
            file_name=filename,
        )
        logger.error(e)
        raise e

    if split_type not in ("kfold", "skfold"):
        e = TransformationError(
            f"Unknown split type '{split_type}'; expected 'kfold' or 'skfold'",
            dataname=dataname,
            file_name=filename,
        )
        logger.error(e)
        raise e

    # Copy so the caller's frame never gains the 'fold' column as a side effect.
    data = data.copy()
    fold_ids = None

    if split_type == "skfold":
        if not labels:
            logger.error(
                TransformationError(
                    f"Labels are not given to use {split_type} folding technique. \nUsing Regular KFold",
                    dataname=dataname,
                    file_name=filename,
                )
            )
        else:
            try:
                # Encode into a throwaway array rather than a temporary
                # '<col>_le' column that would have to be dropped afterwards.
                y = data[labels]
                if not is_integer_dtype(y):
                    logger.info(f"Label Encoding the dataset column '{labels}'")
                    y = LabelEncoder().fit_transform(y)

                logger.info(
                    f"Folding '{dataname}.{filename}' into {n_splits} using {split_type} on column(s) {labels}"
                )
                fold_ids = _fold_ids(_make_splitter(StratifiedKFold, n_splits), data, y)
            except Exception as exc:
                # Deliberate best effort: any stratification failure (missing
                # column, too few members per class, ...) degrades to K-fold,
                # but the real cause is logged instead of a generic message.
                logger.error(
                    TransformationError(
                        f"Could not use {split_type} on column '{labels}' ({exc}). \nUsing Regular KFold",
                        dataname=dataname,
                        file_name=filename,
                    )
                )
                fold_ids = None

    if fold_ids is None:
        logger.info(f"Folding '{dataname}.{filename}' into {n_splits} using kfold")
        fold_ids = _fold_ids(_make_splitter(KFold, n_splits), data)

    # Array assignment is positional, so it is correct for any index.
    data["fold"] = fold_ids

    if outdir:
        # Path first, frame as `data=` — the form used by the other save_csv
        # calls in this notebook. NOTE(review): confirm against save_csv.
        save_csv(outdir // f"folded_{dataname}" / filename, data=data)

    return data


def _make_splitter(splitter_cls, n_splits: int):
    """Instantiate a K-fold splitter, seeded from PYTHONHASHSEED when it is set."""
    raw_seed = os.environ.get("PYTHONHASHSEED")
    shuffle = raw_seed is not None
    try:
        seed = int(raw_seed) if shuffle else None
    except ValueError:
        # e.g. PYTHONHASHSEED=random: shuffle, but without a fixed seed.
        seed = None
    # scikit-learn rejects a random_state when shuffle is False, so the seed
    # is only ever passed together with shuffle=True.
    return splitter_cls(n_splits, shuffle=shuffle, random_state=seed)


def _fold_ids(splitter, frame: pd.DataFrame, y=None):
    """Return one fold id per row of ``frame``, in row (positional) order."""
    fold_ids = pd.Series(-1, index=pd.RangeIndex(len(frame)), dtype="int64")
    for fold, (_, test_positions) in enumerate(splitter.split(frame, y)):
        # The splitter yields positions, not index labels, hence iloc.
        fold_ids.iloc[test_positions] = fold
    return fold_ids.to_numpy()

In [None]:
from crash_detection.core.config_entity import DataTransformationConfig, DataSchema
from crash_detection.core.artifact_entity import DataTransformationArtifact
from crash_detection.utils.common import load_csv, save_csv
from crash_detection import logger
from crash_detection.core import Directory
from pathlib import Path
import pandas as pd
from typeguard import typechecked
from tqdm import tqdm
from functools import partial   
from concurrent.futures import ThreadPoolExecutor
import cv2
import numpy as np

class DataTransformationComponent:
    """Cache videos as ``.npy`` tensors and write train/valid/test CSV manifests.

    For every dataset schema the component loads the split CSVs, stacks each
    video's JPEG frames into one array saved under the output directory,
    records that array's path in a ``filename`` column, folds the training
    rows into train/validation parts and finally writes combined
    ``train.csv``, ``valid.csv`` and ``test.csv`` files.
    """

    FRAME_SIZE = 224        # frames are resized to FRAME_SIZE x FRAME_SIZE
    FRAMES_PER_VIDEO = 20   # frames read per video: <vid>_00.jpg .. <vid>_19.jpg
    MAX_WORKERS = 4         # threads used while caching videos

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    # @typechecked
    def _cache_video(self, vid: int, indir: Path, outdir: Directory) -> None:
        """Stack one video's grayscale frames and save them as ``<vid>.npy``.

        Args:
            vid: Video id; zero-padded to five digits in file names.
            indir: Folder holding the ``<vid>_<frame>.jpg`` images.
            outdir: Folder that receives the ``<vid>.npy`` array.

        Raises:
            FileNotFoundError: If a frame image is missing or unreadable.
        """
        stem = str(vid).zfill(5)
        size = self.FRAME_SIZE
        video_tensor = np.zeros((size, size, self.FRAMES_PER_VIDEO), dtype=np.uint8)
        for i in range(self.FRAMES_PER_VIDEO):
            path = indir / f"{stem}_{str(i).zfill(2)}.jpg"

            image = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
            if image is None:
                # imread signals failure by returning None; fail with the
                # offending path instead of an opaque error from cv2.resize.
                raise FileNotFoundError(f"Frame image {path} is missing or unreadable.")
            video_tensor[:, :, i] = cv2.resize(image, (size, size))

        np.save(outdir / f"{stem}.npy", video_tensor)

    def transform(self, data: pd.DataFrame, indir: Directory, outdir: Directory) -> bool:
        """Cache every unique video id in ``data['vid']`` using a thread pool.

        Returns:
            ``True`` once all videos were cached; a failure in any worker is
            re-raised here.
        """
        vid_list = data["vid"].unique().tolist()

        cache_func = partial(self._cache_video, indir=indir, outdir=outdir)

        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as pool:
            # Every id is processed: each row later gets a filename pointing
            # at its cached array, so none may be skipped.
            list(
                tqdm(
                    pool.map(cache_func, vid_list),
                    desc="Parallel Cache Input",
                    total=len(vid_list),
                )
            )
        return True

    @staticmethod
    def _concat_frames(frames: list[pd.DataFrame], columns=()) -> pd.DataFrame:
        """Concatenate ``frames``; an empty list gives an empty frame with ``columns``."""
        if not frames:
            # pd.concat raises on an empty list (e.g. no validation folds).
            return pd.DataFrame(columns=list(columns))
        return pd.concat(frames).reset_index(drop=True)

    def __call__(self, datasets: dict[str, DataSchema]) -> DataTransformationArtifact:
        """Transform every dataset and return the artifact describing the outputs.

        Args:
            datasets: Mapping of dataset name to its schema.

        Returns:
            Artifact holding the output directory and the paths of the
            combined ``train.csv``, ``valid.csv`` and ``test.csv``.

        Raises:
            FileNotFoundError: If the configured input directory is missing.
        """
        indir = self.config.indir
        outdir = self.config.outdir

        if not indir.exists():
            e = FileNotFoundError(f"Data path {indir} does not exist.")
            logger.error(e)
            raise e

        train_datas = []
        valid_datas = []
        test_datas = []

        for name, schema in datasets.items():
            # The target may be one column name or a list of them.
            target = schema.target
            if target is None:
                target_cols = []
            elif isinstance(target, (list, tuple)):
                target_cols = list(target)
            else:
                target_cols = [target]
            base_cols = ["vid", *target_cols]
            out_cols = [*base_cols, "filename"]

            for split in ("train", "test"):
                image_folder = getattr(schema, f"{split}_image_folder")

                for filename in getattr(schema, split):
                    data = load_csv(indir / name / filename)
                    data = data[base_cols].drop_duplicates(ignore_index=True)
                    self.transform(
                        data,
                        indir // name // image_folder,
                        outdir // name // image_folder,
                    )

                    # Built from the 'vid' column alone: a row-wise apply can
                    # upcast an integer id to float and yield '001.0.npy'.
                    cache_dir = outdir / name / image_folder
                    data["filename"] = [
                        cache_dir / f"{str(vid).zfill(5)}.npy" for vid in data["vid"]
                    ]

                    if split == "train":
                        data = split_dataset(
                            config=self.config,
                            data=data,
                            schema=schema,
                            filename=filename,
                            outdir=None,
                        )
                        # Folds below this threshold train, the rest validate.
                        train_folds = int(self.config.split.n_splits * self.config.split.ratio)

                        train_data = data.loc[data["fold"] < train_folds, out_cols].reset_index(drop=True)
                        valid_data = data.loc[data["fold"] >= train_folds, out_cols].reset_index(drop=True)

                        # NOTE(review): the per-dataset train.csv holds the
                        # full folded frame (all folds plus the 'fold'
                        # column), not just train_data — confirm intended.
                        save_csv(outdir / name / f"{split}.csv", data=data)

                        train_datas.append(train_data)
                        if not valid_data.empty:
                            valid_datas.append(valid_data)
                            save_csv(outdir / name / "valid.csv", data=valid_data)
                    else:
                        test_datas.append(data)
                        save_csv(outdir / name / f"{split}.csv", data=data)

        train_data = self._concat_frames(train_datas)
        # Empty valid/test frames reuse the training columns so the CSVs
        # still get a header row.
        valid_data = self._concat_frames(valid_datas, columns=train_data.columns)
        test_data = self._concat_frames(test_datas, columns=train_data.columns)

        save_csv(outdir / "train.csv", data=train_data)
        save_csv(outdir / "valid.csv", data=valid_data)
        save_csv(outdir / "test.csv", data=test_data)

        return DataTransformationArtifact(
            path=outdir,
            train_file_path=outdir / "train.csv",
            valid_file_path=outdir / "valid.csv",
            test_file_path=outdir / "test.csv",
        )


# Run the transformation once per configured model. The loop variables are
# deliberately NOT called `config`: that name is the ConfigurationManager
# created in an earlier cell, and rebinding it here would break those cells
# when they are re-run.
for model_name, training_config in model_training_config.items():
    data_transformation_component = DataTransformationComponent(config=training_config.transforms)
    data_transformation_component(training_config.datasets)

In [None]:
# Show the seed the fold splitter reads. `.get` displays None when the
# variable is unset instead of failing the cell with a KeyError.
os.environ.get("PYTHONHASHSEED")