In [1]:
import os

os.chdir("..")

In [2]:
%pwd

'/Users/morizin/Documents/Code/jigsaw-competition'

In [None]:
from pydantic import BaseModel
from src.jigsaw.entity.common import Directory
from src.jigsaw.entity.common import FilePath


class DataSplitParams(BaseModel):
    type: str
    nsplits: int = 5
    random_state: int = 2025


class DataTransformationConfig(BaseModel):
    outdir: Directory
    indir: Directory
    datasets: list[str]
    splitter: DataSplitParams
    features: dict[str, list[str]] | None
    wash: bool
    triplet: bool
    zero: bool
    matching: bool

In [None]:
from box import ConfigBox
from src.jigsaw.config.config import ConfigurationManager
from src.jigsaw.utils.common import load_yaml, load_json
from src.jigsaw.constants import *
from src.jigsaw.entity.common import FilePath, Directory
import os
from src.jigsaw import logger
# from src.jigsaw.entity.config_entity import DataSplitParams, DataTransformationConfig


class ConfigurationManager(ConfigurationManager):
    def get_data_transformation_config(self) -> DataTransformationConfig:
        split_config = self.params.splitter
        data_transform = self.config.data_transformation
        splitter = DataSplitParams(
            type=split_config.type,
            nsplits=split_config.nsplits,
            random_state=split_config.random_state,
        )
        status_file = load_json(
            self.artifact_root.path
            / os.path.join(self.config.data_validation.outdir, "status.json")
        )

        features = dict()
        names = []
        for name, schema in self.schema.items():
            names.append(name)
            if name in status_file:
                for value in status_file[name].values():
                    print(value)
                    if value["data_redundancy"]:
                        features[name] = schema.features

        return DataTransformationConfig(
            outdir=Directory(path=self.artifact_root.path / data_transform.outdir),
            indir=Directory(
                path=self.artifact_root.path / self.config.data_ingestion.outdir
            ),
            datasets= names,
            splitter=splitter,
            features=features,
            wash=data_transform.wash if hasattr(data_transform, "wash") else False,
            triplet=data_transform.triplet
            if hasattr(data_transform, "triplet")
            else False,
            zero=data_transform.zero if hasattr(data_transform, "zero") else False,
            matching=data_transform.matching
            if hasattr(data_transform, "matching")
            else False,
        )




In [57]:
from pydantic import validate_call
import pandas as pd
from src.jigsaw import logger
from src.jigsaw.entity.common import Directory
from src.jigsaw.entity.common import FilePath
from pathlib import Path
from glob import glob
from ensure import ensure_annotations
from cleantext import clean
from pandas.api.types import is_string_dtype
from src.jigsaw.utils.common import read_csv, save_json, save_csv, print_format
from collections import defaultdict
import warnings

warnings.filterwarnings('ignore')

class DataTransformationComponent:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

        self.outdir = self.config.outdir.path
        self.indir = self.config.indir.path

        self.names = []

        final_dir = ""

        length = 100
        print("=" * length)
        print_format("Datasets Available", length)
        print("=" * length)
        for name in self.config.datasets:
            if (self.outdir / name).is_dir():
                print_format(name, length)
                self.names.append(str(name))
        print("=" * length)

        print()

        print("=" * length)
        print_format("Datasets Generating", length)
        print("=" * length)
        if self.config.features:
            for name in self.names:
                final_dir = "cleaned_" + final_dir
                print_format(self.indir / f"{final_dir}{name}/", length)
            print('='*length)
        
        if self.config.wash:
            for name in self.names:
                final_dir = "washed_" + final_dir
                print_format(self.indir / f"{final_dir}{name}/", length)
            print('='*length)

        if self.config.triplet:
            for name in self.names:
                final_dir = "triplet_" + final_dir
                print_format(self.indir / f"{final_dir}{name}/", length)
            print('='*length)

        if self.config.zero:
            for name in self.names:
                final_dir = "zero_shot_" + final_dir
                print_format(self.indir / f"{final_dir}{name}/", length)
            print('='*length)

        if self.config.matching:
            for name in self.names:
                final_dir = "matching_" + final_dir
                print_format(self.indir / f"{final_dir}{name}/", length)
            print('='*length)

        self.final_dir = final_dir

    @validate_call
    def __call__(self):
        
        for dataset in self.names:
            for file in (self.indir / dataset).iterdir():
                final_dir = dataset
                data = read_csv(file)
                if self.config.features:
                    final_dir = "cleaned_" + final_dir 
                    data = self.deduplication(data, file, name = final_dir)
                
                if self.config.wash:
                    final_dir = "washed_" + final_dir
                    data = self.clean_text(data, file, name = final_dir)

                if self.config.zero:
                    final_dir = "zero_" + final_dir
                    data = self.zero_shot_transform(data, file, name = final_dir)

    @ensure_annotations
    def deduplication(self, data: pd.DataFrame, path: Path | str,  name :str, save : bool = True) -> pd.DataFrame:
        filename = str(path).split("/")[-2:]
        
        if filename[-1] != 'sample_submission.csv':
            features = self.config.features[filename[0]]
            try:
                data.drop_duplicates(
                    subset=features,
                    ignore_index=True, 
                    inplace=True
                )
                logger.info(f"cleaning out duplicates: {'.'.join(filename)}")
            except Exception as e:
                logger.error(f"Failed cleaning out duplicates: {'.'.join(filename)}\nManual Cleaning")
                data.drop_duplicates(ignore_index=True, inplace=True)
        
        if save:
            target_dir = Directory(path=self.outdir/ name)
            save_csv(data, target_dir.path / filename[1])
        return data
    

    @ensure_annotations
    def clean_text(self, data: pd.DataFrame, path : Path | str, name : str, save : bool = True) -> pd.DataFrame:
        def clean_text(text):
            return clean(
                text,
                fix_unicode=True,
                to_ascii=True,
                lower=False,
                no_line_breaks=False,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=False,
                no_digits=False,
                no_currency_symbols=False,
                no_punct=False,
                no_emoji=True,
                replace_with_url="<URL>",
                replace_with_phone_number="<PHONE>",
                replace_with_email="<EMAIL>"
            )
        
        filename = str(path).split("/")[-2:]
    
        if 'sample_submission.csv' not in filename:
            data['body'] = data['body'].apply(clean_text)
            for (key, dtype) in data.dtypes.items():
                if is_string_dtype(dtype):
                    data[key] = data[key].apply(clean_text)
            logger.info(f"Washed the file : {'.'.join(filename)}")
            self.deduplication(data, path, name = "", save = False)
        else:
            logger.warning(f"Couldn't clean text in {'.'.join(filename)}")
        
        if save:
            target_dir = Directory(path=self.outdir/ name)
            save_csv(data, target_dir.path / filename[1])
        return data
    
    @ensure_annotations
    def zero_shot_transform(self, data: pd.DataFrame, path : Path | str, name : str, save : bool = True) -> pd.DataFrame:
        filename = str(path).split("/")[-2:]
        try:
            if 'sample_submission.csv' not in filename:
                features = self.config.features[filename[0]]
                try:
                    zeroshot = [data[features + ['rule_violation']]]
                except KeyError:
                    zeroshot = []
                except Exception as e:
                    raise e
                
                for violation in ['positive', 'negative']:
                    for i in range(1, 3):
                        temp = data[features[:-1] + [f"{violation}_example_{i}"]]
                        temp ['rule_violation'] = 1 if violation == "positive" else 0
                        temp = temp.rename(columns= { f"{violation}_example_{i}": 'body'})
                        zeroshot.append(temp)

                zeroshot = pd.concat(zeroshot, axis=0)
                logger.info(f"Tranforming to Zero-Shot Dataset : {'.'.join(filename)}")
            else:
                zeroshot = data

        except Exception as e:
            logger.error(f"Error Tranforming to Zero-Shot Dataset : {'.'.join(filename)}")
            raise e

        if save:
            target_dir = Directory(path = self.outdir / name)
            save_csv(zeroshot, target_dir.path / filename[-1])
        return zeroshot

In [58]:
DataTransformationComponent(ConfigurationManager().get_data_transformation_config())()

2025-09-26 00:43:14,283 [INFO] : common : Loading YAML file : ./config/config.yaml
2025-09-26 00:43:14,288 [INFO] : common : Loading YAML file : ./params.yaml
2025-09-26 00:43:14,291 [INFO] : common : Loading YAML file : ./schema.yaml
2025-09-26 00:43:14,295 [INFO] : common : Loading Json file : artifacts/data/status.json
2025-09-26 00:43:14,295 [INFO] : common : Successfully loaded file: artifacts/data/status.json -> 1 master keys.
{'missing_values': False, 'mismatch_dtype': False, 'data_redundancy': True}
{'missing_values': False, 'mismatch_dtype': False, 'data_redundancy': False}
|                                        Datasets Available                                        |
|                                                raw                                                |

|                                        Datasets Generating                                        |
|                                    artifacts/data/cleaned_raw                                    |
|  

In [11]:
# from src.jigsaw.config.config import ConfigurationManager
# from src.jigsaw.components.data_transform import DataTranformationComponent


class DataTransformationPipeline:
    def __init__(
        self,
    ):
        self.config = ConfigurationManager().get_data_transformation_config()
        self.comp = DataTransformationComponent(self.config)

    def kickoff(
        self,
    ):
        self.comp()

In [12]:
DataTransformationPipeline().kickoff()

2025-09-25 23:58:56,302 [INFO] : common : Loading YAML file : ./config/config.yaml
2025-09-25 23:58:56,306 [INFO] : common : Loading YAML file : ./params.yaml
2025-09-25 23:58:56,308 [INFO] : common : Loading YAML file : ./schema.yaml
2025-09-25 23:58:56,310 [INFO] : common : Loading Json file : artifacts/data/status.json
2025-09-25 23:58:56,311 [INFO] : common : Successfully loaded file: artifacts/data/status.json -> 1 master keys.
{'missing_values': False, 'mismatch_dtype': False, 'data_redundancy': True}
{'missing_values': False, 'mismatch_dtype': False, 'data_redundancy': False}
|                                        Datasets Available                                        |
|                                                raw                                                |

|                                        Datasets Generating                                        |
|                                    artifacts/data/cleaned_raw                                    |
202