In [None]:
import os

# Switch the working directory to the parent of the current one.
# NOTE(review): presumably the notebook lives one level below the project root
# and this makes the `src.`-prefixed imports in later cells resolve — confirm.
os.chdir("..")

In [None]:
# IPython magic: show the current working directory (sanity check after the chdir cell).
%pwd

In [None]:
from pydantic import validate_call
import pandas as pd
from src.jigsaw import logger
from src.jigsaw.entity.common import Directory
from src.jigsaw.entity.config_entity import DataTransformationConfig, DataSplitParams

from src.jigsaw.components.data.cleaning import remove_duplicates, clean_text
from src.jigsaw.components.data.zeroshot import zero_shot_transform
from src.jigsaw.components.data.folding import split_dataset
from pathlib import Path
from ensure import ensure_annotations
from cleantext import clean
from pandas.api.types import is_string_dtype
from src.jigsaw.utils.common import read_csv, save_csv, print_format
import warnings

warnings.filterwarnings("ignore")


class DataTransformationComponent:
    """Build an ordered chain of dataset transformations and run it.

    The constructor records which configured datasets exist on disk and
    assembles ``self.pipeline``, a list of ``(directory_prefix, process)``
    pairs in execution order. Calling the instance reads every file of every
    recorded dataset and feeds it through the pipeline, each stage receiving
    the output of the previous one.

    Attributes:
        config: The transformation configuration passed to every stage.
        outdir: Directory handed to the stages as their output root.
        indir: Directory the dataset files are read from.
        names: Names of the configured datasets found as directories in
            ``outdir``.
        pipeline: ``(directory_prefix, process)`` pairs, one per enabled
            stage that has a process attached.
        final_dir: Accumulated prefix of the last enabled stage.
    """

    def __init__(self, config: DataTransformationConfig):
        """Discover the available datasets and register the enabled stages.

        Args:
            config: Configuration whose ``features``, ``wash``, ``zero``,
                ``triplet``, ``pairwise`` and ``splitter`` entries switch the
                corresponding stage on when truthy.
        """
        self.config = config

        self.outdir = self.config.outdir.path
        self.indir = self.config.indir.path

        self.names = []
        self.pipeline = []

        length = 100
        rule = "=" * length

        print(rule)
        print_format("Datasets Available", length)
        print(rule)
        for name in self.config.datasets:
            if (self.outdir / name).is_dir():
                print_format(name, length)
                self.names.append(str(name))
        print(rule)

        print()

        print(rule)
        print_format("Datasets Generating", length)
        print(rule)

        # (enabled, directory prefix, process) in execution order.
        # A ``None`` process only extends the directory prefix.
        # NOTE(review): ``list`` is the placeholder the triplet stage was
        # registered with; calling it with the keyword arguments used in
        # ``__call__`` raises TypeError. Swap in the real triplet builder
        # before enabling ``config.triplet``.
        stages = (
            (self.config.features, "cleaned_", remove_duplicates),
            (self.config.wash, "washed_", clean_text),
            (self.config.zero, "zero_", zero_shot_transform),
            (self.config.triplet, "triplet_", list),
            (self.config.pairwise, "pairwise_", None),
            (self.config.splitter, "folded_", split_dataset),
        )

        final_dir = ""
        for enabled, prefix, process in stages:
            if not enabled:
                continue
            # The prefix and the pipeline entry belong to the stage, not to a
            # dataset: add them exactly once, however many datasets there are.
            final_dir = prefix + final_dir
            if process is not None:
                self.pipeline.append((final_dir, process))
            for name in self.names:
                print_format(self.indir / f"{final_dir}{name}/", length)
            print(rule)

        self.final_dir = final_dir

    @validate_call
    def __call__(self):
        """Run every registered stage over every file of every dataset.

        Each stage is called with ``config``, ``data``, ``path`` (the
        ``[dataset_directory, file_name]`` pair of the source file), ``name``
        (stage prefix + dataset name) and ``outdir``; its return value becomes
        the ``data`` of the next stage.
        """
        for name in self.names:
            for file_path in (self.indir / name).iterdir():
                data = read_csv(file_path)
                # Last two path components, taken from the Path object so the
                # result does not depend on the OS path separator.
                path = [file_path.parent.name, file_path.name]
                for dirname, process in self.pipeline:
                    data = process(
                        config=self.config,
                        data=data,
                        path=path,
                        name=dirname + name,
                        outdir=self.outdir,
                    )

In [None]:
import os

import pandas as pd

# Work from the project root so the relative artifact path below resolves.
os.chdir("/Users/morizin/Documents/Code/jigsaw-competition")

data = pd.read_csv("artifacts/data/zero_washed_cleaned_raw/train.csv")

# Iterating the groupby yields one (label, frame) pair per distinct
# `rule_violation` value, in sorted label order; unpacking into two names
# requires exactly two groups. Each frame gets a fresh 0..n-1 index.
neg, pos = (
    frame.reset_index(drop=True)
    for _, frame in data.groupby(by="rule_violation")
)


In [None]:
# BaseModel is not imported anywhere else in this notebook; without this
# import the cell fails with NameError on a fresh kernel.
from pydantic import BaseModel


class TripletDataConfig(BaseModel):
    """Settings for building triplet rows (a positive plus sampled negatives)."""

    # How many times each positive row is repeated in the output.
    ntriplets: int
    # How many negative examples accompany each repeated positive row.
    nsamples: int
    # Seed intended for the negative sampling.
    # NOTE(review): confirm the triplet builder actually consumes it.
    random_state: int

In [None]:
from src.jigsaw.entity.config_entity import DataTransformationConfig
from src.jigsaw.entity.common import Directory
from src.jigsaw.utils.common import save_csv
from pandas.core.frame import DataFrame
from ensure import ensure_annotations
from tqdm.autonotebook import tqdm
from src.jigsaw import logger
import numpy as np
from pathlib import Path
import random

def _group_row_labels(frame, keys):
    """Map every group key of ``frame.groupby(keys)`` to the row labels in that group."""
    return {key: group.index.to_numpy() for key, group in frame.groupby(keys)}


def _shuffled(labels, rng):
    """Return ``labels`` as a freshly shuffled Python list."""
    return rng.permutation(labels).tolist()


@ensure_annotations
def triplet_dataset(config: DataTransformationConfig,
                    data: DataFrame,
                    path: list,
                    name: str,
                    outdir: str | Path | None = None) -> DataFrame:
    """Pair every positive row with sampled negative bodies.

    Rows with ``rule_violation == 1`` are repeated ``ntriplets`` times; each
    repetition receives ``nsamples`` bodies of rows with
    ``rule_violation == 0``. Negatives sharing the positive's rule *and*
    subreddit are used first, then negatives sharing only the rule. Pools are
    consumed without replacement; an exhausted rule pool is reshuffled and
    reused.

    Args:
        config: Transformation config; ``config.triplet`` supplies
            ``ntriplets``, ``nsamples`` and (if present) ``random_state``.
        data: Frame holding exactly the columns
            ``config.features + [config.targets]``.
        path: ``[dataset_name, file_name]`` of the source file.
        name: Name of the directory (inside ``outdir``) to save into.
        outdir: Root output directory; nothing is saved when falsy.

    Returns:
        The positive rows (repeated) with ``negative_0 … negative_{n-1}``
        columns appended. ``data`` is returned untouched for
        ``sample_submission.csv`` or when the columns do not match.

    Raises:
        KeyError: A positive row's rule has no negative examples at all.
        AssertionError: The sampled negatives do not have the expected shape.
    """
    dataname, filename = path
    triplet_config = config.triplet

    if filename == 'sample_submission.csv':
        return data

    # NOTE(review): assumes config.features is a list here — confirm against
    # the config schema.
    if set(data.columns) != set(config.features + [config.targets]):
        return data

    ntriplets = triplet_config.ntriplets
    nsamples = triplet_config.nsamples
    # One generator for the whole call: reproducible when a seed is
    # configured, while every shuffle still differs from the previous one.
    rng = np.random.default_rng(getattr(triplet_config, "random_state", None))

    # Select by label value. groupby yields keys in sorted order regardless of
    # which label the first row carries, so positional unpacking of the groups
    # cannot tell the two classes apart reliably.
    labels = data['rule_violation']
    pos = data[labels == 1].reset_index(drop=True)
    neg = data[labels == 0].reset_index(drop=True)

    rule_members = _group_row_labels(neg, 'rule')
    rule_pools = {
        rule: _shuffled(members, rng) for rule, members in rule_members.items()
    }
    subreddit_pools = {
        key: _shuffled(members, rng)
        for key, members in _group_row_labels(neg, ['rule', 'subreddit']).items()
    }

    pos_repeat = pd.concat([pos] * ntriplets, axis=0, ignore_index=True)

    logger.info(f"Generating {nsamples} samples of {ntriplets} triplets each in the file {dataname}.{filename}")

    chosen_rows = []
    for _, positive in tqdm(pos_repeat.iterrows(), total=len(pos_repeat)):
        rule = positive['rule']
        chosen = []
        remaining = nsamples

        # First choice: negatives with the same rule and subreddit.
        subreddit_key = (rule, positive['subreddit'])
        same_subreddit = subreddit_pools.get(subreddit_key)
        if same_subreddit:
            taken = same_subreddit[:remaining]
            subreddit_pools[subreddit_key] = same_subreddit[len(taken):]
            chosen.extend(taken)
            remaining -= len(taken)

        # Then: negatives with the same rule, refilling only this rule's pool
        # whenever it runs dry.
        while remaining > 0:
            if rule not in rule_members:
                raise KeyError(
                    f"no negative examples available for rule {rule!r} "
                    f"in '{dataname}.{filename}'"
                )
            pool = rule_pools[rule] or _shuffled(rule_members[rule], rng)
            taken = pool[:remaining]
            rule_pools[rule] = pool[len(taken):]
            chosen.extend(taken)
            remaining -= len(taken)

        chosen_rows.append(chosen)

    # Column names come from the config so the frame is well-formed even when
    # there are no positive rows.
    negatives = pd.DataFrame(
        [neg.loc[rows, 'body'].to_numpy() for rows in chosen_rows],
        columns=[f"negative_{i}" for i in range(nsamples)],
    )

    expected_shape = (len(pos) * ntriplets, nsamples)
    if negatives.shape != expected_shape:
        message = (
            f"Error when generating triplets '{dataname}.{filename}', "
            f"shape doesn't match {negatives.shape} != {expected_shape}"
        )
        logger.error(message)
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(message)

    logger.info(f"Generated {nsamples} samples of {ntriplets} triplets each in the file {dataname}.{filename}")

    # The two frames share no column, so they are joined side by side on their
    # (identical) 0..n-1 index rather than merged on keys.
    triplets = pd.concat([pos_repeat, negatives], axis=1)

    if outdir:
        target_dir = Directory(path=Path(outdir) / name)
        save_csv(triplets, target_dir.path / filename)
    return triplets

In [None]:
import os

# The working directory is changed before the project imports below.
# NOTE(review): presumably the `src.` package and the artifact paths are only
# resolvable from this project root — confirm; the absolute path is
# machine-specific.
os.chdir("/Users/morizin/Documents/Code/jigsaw-competition")

from src.jigsaw.config.config import ConfigurationManager
from src.jigsaw.components.data.transformation import DataTransformationComponent

# Build the transformation component from the managed configuration and invoke it.
cfg = ConfigurationManager()
DataTransformationComponent(cfg.get_data_transformation_config())()

In [None]:
from src.jigsaw.config.config import ConfigurationManager
# from src.jigsaw.components.data_transform import DataTranformationComponent


class DataTransformationPipeline:
    """Wire the data-transformation config to its component and run it."""

    def __init__(self):
        """Fetch the transformation config and build the component from it."""
        manager = ConfigurationManager()
        self.config = manager.get_data_transformation_config()
        self.comp = DataTransformationComponent(self.config)

    def kickoff(self):
        """Invoke the component (its ``__call__``)."""
        self.comp()

In [None]:
# Build the pipeline with its default configuration and run it once.
DataTransformationPipeline().kickoff()

In [None]:
from pydantic import validate_call
import pandas as pd
from src.jigsaw import logger
from src.jigsaw.entity.common import Directory
from src.jigsaw.entity.config_entity import DataTransformationConfig, DataSplitParams

from src.jigsaw.components.dataset.cleaning import remove_duplicates
from pathlib import Path
from ensure import ensure_annotations
from cleantext import clean
from pandas.api.types import is_string_dtype
from src.jigsaw.utils.common import read_csv, save_csv, print_format
import warnings

warnings.filterwarnings("ignore")


class DataTransformationComponent:
    """De-duplicate, wash and zero-shot-expand the configured datasets.

    The constructor records which configured datasets exist on disk and
    prints the directories that will be generated. Calling the instance
    reads every file of every recorded dataset and applies the enabled
    stages in order: duplicate removal, text washing, zero-shot expansion.

    Attributes:
        config: The transformation configuration.
        outdir: Root directory the transformed files are saved under.
        indir: Directory the dataset files are read from.
        names: Names of the configured datasets found as directories in
            ``outdir``.
        final_dir: Accumulated directory prefix of the announced stages.
    """

    def __init__(self, config: DataTransformationConfig):
        """Discover the available datasets and announce the output directories.

        Args:
            config: Configuration whose ``features``, ``wash``, ``triplet``,
                ``zero`` and ``pairwise`` entries switch a stage on when
                truthy.
        """
        self.config = config

        self.outdir = self.config.outdir.path
        self.indir = self.config.indir.path

        self.names = []

        length = 100
        rule = "=" * length

        print(rule)
        print_format("Datasets Available", length)
        print(rule)
        for name in self.config.datasets:
            if (self.outdir / name).is_dir():
                print_format(name, length)
                self.names.append(str(name))
        print(rule)

        print()

        print(rule)
        print_format("Datasets Generating", length)
        print(rule)

        # (enabled, directory prefix) in announcement order. The prefixes match
        # the ones ``__call__`` uses for the stages it runs.
        # NOTE(review): ``__call__`` has no triplet or pairwise stage yet, so
        # those two prefixes are announced but never written.
        stage_prefixes = (
            (self.config.features, "cleaned_"),
            (self.config.wash, "washed_"),
            (self.config.triplet, "triplet_"),
            (self.config.zero, "zero_"),
            (self.config.pairwise, "pairwise_"),
        )

        final_dir = ""
        for enabled, prefix in stage_prefixes:
            if not enabled:
                continue
            # One prefix per stage, independent of the number of datasets.
            final_dir = prefix + final_dir
            for name in self.names:
                print_format(self.indir / f"{final_dir}{name}/", length)
            print(rule)

        self.final_dir = final_dir

    @staticmethod
    def _name_parts(path) -> list:
        """Return ``[parent_directory_name, file_name]`` for ``path``."""
        location = Path(path)
        return [location.parent.name, location.name]

    @validate_call
    def __call__(self):
        """Run the enabled stages over every file of every recorded dataset."""
        for dataset in self.names:
            for file in (self.indir / dataset).iterdir():
                final_dir = dataset
                data = read_csv(file)
                # remove_duplicates takes the [dataset, file name] pair, while
                # the methods of this class take the path itself — keep both
                # instead of overwriting ``file`` with the pair.
                parts = self._name_parts(file)
                if self.config.features:
                    final_dir = "cleaned_" + final_dir
                    data = remove_duplicates(
                        self.config, data, parts, final_dir, self.outdir
                    )

                if self.config.wash:
                    final_dir = "washed_" + final_dir
                    data = self.clean_text(data, file, name=final_dir)

                if self.config.zero:
                    final_dir = "zero_" + final_dir
                    data = self.zero_shot_transform(data, file, name=final_dir)

    @ensure_annotations
    def deduplication(
        self, data: pd.DataFrame, path: Path | str, name: str, save: bool = True
    ) -> pd.DataFrame:
        """Drop duplicate rows of ``data`` in place.

        Duplicates are detected on the feature columns configured for the
        file's dataset; if that fails, whole-row duplicates are dropped
        instead. ``sample_submission.csv`` is left untouched.

        Args:
            data: Frame to de-duplicate (modified in place).
            path: Source file path; its last two components identify the
                dataset and the file name.
            name: Directory name under ``self.outdir`` to save into.
            save: Whether to write the result to disk.

        Returns:
            The same ``data`` object.
        """
        filename = self._name_parts(path)
        if filename[-1] != "sample_submission.csv":
            features = self.config.features[filename[0]]
            try:
                data.drop_duplicates(subset=features, ignore_index=True, inplace=True)
                logger.info(f"cleaning out duplicates: {'.'.join(filename)}")
            except Exception:
                # Best effort: fall back to whole-row de-duplication.
                logger.error(
                    f"Failed cleaning out duplicates: {'.'.join(filename)}\nManual Cleaning"
                )
                data.drop_duplicates(ignore_index=True, inplace=True)

        if save:
            target_dir = Directory(path=self.outdir / name)
            save_csv(data, target_dir.path / filename[1])
        return data

    @ensure_annotations
    def clean_text(
        self, data: pd.DataFrame, path: Path | str, name: str, save: bool = True
    ) -> pd.DataFrame:
        """Normalise the text columns of ``data`` and de-duplicate the result.

        URLs, e-mail addresses and phone numbers are replaced by ``<URL>``,
        ``<EMAIL>`` and ``<PHONE>``; emoji are removed and text is converted
        to ASCII. ``sample_submission.csv`` is left untouched.

        Args:
            data: Frame to wash (modified in place).
            path: Source file path; its last two components identify the
                dataset and the file name.
            name: Directory name under ``self.outdir`` to save into.
            save: Whether to write the result to disk.

        Returns:
            The same ``data`` object.
        """

        def wash(text):
            return clean(
                text,
                fix_unicode=True,
                to_ascii=True,
                lower=False,
                no_line_breaks=False,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=False,
                no_digits=False,
                no_currency_symbols=False,
                no_punct=False,
                no_emoji=True,
                replace_with_url="<URL>",
                replace_with_phone_number="<PHONE>",
                replace_with_email="<EMAIL>",
            )

        filename = self._name_parts(path)

        if "sample_submission.csv" not in filename:
            # "body" is washed unconditionally, then every string-typed column
            # (which normally includes "body" again) is washed as well.
            data["body"] = data["body"].apply(wash)
            for key, dtype in data.dtypes.items():
                if is_string_dtype(dtype):
                    data[key] = data[key].apply(wash)
            logger.info(f"Washed the file : {'.'.join(filename)}")
            # Washing can make previously distinct rows identical; the call
            # works in place, so its return value is not needed.
            self.deduplication(data, path, name="", save=False)
        else:
            logger.warning(f"Couldn't clean text in {'.'.join(filename)}")

        if save:
            target_dir = Directory(path=self.outdir / name)
            save_csv(data, target_dir.path / filename[1])
        return data

    @ensure_annotations
    def zero_shot_transform(
        self, data: pd.DataFrame, path: Path | str, name: str, save: bool = True
    ) -> pd.DataFrame:
        """Turn the in-row example columns into additional labelled rows.

        Every ``positive_example_{1,2}`` / ``negative_example_{1,2}`` column
        becomes a block of rows whose ``body`` is the example text and whose
        ``rule_violation`` is 1 / 0. The original labelled rows are kept when
        the frame has a ``rule_violation`` column. ``sample_submission.csv``
        is passed through unchanged.

        Args:
            data: Source frame.
            path: Source file path; its last two components identify the
                dataset and the file name.
            name: Directory name under ``self.outdir`` to save into.
            save: Whether to write the result to disk.

        Returns:
            The expanded frame (or ``data`` itself for the sample submission).
        """
        filename = self._name_parts(path)
        try:
            if "sample_submission.csv" not in filename:
                features = self.config.features[filename[0]]

                try:
                    zeroshot = [data[features + ["rule_violation"]]]
                except KeyError:
                    # Unlabelled file: only the example rows are produced.
                    zeroshot = []

                for violation in ["positive", "negative"]:
                    label = 1 if violation == "positive" else 0
                    for i in range(1, 3):
                        example = f"{violation}_example_{i}"
                        # features[:-1] drops the last feature (the slot the
                        # example text is renamed into as "body").
                        # rename/assign return new frames, so nothing is
                        # written into a slice of ``data``.
                        block = (
                            data[features[:-1] + [example]]
                            .rename(columns={example: "body"})
                            .assign(rule_violation=label)
                        )
                        zeroshot.append(block)

                zeroshot = pd.concat(zeroshot, axis=0)
                logger.info(f"Tranforming to Zero-Shot Dataset : {'.'.join(filename)}")
            else:
                zeroshot = data

        except Exception:
            logger.error(
                f"Error Tranforming to Zero-Shot Dataset : {'.'.join(filename)}"
            )
            raise

        if save:
            target_dir = Directory(path=self.outdir / name)
            save_csv(zeroshot, target_dir.path / filename[-1])
        return zeroshot

    # @ensure_annotations
    # def split_dataset(self, data: pd.DataFrame, )