In [None]:
import os, sys

# Resolve the project root once so the notebook is not tied to a single machine:
# JIGSAW_PROJECT_ROOT overrides the original hard-coded checkout location.
from pathlib import Path

PROJECT_ROOT = Path(
    os.environ.get(
        "JIGSAW_PROJECT_ROOT",
        "/Users/morizin/Documents/Code/jigsaw-competition",
    )
)
os.chdir(PROJECT_ROOT)

# Make the `src` directory importable; skip the append when the cell is re-run so
# sys.path does not accumulate duplicate entries.
_src_dir = str(PROJECT_ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [None]:
from jigsaw.config import ConfigurationManager

# Build the project's configuration manager; the following cells obtain their configs from `cfg`.
cfg = ConfigurationManager()


In [None]:
from jigsaw.components.data import DataIngestionComponent, DataValidationComponent, DataTransformationComponent

# Run the three data stages in order. Each stage is constructed from its config and
# then called; the validation and transformation configs are derived from the artifact
# returned by the stage before them.
ingestion_config = cfg.get_data_ingestion_config()
data_ingestion_artifact = DataIngestionComponent(ingestion_config)()

validation_config = cfg.get_data_validation_config(data_ingestion_artifact)
data_validation_artifact = DataValidationComponent(validation_config)()

transformation_config = cfg.get_data_transformation_config(data_validation_artifact)
data_transformation_artifact = DataTransformationComponent(transformation_config)()

In [None]:
from jigsaw.components.data.augmentation import Augmentor
from jigsaw.utils.common import load_csv

# Transformed training split, located under a timestamped artifact directory.
train_csv_path = "artifact/11_11_2025_17_29_55/data/transformed/zero_shot_combined/train.csv"

# Translate the `augmentations` section of the project config into Augmentor keyword
# arguments (note the config field names differ from the Augmentor parameter names).
augmentation_config = cfg.config.augmentations
augmentor_kwargs = dict(
    augments=augmentation_config.augments,
    frac=augmentation_config.fraction,
    resample=augmentation_config.n_resamples,
    include_original=augmentation_config.include_original,
    weight=augmentation_config.weight,
)
augs = Augmentor(**augmentor_kwargs)

# Augment the loaded training data and show the result as the cell output.
data = augs.augment(load_csv(train_csv_path))
data

In [None]:
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer
)

from transformers import DataCollatorWithPadding

class JigsawDataset:
    """Map-style dataset that tokenizes ``rule + "[SEP]" + body`` prompts.

    Args:
        data: DataFrame with ``rule`` and ``body`` string columns. When a
            ``rule_violation`` column is present its values are used as labels.
        tokenizer: Callable that takes a list of strings (plus ``truncation`` and
            ``max_length`` keywords) and returns a mapping from feature name to
            per-row sequences, e.g. a Hugging Face tokenizer.
        max_length: Truncation length forwarded to the tokenizer.

    Attributes:
        data: Copy of the input frame with an added ``prompt`` column.
        encoding: The tokenizer output for all prompts.
        labels: List of labels, or ``None`` when the frame has no ``rule_violation``.
    """

    def __init__(self, data, tokenizer, max_length: int = 640):
        prompts = data["rule"] + "[SEP]" + data["body"]
        # assign() returns a new frame, so the caller's DataFrame is left untouched.
        self.data = data.assign(prompt=prompts)
        # Tokenizers accept str / list[str]; a pandas Series must be converted first.
        self.encoding = tokenizer(prompts.tolist(), truncation=True, max_length=max_length)
        self.labels = (
            self.data["rule_violation"].tolist()
            if "rule_violation" in self.data.columns
            else None
        )

    def __len__(self) -> int:
        """Return the number of tokenized rows."""
        return len(self.encoding["input_ids"])

    def __getitem__(self, idx: int):
        """Return the features of row ``idx`` as tensors, plus ``labels`` when available."""
        # Uses `self.encoding`, the attribute actually set in __init__.
        item = {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}
        # Explicit None check: truthiness would also skip a legitimately empty label list.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])

        return item


class ClassifierEngine:
    """Build a Hugging Face ``Trainer`` that fine-tunes a sequence classifier on ``data``.

    Args:
        data: Training frame; it is wrapped in a ``JigsawDataset``.
        model_name: Checkpoint used for both the tokenizer and the model, so the two
            cannot drift apart.
        num_labels: Number of output classes of the classification head.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        num_train_epochs: Number of training epochs.
        per_device_train_batch_size: Training batch size per device.

    Calling the instance runs ``self.trainer.train()``. Checkpoints are not saved and
    reporting integrations are disabled.
    """

    def __init__(
        self,
        data,
        model_name: str = "microsoft/deberta-v3-base",
        num_labels: int = 2,
        # NOTE(review): 0.001 is far above the 1e-5..5e-5 range commonly used when
        # fine-tuning transformer encoders; the original value is kept — confirm intent.
        learning_rate: float = 0.001,
        num_train_epochs: int = 2,
        per_device_train_batch_size: int = 32,
    ):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        train_dataset = JigsawDataset(data, tokenizer)
        # JigsawDataset tokenizes without padding, so batches are padded by the collator.
        collator = DataCollatorWithPadding(tokenizer=tokenizer)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )
        training_args = TrainingArguments(
            output_dir=cfg.artifact_root.path,
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            per_device_train_batch_size=per_device_train_batch_size,
            warmup_ratio=0.1,
            weight_decay=0.01,
            report_to="none",
            save_strategy="no",  # do not save intermediate checkpoints
        )
        self.trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=collator,  # newly added
        )

    def __call__(self):
        """Run training with the configured trainer."""
        self.trainer.train()

In [None]:
from transformers import Trainer, AutoModelForCausalLM

In [None]:
import inspect

# Show which file on disk defines AutoModelForCausalLM.
inspect.getsourcefile(AutoModelForCausalLM)

In [None]:
import os

# NOTE(review): a relative chdir is not idempotent — every re-run of this cell moves one
# directory further up, so later relative paths depend on how often it was executed.
os.chdir("..")

In [None]:
from src.jigsaw.utils.common import load_csv
import pandas as pd
import numpy as np

# Simulate averaging several prediction sets for the same test rows: every set assigns
# a random score to each row, the sets are stacked, and the final score is the mean
# per `row_id`.
N_PREDICTION_SETS = 3
rng = np.random.default_rng(42)  # seeded so the cell gives the same result on re-run

test_rows = load_csv("artifacts/data/raw/test.csv")
prediction_sets = [
    # assign() returns a new frame, so the loaded rows are never mutated in place.
    test_rows.assign(rule_violation=rng.random(test_rows.shape[0]))
    for _ in range(N_PREDICTION_SETS)
]

stacked_predictions = pd.concat(prediction_sets, axis=0).reset_index(drop=True)
test_data = stacked_predictions.groupby("row_id")["rule_violation"].mean().reset_index()
test_data

In [None]:
from src.jigsaw.utils.common import load_pickle
import pandas as pd

# Load the pickled training data into `data`.
# NOTE(review): `load_pickle` is imported in this cell but pandas' reader is used instead.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
data = pd.read_pickle("research/train.pkl")

In [None]:
import praw

# Read the Reddit API credentials from the environment so real secrets never have to be
# typed into (and committed with) the notebook. The original placeholder strings are
# kept as fallbacks, so behavior is unchanged when the variables are not set.
import os

reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "my client id"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "my client secret"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "my user agent"),
)

In [None]:
from src.jigsaw.config.config import ConfigurationManager

# Re-create the configuration manager, this time from the `src.jigsaw.config.config` import path.
cfg = ConfigurationManager()

In [None]:
# Switch `splitter: false` to `splitter: splitter` in the project config.
# Done in Python rather than `!sed -i 's/.../' file`: BSD/macOS sed requires an explicit
# backup suffix after -i, so the GNU-style invocation fails there. Bytes are used so
# line endings are written back untouched. Unlike sed without the `g` flag, every match
# is replaced, which only differs if one line holds the pattern more than once.
import re
from pathlib import Path

config_path = Path("./config/config.yaml")
config_bytes = config_path.read_bytes()
config_path.write_bytes(
    # [^\S\n]* = any whitespace except a newline, mirroring sed's per-line [[:space:]]*.
    re.sub(rb"splitter:[^\S\n]*false", b"splitter: splitter", config_bytes)
)
# sed -i 's/train_batch_size:[[:space:]]*3/train_batch_size: 8/' accelerate_config.yaml

In [None]:
# NOTE(review): an earlier cell calls get_data_transformation_config(data_validation_artifact)
# with an argument, while this call passes none — confirm which signature is current.
cfg.get_data_transformation_config()

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy.random as np_random
import random
import pandas as pd
import numpy as np
import os

# Seed every source of randomness used here. The RNGs and PYTHONHASHSEED receive
# seed + 1, while `seed` itself stays available for the K-fold splitter below.
seed = 1234
rng_seed = seed + 1
os.environ["PYTHONHASHSEED"] = str(rng_seed)
random.seed(rng_seed)
np_random.seed(rng_seed)

# Toy data set: ten samples with two label columns. y0 takes the values 0/1/2 and
# y1 is 1 for the first sample only.
y0_labels = [2, 1, 0, 0, 0, 1, 1, 1, 2, 2]
y1_labels = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

X = np.arange(10)
y = np.array([y0_labels, y1_labels])

data = pd.DataFrame({"X": X, "y0": y[0], "y1": y[1]})

In [None]:
# Count the y1 values within each y0 group to see how the two labels co-occur.
data.groupby("y0")["y1"].value_counts()

In [None]:
# Materialise every pair of index arrays yielded by a shuffled multilabel-stratified
# K-fold over both label columns. The number of splits is left at the library default.
label_columns = data[["y0", "y1"]]
fold_splitter = MultilabelStratifiedKFold(shuffle=True, random_state=seed)
list(fold_splitter.split(data, label_columns))

In [None]:
# Fold index pairs written out as a literal — apparently pasted from the output of a
# K-fold split run (TODO confirm). The pasted repr uses a bare `array(...)`, which is
# undefined on a fresh kernel, so it is imported here to keep the cell runnable.
from numpy import array

expected_folds = [
    (array([0, 1, 4, 6, 8, 9]), array([2, 3, 5, 7])),
    (array([1, 2, 3, 5, 6, 7, 9]), array([0, 4, 8])),
    (array([0, 2, 3, 4, 5, 7, 8]), array([1, 6, 9])),
]
expected_folds

In [None]:
# range(11, 4) is empty (start is greater than stop with the default step of +1), so the
# loop body never runs. The `else` branch still executes because the loop finished
# without hitting `break`, so the only output is "j".
for i in range(11, 4):
    print(i)
else:
    print("j")

In [None]:
# Print the notebook's current working directory.
!pwd

In [None]:
import os

# NOTE(review): a relative chdir is not idempotent — every re-run of this cell moves one
# directory further up.
os.chdir("..")
# Imported after the chdir — presumably so the `src` package resolves from the new
# working directory; confirm.
from src.jigsaw.utils.common import load_csv

# NOTE(review): absolute, user-specific path; this cell will not run on another machine as-is.
data = load_csv(
    "/Users/morizin/Documents/Code/jigsaw-competition/research/submission (2).csv"
)

In [None]:
# Square root of the `rule_violation` scores that are at or above 0.5; rows below 0.5 are
# filtered out of the result rather than kept unchanged.
data.query("rule_violation >= 0.5")["rule_violation"] ** (1 / 2)

In [None]:
import os

# NOTE(review): a relative chdir is not idempotent — every re-run of this cell moves one
# directory further up, so the relative paths used afterwards depend on execution history.
os.chdir("..")

In [None]:
from src.jigsaw.utils.common import load_csv

# Load a saved submission file; the path is relative to the current working directory.
data = load_csv("research/submission_aug-3.csv")

In [None]:
# Display `data` as the cell output.
data

In [None]:
# Arithmetic mean of 0.26 and 0.80.
(0.26 + 0.80) / 2