In [None]:
! pip install clean-text iterative-stratification
! pip install --no-deps --force-reinstall ../input/jigsaw/jigsaw-0.1.*-py3-none-any.whl

## Configs

In [None]:
%%writefile config.yaml
# Pipeline configuration. This cell's content is written to config.yaml and is
# later handed to ConfigurationManager as `config_path`.
# NOTE(review): the meaning of each key is defined by the `jigsaw` package, which
# is not visible in this notebook; remarks that go beyond the literal values are
# assumptions to be confirmed.

# Presumably the root under which the relative `outdir` paths are resolved -- TODO confirm.
artifact_root: "/kaggle/working/inputs/"

data-ingestion:
  # The anchor &data_ingestion lets later sections reuse this path as their `indir`.
  outdir: &data_ingestion "data/"
  raw:
    source: kaggle
    type: competition
    name: jigsaw-agile-community-rules

data-validation:
  outdir: "data/"
  statistics: false

data-transformation:
  outdir: "data/"
  # Alias of data-ingestion.outdir, i.e. "data/".
  indir: *data_ingestion
  # Transformation switches: only `zero` is enabled, every other one is off.
  splitter: false
  urlparse: false
  wash: false
  zero: true
  triplet: false
  # Commented-out settings, presumably sub-options of `triplet` -- TODO confirm:
  #   ntriplets: 1
  #   nsamples: 5
    
  pairwise: false

model-training:
  outdir: "models/"
  # Alias of data-ingestion.outdir, i.e. "data/".
  indir: *data_ingestion
  # NOTE(review): -1 presumably means "no specific fold" -- confirm against the trainer.
  fold: -1
  engine: "classifier"
  few-shot: false

In [None]:
%%writefile params.yaml
# Hyper-parameters. This cell's content is written to params.yaml and is later
# handed to ConfigurationManager as `params_path`.

# Single seed value; the anchor &seed is reused below as splitter.random_state.
SEED: &seed 2345

# NOTE(review): config.yaml sets data-transformation.splitter to false, so this
# section is presumably unused in the current run -- confirm.
splitter:
  # Other values listed by the original author: groupkfold, smlkf, stratifiedkfold.
  type: kfold
  nsplits: 5
  random_state: *seed
  labels: 
    - rule
    - rule_violation

classifier:
  # Absolute Kaggle input path; presumably a checkpoint produced by another
  # notebook's run -- TODO confirm.
  model-name: "/kaggle/input/notebook2cf3dd53b0/artifacts/models/checkpoint-108"
  nepochs: 1
  # The explicit !!float tag forces a float value. YAML 1.1 loaders such as PyYAML
  # would otherwise resolve the bare scalar 2e-4 (no decimal point) as a string.
  learning-rate: !!float 2e-4
  train-batch-size: 4
  gradient-accumulation-steps: 1
  weight-decay: 0.01
  warmup-ratio: 0.03
  tokenizer:
    max-length: 2048
    truncation: true
    padding: 'longest'

In [None]:
%%writefile schema.yaml
# Dataset schema. This cell's content is written to schema.yaml and is later
# handed to ConfigurationManager as `schema_path`.
# Everything below, including `features` and `target`, is nested under `raw`.
raw:
  # File names of the train and test splits.
  train: ["train.csv"]
  test: ["test.csv"]
  # Column name -> dtype pairs. Presumably checked by the data-validation
  # stage -- TODO confirm.
  columns:
    row_id: int64
    body: str
    rule: str
    subreddit: str
    positive_example_1: str
    positive_example_2: str
    negative_example_1: str
    negative_example_2: str
    rule_violation: int64
  
  # Columns listed as model inputs.
  features:
    - rule
    - body

  # Column listed as the prediction target.
  target: rule_violation

## Code

In [None]:
from jigsaw.config.config import ConfigurationManager
from jigsaw.components.data.ingestion import DataIngestionComponent
from jigsaw.components.data.validation import DataValidationComponent
from jigsaw.components.data.transformation import DataTransformationComponent

# Build one configuration manager from the three YAML files written by the
# %%writefile cells (config.yaml, params.yaml, schema.yaml).
cfg = ConfigurationManager(
    config_path="config.yaml",
    params_path="params.yaml",
    schema_path="schema.yaml",
)

# The data stages run strictly in this order: ingestion -> validation -> transformation.
# Each stage's config is requested immediately before its component is built, so
# the sequence of calls on `cfg` matches the one-liner form.

# Stage 1: ingestion (the component instance is invoked as a callable).
ingestion_cfg = cfg.get_data_ingestion_config()
DataIngestionComponent(ingestion_cfg)()

# Stage 2: validation, via the component's validate_all() method.
validation_cfg = cfg.get_data_validation_config()
DataValidationComponent(validation_cfg).validate_all()

# Stage 3: transformation (the component instance is invoked as a callable).
transformation_cfg = cfg.get_data_transformation_config()
DataTransformationComponent(transformation_cfg)()