# Jigsaw - Agile Community Rules Classification

- Fine-tuning a RoBERTa-base transformer for binary classification to predict whether a Reddit comment violates a specific rule.

In [1]:
# Load the training dataset
import pandas as pd
# Read the labelled training examples and preview the first rows.
TRAIN_CSV = 'train.csv'
train_df = pd.read_csv(TRAIN_CSV)
train_df.head()

Unnamed: 0,row_id,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2,rule_violation
0,0,Banks don't want you to know this! Click here ...,"No Advertising: Spam, referral links, unsolici...",Futurology,If you could tell your younger self something ...,hunt for lady for jack off in neighbourhood ht...,Watch Golden Globe Awards 2017 Live Online in ...,"DOUBLE CEE x BANDS EPPS - ""BIRDS""\n\nDOWNLOAD/...",0
1,1,SD Stream [ ENG Link 1] (http://www.sportsstre...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,[I wanna kiss you all over! Stunning!](http://...,LOLGA.COM is One of the First Professional Onl...,#Rapper \n🚨Straight Outta Cross Keys SC 🚨YouTu...,[15 Amazing Hidden Features Of Google Search Y...,0
2,2,Lol. Try appealing the ban and say you won't d...,No legal advice: Do not offer or request legal...,pcmasterrace,Don't break up with him or call the cops. If ...,It'll be dismissed: https://en.wikipedia.org/w...,Where is there a site that still works where y...,Because this statement of his is true. It isn'...,1
3,3,she will come your home open her legs with an...,"No Advertising: Spam, referral links, unsolici...",sex,Selling Tyrande codes for 3€ to paypal. PM. \n...,tight pussy watch for your cock get her at thi...,NSFW(obviously) http://spankbang.com/iy3u/vide...,Good News ::Download WhatsApp 2.16.230 APK for...,1
4,4,code free tyrande --->>> [Imgur](http://i.imgu...,"No Advertising: Spam, referral links, unsolici...",hearthstone,wow!! amazing reminds me of the old days.Well...,seek for lady for sex in around http://p77.pl/...,must be watch movie https://sites.google.com/s...,We're streaming Pokemon Veitnamese Crystal RIG...,1


## Data Preprocessing
Clean the text columns of the training dataset (**_train.csv_**): remove URLs, strip special characters other than basic punctuation (`. , ! ?`), and normalize whitespace.

In [3]:
import re

def preprocess_text(text):
    """Normalize one raw text field for tokenization.

    Steps, in order: drop URLs, drop every character that is not a word
    character, whitespace, or one of ``. , ! ?``, then collapse runs of
    whitespace into single spaces and trim the ends.

    Args:
        text: The raw value. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN, which is what pandas
            produces for empty CSV cells) are treated as empty text.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Without this guard str() would turn a missing value into the literal
    # token 'None' / 'nan'. `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s.,!?]', '', text)  # Remove special chars except punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

# Clean the comment body and the four example columns of the training frame.
text_columns = [
    'body',
    'positive_example_1',
    'positive_example_2',
    'negative_example_1',
    'negative_example_2',
]
for column in text_columns:
    train_df[column] = train_df[column].map(preprocess_text)


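## Train-Validation Split using StratifiedKFold
- Use the first fold of a shuffled 5-fold StratifiedKFold split as a stratified hold-out: about 80% of the rows for training and 20% for validation, with the `rule_violation` class balance preserved in both.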

In [4]:
from sklearn.model_selection import StratifiedKFold
# Build a seeded, shuffled 5-fold stratified splitter and keep only its first fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_iterator = skf.split(train_df, train_df['rule_violation'])
train_idx, val_idx = next(fold_iterator)
train_data, val_data = train_df.iloc[train_idx], train_df.iloc[val_idx]

## Fine-tune the **RoBERTa-base** Transformer
- Fine-tune the RoBERTa-base transformer for binary classification of the `rule_violation` target variable.

In [5]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
# Load the pretrained tokenizer and a two-label classification model from the same checkpoint.
PRETRAINED_CHECKPOINT = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = RobertaForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=2,
)

ModuleNotFoundError: No module named 'transformers'

In [None]:
# Prepare dataset for HuggingFace Trainer
class RedditDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one comment body per item.

    Args:
        df: Frame with a ``body`` text column and a ``rule_violation`` label column.
        tokenizer: Callable that accepts a string plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword arguments
            and returns a mapping of tensors with a leading batch dimension.
        max_length: Length every item is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Plain Python lists make positional lookup independent of the frame's index.
        self.texts = df['body'].tolist()
        self.labels = df['rule_violation'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for item ``idx`` plus a ``labels`` tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        sample = {}
        for field, tensor in encoded.items():
            # Drop the leading batch dimension added by return_tensors='pt'.
            sample[field] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample
# Wrap both splits with the same tokenizer and the default max_length.
train_dataset, val_dataset = (
    RedditDataset(split_df, tokenizer) for split_df in (train_data, val_data)
)

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROC AUC for the positive class from raw model outputs.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is either an
            ``(n, 2)`` array of per-class scores or a single score per row
            (shape ``(n,)`` or ``(n, 1)``). If ``logits`` is a tuple, its
            first element is used.

    Returns:
        ``{'auc': <roc_auc_score of labels vs. positive-class probability>}``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)

    if logits.ndim == 2 and logits.shape[1] == 2:
        # A two-class softmax's positive probability equals the sigmoid of the
        # logit difference. Always use it: the outputs are raw logits even when
        # they happen to fall inside [0, 1].
        margin = logits[:, 1] - logits[:, 0]
    else:
        # Single score per row: flatten so the scorer receives a 1-D vector.
        margin = logits.reshape(-1)

    # Overflow-safe sigmoid: 1 / (1 + exp(-m)) == exp(-log(1 + exp(-m))).
    probs = np.exp(-np.logaddexp(0.0, -margin))
    auc = roc_auc_score(labels, probs)
    return {'auc': auc}

# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
trainer_config = dict(
    # Where Trainer writes its outputs and logs.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    report_to=[],
    # Schedule and batch sizes.
    num_train_epochs=7,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation and checkpointing policy.
    evaluation_strategy='epoch',
    save_strategy='no',
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model='auc',
)
training_args = TrainingArguments(**trainer_config)

# Fine-tune once for `training_args.num_train_epochs` epochs and keep the
# checkpoint with the best validation AUC.
#
# A single `trainer.train()` call already runs every epoch, so wrapping it in a
# per-epoch loop would repeat the whole schedule on each iteration. Per-epoch
# validation metrics are read from the evaluation the Trainer runs itself
# (`evaluation_strategy='epoch'` in `training_args`), which reports them under
# the `eval_` prefix.
from transformers import TrainerCallback


class BestAucCheckpoint(TrainerCallback):
    """Print validation metrics after each evaluation and save the best model.

    Args:
        tokenizer: Tokenizer saved next to the model weights.
        save_dir: Directory the best model and tokenizer are written to.

    Attributes set during training:
        best_auc: Highest validation AUC seen so far (starts at 0).
        best_epoch: Epoch at which `best_auc` was reached (0 if never).
    """

    def __init__(self, tokenizer, save_dir='model_weights'):
        self.tokenizer = tokenizer
        self.save_dir = save_dir
        self.best_auc = 0
        self.best_epoch = 0

    def on_evaluate(self, args, state, control, metrics=None, model=None, **kwargs):
        metrics = metrics or {}
        val_auc = metrics.get('eval_auc')
        val_loss = metrics.get('eval_loss')
        epoch = int(round(state.epoch)) if state.epoch is not None else 0
        print(f"Epoch {epoch}")
        print(f"Val Loss: {val_loss}, Val AUC: {val_auc}")
        # Compare against None explicitly so a legitimate 0.0 is not mistaken
        # for a missing metric.
        if val_auc is not None and model is not None and val_auc > self.best_auc:
            self.best_auc = val_auc
            self.best_epoch = epoch
            model.save_pretrained(self.save_dir)
            self.tokenizer.save_pretrained(self.save_dir)


best_checkpoint = BestAucCheckpoint(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[best_checkpoint],
)
trainer.train()

# Training-set metrics for the final (last-epoch) weights. `predict` prefixes
# its metric keys with `metric_key_prefix`, so look them up under 'train_'.
train_metrics = trainer.predict(train_dataset, metric_key_prefix='train').metrics
print(f"Train Loss: {train_metrics.get('train_loss')}, Train AUC: {train_metrics.get('train_auc')}")

best_val_auc = best_checkpoint.best_auc
best_epoch = best_checkpoint.best_epoch
print(f"Best Validation AUC: {best_val_auc} at epoch {best_epoch}")


## Save Finetuned Model Weights and Tokenizer
- Save model weights and tokenizer for inference.

In [None]:
# The training cell writes the best-validation-AUC checkpoint to 'model_weights'
# and records its epoch in `best_epoch` (which stays 0 if no checkpoint was
# saved). Saving unconditionally here would overwrite that checkpoint with the
# last-epoch weights, so only fall back to the current weights when no best
# checkpoint exists.
if best_epoch == 0:
    model.save_pretrained('model_weights')
    tokenizer.save_pretrained('model_weights')

## Load test.csv and sample_submission.csv

In [None]:
# Read the unlabelled test rows and the submission template, then preview the test rows.
TEST_CSV = 'test.csv'
SAMPLE_SUBMISSION_CSV = 'sample_submission.csv'
test_df = pd.read_csv(TEST_CSV)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)
test_df.head()

## Preprocess test.csv

In [None]:
# Clean the test text columns with the same function used for train.csv.
text_columns = [
    'body',
    'positive_example_1',
    'positive_example_2',
    'negative_example_1',
    'negative_example_2',
]
for column in text_columns:
    test_df[column] = test_df[column].map(preprocess_text)

## Inference with Saved Model and Tokenizer
- Load the finetuned model and tokenizer.
- Predict `rule_violation` for the test set.

In [None]:
# Run inference with the fine-tuned RoBERTa-base model saved in 'model_weights'.
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Score the test set in fixed-size batches: one forward pass over every row at
# once needs memory proportional to the whole test set.
INFERENCE_BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaForSequenceClassification.from_pretrained('model_weights').to(device)
tokenizer = RobertaTokenizer.from_pretrained('model_weights')
model.eval()

test_texts = test_df['body'].tolist()
batch_probs = []
with torch.no_grad():
    for start in range(0, len(test_texts), INFERENCE_BATCH_SIZE):
        batch_texts = test_texts[start:start + INFERENCE_BATCH_SIZE]
        # Same truncation length the training dataset uses by default; pad only
        # to the longest sequence in the batch.
        inputs = tokenizer(
            batch_texts,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt',
        ).to(device)
        logits = model(**inputs).logits
        # Probability of class 1 (rule violation).
        batch_probs.append(torch.nn.functional.softmax(logits, dim=1)[:, 1].cpu())

# One probability per test row, in the same order as test_df.
probs = torch.cat(batch_probs).numpy() if batch_probs else torch.empty(0).numpy()

## Format Predictions for Submission

In [None]:
# Pair each test row_id with its predicted violation probability,
# using the column layout of sample_submission.csv.
submission_columns = {
    'row_id': test_df['row_id'],
    'rule_violation': probs,
}
my_submission = pd.DataFrame(submission_columns)
my_submission.head()

In [None]:
# Write the submission frame to disk without the DataFrame index column.
SUBMISSION_CSV = 'submission.csv'
my_submission.to_csv(SUBMISSION_CSV, index=False)