In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the input directory, then print each one
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Copy map-requirements to working directory
!cp -r /kaggle/input/jigsaw-requirement/ /kaggle/working/jigsaw-requirement/

In [None]:
# Make install_requirements.sh script executable
# (the following cell invokes the script directly by its path, which requires the execute bit)
!chmod +x /kaggle/working/jigsaw-requirements/install_requirements.sh

In [None]:
# Run install_requirements.sh script
# NOTE(review): the script's contents are not visible here; presumably it installs
# the packages imported further down (e.g. sentence_transformers) — confirm.
!/kaggle/working/jigsaw-requirements/install_requirements.sh

In [None]:
# Read the competition's train and test CSV files into two DataFrames
train_df, test_df = map(
    pd.read_csv,
    (
        '/kaggle/input/jigsaw-agile-community-rules/train.csv',
        '/kaggle/input/jigsaw-agile-community-rules/test.csv',
    ),
)

In [None]:
# Preprocess text columns
import re

# Patterns are compiled once here instead of being looked up on every row.
_URL_RE = re.compile(r'http\S+')
_DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.,!?]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocessed_text(text):
    """Return a cleaned, single-line version of ``text``.

    Steps, in order:
      1. Missing values (``None`` or a float NaN, which is what pandas uses for
         empty CSV cells) become the empty string instead of the literal
         tokens ``"None"`` / ``"nan"``.
      2. Anything else is converted with ``str()``.
      3. Every run starting with ``http`` up to the next whitespace is removed.
      4. Every character that is not a word character, whitespace, or one of
         ``. , ! ?`` is removed.
      5. Whitespace runs are collapsed to one space and the ends are stripped.

    Args:
        text: The raw cell value (string, number, ``None`` or NaN).

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself; np.float64 subclasses
    # float, so this also covers NaN values coming out of a pandas column.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = _URL_RE.sub('', text)
    text = _DISALLOWED_CHARS_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()
# Run the same cleaning function over every free-text column of both splits
text_columns = [
    'body',
    'positive_example_1',
    'positive_example_2',
    'negative_example_1',
    'negative_example_2',
]
for frame in (train_df, test_df):
    for column in text_columns:
        frame[column] = frame[column].apply(preprocessed_text)

In [None]:
# Concatenate all text features for input
def concat_features(row):
    """Join one row's text fields into a single marker-separated string.

    The body comes first, followed by the two positive and two negative
    examples, each introduced by its own ``[POS1]``/``[POS2]``/``[NEG1]``/
    ``[NEG2]`` marker.

    Args:
        row: Any mapping-like object indexable by the five column names
            (e.g. a DataFrame row).

    Returns:
        The combined string.
    """
    body = row['body']
    pos1 = row['positive_example_1']
    pos2 = row['positive_example_2']
    neg1 = row['negative_example_1']
    neg2 = row['negative_example_2']
    return f"{body} [POS1] {pos1} [POS2] {pos2} [NEG1] {neg1} [NEG2] {neg2}"
# Add the combined 'text' column to both splits, one row at a time
for frame in (train_df, test_df):
    frame['text'] = frame.apply(concat_features, axis=1)

In [None]:
# Prepare Sentence Transformers model
from sentence_transformers import SentenceTransformer, models  

# Assemble the sentence encoder from the checkpoint directory attached as a Kaggle input:
# a transformer module followed by a pooling module sized to the transformer's embedding width.
# NOTE: the fine-tuning cell further down builds these objects again.
model_name = '/kaggle/input/roberta-base-offline/transformers/default/1/roberta-base-offline'
word_embedding_model = models.Transformer(model_name)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Prepare training examples
from sentence_transformers import InputExample  
from torch.utils.data import DataLoader  
# CosineSimilarityLoss compares the embeddings of *two* texts, so every example
# must be a pair; a single-text InputExample cannot be used with it.
# Each comment body is paired with each of its four reference examples. The
# target similarity follows the row's label: a violating body (rule_violation=1)
# should sit close to the positive examples and far from the negative ones, and
# the reverse holds for a non-violating body.
# NOTE: the fine-tuning cell below assigns train_examples / train_dataloader again.
train_examples = []
for _, row in train_df.iterrows():
    violates = float(row['rule_violation'])
    for col in ('positive_example_1', 'positive_example_2'):
        train_examples.append(InputExample(texts=[row['body'], row[col]], label=violates))
    for col in ('negative_example_1', 'negative_example_2'):
        train_examples.append(InputExample(texts=[row['body'], row[col]], label=1.0 - violates))
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)

In [None]:
# Use CosineSimilarityLoss for binary classification
from sentence_transformers import losses  
# Bind the loss to the current `model`.
# NOTE(review): both `model` and `train_loss` are assigned again in the fine-tuning
# cell below, and that later pair is what model.fit receives.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
# Finetune the model using Sentence Transformers 
import os

# Environment flags for the W&B / MLflow / Comet / TensorBoard integrations,
# set in one call before training starts.
# os.environ['WANDB_DISABLED'] = 'true'
os.environ.update({
    'WANDB_MODE': 'disabled',
    'DISABLE_MLFLOW': 'true',
    'COMET_DISABLE': '1',
    'TENSORBOARD_DISABLE': 'true',
})
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from torch.utils.data import DataLoader
# Build the encoder from the checkpoint directory attached as a Kaggle input.
# The bare hub id 'roberta-base' makes the library try to download the weights,
# which fails when the notebook runs without internet access; the local path
# works both online and offline.
model_name = '/kaggle/input/roberta-base-offline/transformers/default/1/roberta-base-offline'
word_embedding_model = models.Transformer(model_name)
# The pooling module turns per-token outputs into one fixed-size sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Prepare training examples for CosineSimilarityLoss
# Each comment body is paired with each of its four reference examples. The
# target similarity must depend on the row's label: a violating body
# (rule_violation=1) should be close to the positive examples and far from the
# negative ones, while a non-violating body gets the opposite targets.
# Labelling every (body, positive) pair 1.0 regardless of rule_violation would
# leave the training target out of the objective entirely.
train_examples = []
for _, row in train_df.iterrows():
    violates = float(row['rule_violation'])
    for col in ('positive_example_1', 'negative_example_1', 'positive_example_2', 'negative_example_2'):
        target = violates if col.startswith('positive') else 1.0 - violates
        train_examples.append(InputExample(texts=[row['body'], row[col]], label=target))
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)
# Fine-tune: one objective (pair dataloader + cosine-similarity loss), 3 epochs, 100 warm-up steps
train_loss = losses.CosineSimilarityLoss(model)
num_epochs = 3
num_warmup_steps = 100
training_objectives = [(train_dataloader, train_loss)]
model.fit(train_objectives=training_objectives, epochs=num_epochs, warmup_steps=num_warmup_steps)

In [None]:
# Save the finetuned model
# Written to the relative path 'sentence_roberta_model'; the next cell loads it back from that same path.
model.save('sentence_roberta_model')

In [None]:
# Load the saved model for inference
# Rebinds `model` to the copy read from 'sentence_roberta_model', the path used by model.save above.
model = SentenceTransformer('sentence_roberta_model')

In [None]:
# Embed the combined 'text' column of the test split
test_texts = test_df['text'].tolist()
test_embeddings = model.encode(test_texts, show_progress_bar=True, batch_size=16)

In [None]:
# Use cosine similarity to a positive class prototype
# torch is imported here because no other cell in this notebook imports it;
# without the import this cell raises NameError.
import torch

# Embed every training row labelled as a violation.
positive_texts = train_df.loc[train_df['rule_violation'] == 1, 'text'].tolist()
positive_embeddings = model.encode(positive_texts, batch_size=16)

# Use the mean embedding of positive samples as the prototype
positive_prototype = torch.tensor(positive_embeddings).mean(dim=0)

# Score all test rows in one call: the prototype is broadcast against each row
# of the (n_test, dim) embedding matrix, giving one cosine similarity per row.
# The result is a plain list of Python floats, one per test row.
test_scores = torch.nn.functional.cosine_similarity(
    torch.tensor(test_embeddings), positive_prototype.unsqueeze(0), dim=1
).tolist()

In [None]:
# Write the submission file: one row per test row_id with its score
sample_submission = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv')
my_submission = test_df[['row_id']].assign(rule_violation=test_scores)
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first 10 rows of the submission frame as a sanity check
my_submission.head(10)