# Setup
## Library Imports, Data Loading, and Cleaning

In [1]:
# Install necessary packages (run this cell once)
%pip install sentence-transformers nltk scikit-learn

# Import libraries
import os
import numpy as np
import pandas as pd
import re
import nltk
import torch
nltk.download('stopwords')  # Download stopwords for potential text cleaning
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucaszhuang1210gmail.com/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Both CSVs are expected in a `data` folder relative to the working directory
train_file, test_file = (
    os.path.join('.', 'data', csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Fail early, with a clear message, if either input file is missing
if not os.path.exists(train_file):
    raise FileNotFoundError(f"Training file not found: {train_file}.")
elif not os.path.exists(test_file):
    raise FileNotFoundError(f"Test file not found: {test_file}.")

# Read each CSV into its own DataFrame
df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

# Show the training frame as the cell output
df_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
# Define a basic text cleaning function
def clean_text(text):
    """Normalize a raw comment into lowercase, letters-only text.

    The input is lowercased, every character that is not an ASCII letter
    (a-z) or whitespace is removed, runs of whitespace are collapsed into a
    single space, and leading/trailing whitespace is stripped.  Digits,
    punctuation and non-ASCII letters are therefore dropped.

    Args:
        text: The raw comment.  Non-string values (for example the float NaN
            that pandas produces for an empty CSV cell, or None) are treated
            as an empty comment instead of raising.

    Returns:
        The cleaned string; "" when nothing remains or when the input was
        not a string.
    """
    if not isinstance(text, str):
        # A missing comment has no .lower(); map it to an empty string so a
        # single bad row cannot abort cleaning of the whole column.
        return ''
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

# Add a normalized `clean_comment` column to the training frame, then the test frame
for _frame in (df_train, df_test):
    _frame['clean_comment'] = _frame['comment_text'].apply(clean_text)

# Identifier and text columns first, followed by the six label columns
columns_to_show = ['id', 'comment_text', 'clean_comment'] + [
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
]
df_train[columns_to_show]

Unnamed: 0,id,comment_text,clean_comment,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,daww he matches this background colour im seem...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",hey man im really not trying to edit war its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",more i cant make any real suggestions on impro...,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",and for the second time of asking when your vi...,0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,and it looks like it was actually you who put ...,0,0,0,0,0,0


# Encoding Comments with Sentence-BERT
**Note:** This step is very slow on a CPU. Running it on a faster machine with a GPU is recommended.


In [7]:
# Set device (GPU if available, otherwise CPU)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

# Encoding every comment is very slow (thousands of batches), so reuse
# embeddings that were previously saved to disk when they are present and
# still have one row per comment in the current DataFrames.
train_cache = 'embeddings_train.npy'
test_cache = 'embeddings_test.npy'

embeddings_train = embeddings_test = None
if os.path.exists(train_cache) and os.path.exists(test_cache):
    # allow_pickle is left at its safe default (False): these are plain numeric arrays
    cached_train = np.load(train_cache)
    cached_test = np.load(test_cache)
    if len(cached_train) == len(df_train) and len(cached_test) == len(df_test):
        embeddings_train, embeddings_test = cached_train, cached_test
        print(f"Loaded cached embeddings from '{train_cache}' and '{test_cache}'")
    else:
        print('Cached embeddings do not match the current data; re-encoding.')

if embeddings_train is None:
    # Initialize the Sentence-BERT model only when encoding is actually required
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

    # Generate embeddings for training and test data
    embeddings_train = model.encode(df_train['clean_comment'].tolist(), show_progress_bar=True)
    embeddings_test = model.encode(df_test['clean_comment'].tolist(), show_progress_bar=True)

embeddings_train

Batches:   0%|          | 0/4987 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Saving Embeddings with NumPy

### For details, see `generate_embeddings.py`, which can also be used to generate the embeddings

In this step, we use NumPy's `np.save` function to store the computed embeddings in files (`embeddings_train.npy` and `embeddings_test.npy`). This saves the embeddings in a binary format optimized for numerical data, so they can be loaded quickly on another machine without re-running the expensive encoding process.

In [None]:
# Write each embedding matrix to its own .npy file in the working directory
np.save(file='embeddings_train.npy', arr=embeddings_train)
np.save(file='embeddings_test.npy', arr=embeddings_test)

# Confirm where the files were written
print("Embeddings saved to 'embeddings_train.npy' and 'embeddings_test.npy'")

In [5]:
# Load the previously saved embeddings from their .npy files.
# allow_pickle=False: the files hold plain numeric arrays, so pickle support is
# not needed, and leaving it enabled would let a tampered file execute arbitrary
# code when it is loaded.
embeddings_train = np.load('embeddings_train.npy', allow_pickle=False)
embeddings_test = np.load('embeddings_test.npy', allow_pickle=False)

embeddings_train

array([[-0.02739719,  0.05526469,  0.07293434, ...,  0.01976584,
        -0.00569206, -0.04199267],
       [-0.0923336 ,  0.06107998,  0.00989558, ...,  0.01220142,
        -0.05809944,  0.00749959],
       [ 0.00342565,  0.07124674,  0.03933695, ..., -0.04903377,
         0.02574941,  0.00680127],
       ...,
       [-0.04912953, -0.03788907, -0.10519164, ..., -0.05963376,
         0.1101158 ,  0.03413334],
       [-0.00096209,  0.02686265,  0.03119126, ...,  0.03642557,
         0.09237833,  0.09541053],
       [-0.03576253, -0.04440697,  0.02297527, ...,  0.05825022,
        -0.03229949,  0.01764056]], dtype=float32)

# Train and Run our model

In [6]:
# Define the target label columns (adjust these based on your dataset)
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Extract training labels as a 2-D array with one column per entry in target_cols
y_train = df_train[target_cols].values

# The embeddings may have been loaded from disk, so verify they still line up
# row-for-row with the labels before fitting.
if embeddings_train.shape[0] != y_train.shape[0]:
    raise ValueError(
        f"embeddings_train has {embeddings_train.shape[0]} rows but df_train has "
        f"{y_train.shape[0]}; regenerate the embeddings for the current data."
    )

# Train the classifier using OneVsRest logistic regression (one binary model per label)
classifier = OneVsRestClassifier(LogisticRegression(max_iter=1000))
classifier.fit(embeddings_train, y_train)

# Predict on the training set to evaluate performance.
# NOTE(review): this is an in-sample score on the data the model was fit on;
# it does not measure how well the model generalizes.
y_train_pred = classifier.predict(embeddings_train)

print("Classification Report on Training Set:")
# zero_division=0 keeps undefined precision/recall values at 0 (what the default
# already reports) without emitting an undefined-metric warning for each average.
print(classification_report(y_train, y_train_pred, target_names=target_cols, zero_division=0))


Classification Report on Training Set:
               precision    recall  f1-score   support

        toxic       0.84      0.63      0.72     15294
 severe_toxic       0.60      0.28      0.39      1595
      obscene       0.85      0.62      0.72      8449
       threat       0.70      0.19      0.30       478
       insult       0.78      0.56      0.65      7877
identity_hate       0.70      0.29      0.41      1405

    micro avg       0.82      0.58      0.67     35098
    macro avg       0.75      0.43      0.53     35098
 weighted avg       0.81      0.58      0.67     35098
  samples avg       0.06      0.05      0.05     35098



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Predict probabilities on the test set
# Note: For multi-label, predict_proba returns probability estimates for each class.
y_test_pred_prob = classifier.predict_proba(embeddings_test)

# Create output DataFrame with the required format: one probability column per label
output_df = pd.DataFrame(y_test_pred_prob, columns=target_cols)

# Insert 'id' as the first column using the raw values, i.e. by position.
# Passing the Series itself would align on index labels and silently fill NaN
# ids if df_test ever had a non-default index; with raw values a row-count
# mismatch raises an error instead.
output_df.insert(0, 'id', df_test['id'].values)

# Save the predictions to CSV
output_df.to_csv('sbert_predictions.csv', index=False)
print("Test set predictions saved to 'sbert_predictions.csv'")

Test set predictions saved to 'sbert_predictions.csv'
