In [2]:
# Install necessary packages (run this cell once)
%pip install sentence-transformers nltk scikit-learn

# Import libraries
import os
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')  # Download stopwords for potential text cleaning
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# This cell installs and imports all required libraries, laying the foundation for our text processing and modeling.

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucaszhuang1210gmail.com/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Location of the Kaggle Toxic Comment Challenge training set.
# The CSV is expected under a 'data' folder next to this notebook: ./data/train.csv
data_file = os.path.join('.', 'data', 'train.csv')

# Fail fast with an actionable message if the CSV is missing; otherwise confirm the path.
if not os.path.exists(data_file):
    raise FileNotFoundError(f"Data file not found: {data_file}. Please ensure the file exists in the './data/' directory.")
print(f"Data file found at: {data_file}")

# Read the training data into a DataFrame and display it as the cell output.
df = pd.read_csv(data_file)
df

Data file found at: ./data/train.csv


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [7]:
# Define a basic text cleaning function
def clean_text(text):
    """Normalize a raw comment into lowercase, letters-only text.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a-z or whitespace
         (punctuation, digits and non-ASCII letters are dropped, not
         replaced by a space, so "I'm" becomes "im");
      3. collapse whitespace runs (including newlines) into one space
         and strip both ends.

    Parameters
    ----------
    text : str or None or float
        The raw comment. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty comment. Any other non-string
        value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned comment; may be the empty string.
    """
    # Missing values would otherwise crash on `.lower()` in the middle of a
    # long `.apply`; `text != text` is the dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Convert to lowercase for uniformity
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Clean every raw comment element-wise and store the result in a new column,
# leaving the original 'comment_text' column untouched.
df['clean_comment'] = df['comment_text'].map(clean_text)
df['clean_comment']

0         explanation why the edits made under my userna...
1         daww he matches this background colour im seem...
2         hey man im really not trying to edit war its j...
3         more i cant make any real suggestions on impro...
4         you sir are my hero any chance you remember wh...
                                ...                        
159566    and for the second time of asking when your vi...
159567    you should be ashamed of yourself that is a ho...
159568    spitzer umm theres no actual article for prost...
159569    and it looks like it was actually you who put ...
159570    and i really dont think you understand i came ...
Name: clean_comment, Length: 159571, dtype: object

In [None]:
# Load the pre-trained Sentence-BERT checkpoint named 'all-MiniLM-L6-v2'.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Turn each cleaned comment into a dense vector; the progress bar is enabled
# because this call runs over the entire 'clean_comment' column.
embeddings = sbert_model.encode(
    list(df['clean_comment']),
    show_progress_bar=True,
)

# Result: `embeddings` holds the encoder output for the cleaned comments and is
# what the downstream classifier consumes as features.

In [None]:
# Label columns to predict (one binary target per column; adjust to your dataset).
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Features are the sentence embeddings; labels are the selected columns of df as an array.
X = embeddings
y = df[target_cols].to_numpy()

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One-vs-rest wrapper around logistic regression, fitted on the training split.
# `fit` returns the fitted estimator, so construction and training can be chained.
classifier = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_train, y_train)

# Predict on the validation split and print the per-label classification report.
y_pred = classifier.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred, target_names=target_cols))

# Summary: this cell trains a baseline multi-label classifier on the Sentence-BERT
# embeddings and reports its performance on the held-out validation rows.