In [None]:
# Install spaCy (if not already installed)
!pip install spacy


In [None]:
# Install pandas and scikit-learn for data handling and evaluation
!pip install pandas scikit-learn

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from spacy.pipeline.textcat import Config, single_label_cnn_config, single_label_bow_config, single_label_default_config
from spacy.training import Example
from spacy.util import minibatch, compounding
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
import numpy as np
import random
import re

In [2]:
# Load train.csv
train_data = pd.read_csv("train.csv")

# Display the first few rows to understand the structure
train_data.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Check for missing values and overall dataset structure
print("Missing values in the dataset:")
print(train_data.isnull().sum())
train_data.info()
train_data.describe()

Missing values in the dataset:
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Define input (comments) and output (labels)
X = train_data['comment_text']
y = train_data.drop(columns=['id', 'comment_text'])

# Inspect labels for imbalance
print("Label distribution:")
print(y.sum(axis=0))  # Check class counts per label

# Split the data (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

Label distribution:
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64
Training samples: 127656, Validation samples: 31915


In [5]:
# Function to transform the data into spaCy format
def prepare_spacy_data(comments, labels):
    spacy_data = []
    for comment, label in zip(comments, labels.values):
        # Create a dictionary of labels with their binary values
        cats = {col: bool(value) for col, value in zip(labels.columns, label)}
        spacy_data.append((comment, {'cats': cats}))
    return spacy_data

# Prepare training and validation data
train_data_spacy = prepare_spacy_data(X_train, y_train)
val_data_spacy = prepare_spacy_data(X_val, y_val)

# Print a sample of the processed data
print("Sample processed data (spaCy format):")
print(train_data_spacy[:3])

Sample processed data (spaCy format):
[('Grandma Terri Should Burn in Trash \nGrandma Terri is trash. I hate Grandma Terri. F%%K her to HELL! 71.74.76.40', {'cats': {'toxic': True, 'severe_toxic': False, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), (', 9 May 2009 (UTC)\nIt would be easiest if you were to admit to being a member of the involved Portuguese Lodge, and then there would be no requirement to acknowledge whether you had a previous account (Carlos Botelho did not have a good record) or not and I would then remove the sockpuppet template as irrelevant. WP:COI permits people to edit those articles, such as MSJapan does, but just means you have to be more careful in ensuring that references back your edits and that NPOV is upheld.   20:29', {'cats': {'toxic': False, 'severe_toxic': False, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), ('"\n\nThe Objectivity of this Discussion is doubtful (non-existent)\n\n(1) As in

In [6]:
# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespaces
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text

# Apply preprocessing to training and validation data
X_train_cleaned = X_train.apply(preprocess_text)
X_val_cleaned = X_val.apply(preprocess_text)

# Prepare spaCy data again with cleaned text
train_data_spacy = prepare_spacy_data(X_train_cleaned, y_train)
val_data_spacy = prepare_spacy_data(X_val_cleaned, y_val)

print("Sample cleaned data (spaCy format):")
print(train_data_spacy[:3])

Sample cleaned data (spaCy format):
[('grandma terri should burn in trash grandma terri is trash i hate grandma terri fk her to hell 71747640', {'cats': {'toxic': True, 'severe_toxic': False, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), (' 9 may 2009 utc it would be easiest if you were to admit to being a member of the involved portuguese lodge and then there would be no requirement to acknowledge whether you had a previous account carlos botelho did not have a good record or not and i would then remove the sockpuppet template as irrelevant wpcoi permits people to edit those articles such as msjapan does but just means you have to be more careful in ensuring that references back your edits and that npov is upheld 2029', {'cats': {'toxic': False, 'severe_toxic': False, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), (' the objectivity of this discussion is doubtful nonexistent 1 as indicated earlier the section on marxist 

In [None]:
# Save processed data to files (optional)
# import json

# with open("train_data_spacy.json", "w") as f:
#     json.dump(train_data_spacy, f)

# with open("val_data_spacy.json", "w") as f:
#     json.dump(val_data_spacy, f)

# print("Preprocessed data saved as JSON files.")

In [7]:
# Feature engineering

# Combine the cleaned text and labels into a single DataFrame
train_data = pd.concat([X_train_cleaned, y_train], axis=1)

# Drop the 'severe_toxic' class
train_data = train_data.drop(columns=['severe_toxic'])

# Get the target sample size (equal to 'obscene')
target_samples = train_data[train_data['obscene'] == 1].shape[0]

def oversample_class(data, target_class):
  """
  Oversamples a specific class in the data to match the target size.

  Args:
      data: The original DataFrame.
      target_class: The class label to oversample (e.g., "threat", "identity_hate").

  Returns:
      The oversampled DataFrame for the target class.
  """
  class_data = data[data[target_class] == 1]
  return resample(
      class_data,
      replace=True,
      n_samples=target_samples,
      random_state=42
  )

# Oversample 'threat' and 'identity_hate' classes
oversampled_threat = oversample_class(train_data.copy(), "threat")
oversampled_identity_hate = oversample_class(train_data.copy(), "identity_hate")

# Filter out all rows except 'threat' and 'identity_hate' (efficiently)
non_oversampled_data = train_data[~train_data['threat'].isin([0, 1]) & ~train_data['identity_hate'].isin([0, 1])]

# Combine the oversampled classes with the rest of the dataset
train_data_balanced = pd.concat([
  non_oversampled_data,
  oversampled_threat,
  oversampled_identity_hate
])

# Shuffle the dataset
train_data_balanced = train_data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate back into features and labels
X_train_balanced = train_data_balanced[X_train_cleaned.name]
y_train_balanced = train_data_balanced.drop(columns=[X_train_cleaned.name])

# Verify the label distribution
print("Label distribution after selective oversampling:")
print(y_train_balanced.sum())

Label distribution after selective oversampling:
toxic            12569
obscene           9160
threat            7283
insult            9984
identity_hate     8229
dtype: int64


In [8]:
# Create a blank spaCy pipeline for English
nlp = spacy.blank("en")

# Print a confirmation
print("Blank spaCy pipeline created.")


Blank spaCy pipeline created.


In [9]:
config = {
    "threshold": 0.5,  # Classification threshold
    "model": {
        "@architectures": "spacy.TextCatEnsemble.v2",
        "tok2vec": {
            "@architectures": "spacy.Tok2Vec.v2",
            "embed": {
                "@architectures": "spacy.MultiHashEmbed.v2",
                "width": 64,
                "rows": [2000, 2000, 500, 1000, 500],
                "attrs": ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"],
                "include_static_vectors": False,
            },
            "encode": {
                "@architectures": "spacy.MaxoutWindowEncoder.v2",
                "width": 64,
                "window_size": 1,
                "maxout_pieces": 3,
                "depth": 2,
            },
        },
        "linear_model": {
            "@architectures": "spacy.TextCatBOW.v3",
            "exclusive_classes": False,  # Multi-label classification
            "ngram_size": 1,
            "no_output_layer": False,
            "length": 262144,  # Add length explicitly to avoid further errors
        },
    },
}

# Add the multi-label text categorizer
textcat = nlp.add_pipe("textcat_multilabel", config=config)

# Add labels (categories) to the text categorizer
for label in y_train_balanced.columns:  # Assuming y_train.columns contains category names
    textcat.add_label(label)

In [10]:
# Prepare training data
train_data = []
for text, labels in zip(X_train_balanced, y_train_balanced.values):
    annotations = {"cats": labels}  # 'cats' is the key for label annotations
    train_data.append((text, annotations))

print(f"Number of training samples: {len(train_data)}")

Number of training samples: 13468


In [11]:
# Prepare validation data (optional but recommended)
val_data = []
for text, labels in zip(X_val, y_val.values):
    annotations = {"cats": labels}
    val_data.append((text, annotations))

print(f"Number of validation samples: {len(val_data)}")

Number of validation samples: 31915


In [None]:
# Initialize optimizer
optimizer = nlp.begin_training()

# Training parameters
n_iter = 10  # Number of iterations

In [None]:
# Training loop
for epoch in range(n_iter):
    random.shuffle(train_data)  # Shuffle training data each epoch
    losses = {}
    
    # Create batches of data
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    
    for batch in batches:
        examples = []
        for text, annotations in batch:
            # Create Example objects
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, {"cats": annotations["cats"]})  # Multi-label format
            examples.append(example)
        
        # Update the model with the batch of Example objects
        nlp.update(examples, drop=0.5, losses=losses)
    
    print(f"Epoch {epoch + 1}, Loss: {losses['textcat_multilabel']}")

In [None]:
# Collect predictions and true labels
true_labels = []
pred_labels = []

for text, annotations in val_data:  # val_data is a list of (text, annotations)
    # Convert the text and annotations into an Example
    example = spacy.training.Example.from_dict(nlp.make_doc(text), {"cats": annotations["cats"]})
    
    # Process the text with the model
    doc = nlp(example.text)
    
    # Collect the predictions
    pred_labels.append({label: doc.cats[label] for label in doc.cats})
    
    # Collect the true labels
    true_labels.append(annotations["cats"])

# Convert predictions to binary based on threshold
threshold = 0.5
pred_binary = [
    {label: int(score >= threshold) for label, score in pred.items()}
    for pred in pred_labels
]

# Extract the keys (categories/labels) in a consistent order
categories = list(true_labels[0].keys())

# Convert dictionaries to 2D arrays for sklearn
true_array = np.array([[label_dict[cat] for cat in categories] for label_dict in true_labels])
pred_array = np.array([[label_dict[cat] for cat in categories] for label_dict in pred_binary])

# Evaluate using sklearn's classification report
print(classification_report(true_array, pred_array, target_names=categories))