In [None]:
# Install spaCy (if not already installed)
!pip install spacy

In [None]:
# Install pandas and scikit-learn for data handling and evaluation
!pip install pandas scikit-learn

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from spacy.pipeline.textcat import Config, single_label_cnn_config, single_label_bow_config, single_label_default_config
from spacy.training import Example
from spacy.util import minibatch, compounding
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
import numpy as np
import random
import re

In [2]:
# Load train.csv
train_data = pd.read_csv("train.csv")

# Display the first few rows to understand the structure
train_data.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Check for missing values and overall dataset structure
# print("Missing values in the dataset:")
# print(train_data.isnull().sum())
# train_data.info()
# train_data.describe()

In [4]:
# Drop the 'severe_toxic' class because of redunancy with toxic
train_data = train_data.drop(columns=['severe_toxic'])
train_data.head()

Unnamed: 0,id,comment_text,toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0


In [5]:
# Define input (comments) and output (labels)
X = train_data['comment_text']
y = train_data.drop(columns=['id', 'comment_text'])

# Inspect labels for imbalance
print("Label distribution:")
print(y.sum(axis=0))  # Check class counts per label

# Split the data (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

Label distribution:
toxic            15294
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64
Training samples: 127656, Validation samples: 31915


In [6]:
# Function to transform the data into spaCy format
def prepare_spacy_data(comments, labels):
    spacy_data = []
    for comment, label in zip(comments, labels.values):
        # Create a dictionary of labels with their binary values
        cats = {col: bool(value) for col, value in zip(labels.columns, label)}
        spacy_data.append((comment, {'cats': cats}))
    return spacy_data

# Prepare training and validation data
train_data_spacy = prepare_spacy_data(X_train, y_train)
val_data_spacy = prepare_spacy_data(X_val, y_val)

# Print a sample of the processed data
print("Sample processed data (spaCy format):")
print(train_data_spacy[:3])

Sample processed data (spaCy format):
[('Grandma Terri Should Burn in Trash \nGrandma Terri is trash. I hate Grandma Terri. F%%K her to HELL! 71.74.76.40', {'cats': {'toxic': True, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), (', 9 May 2009 (UTC)\nIt would be easiest if you were to admit to being a member of the involved Portuguese Lodge, and then there would be no requirement to acknowledge whether you had a previous account (Carlos Botelho did not have a good record) or not and I would then remove the sockpuppet template as irrelevant. WP:COI permits people to edit those articles, such as MSJapan does, but just means you have to be more careful in ensuring that references back your edits and that NPOV is upheld.   20:29', {'cats': {'toxic': False, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), ('"\n\nThe Objectivity of this Discussion is doubtful (non-existent)\n\n(1) As indicated earlier, the section on Marxist leader

In [7]:
# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespaces
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text

# Apply preprocessing to training and validation data
X_train_cleaned = X_train.apply(preprocess_text)
X_val_cleaned = X_val.apply(preprocess_text)

# Prepare spaCy data again with cleaned text
train_data_spacy = prepare_spacy_data(X_train_cleaned, y_train)
val_data_spacy = prepare_spacy_data(X_val_cleaned, y_val)

print("Sample cleaned data (spaCy format):")
print(train_data_spacy[:3])

Sample cleaned data (spaCy format):
[('grandma terri should burn in trash grandma terri is trash i hate grandma terri fk her to hell 71747640', {'cats': {'toxic': True, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), (' 9 may 2009 utc it would be easiest if you were to admit to being a member of the involved portuguese lodge and then there would be no requirement to acknowledge whether you had a previous account carlos botelho did not have a good record or not and i would then remove the sockpuppet template as irrelevant wpcoi permits people to edit those articles such as msjapan does but just means you have to be more careful in ensuring that references back your edits and that npov is upheld 2029', {'cats': {'toxic': False, 'obscene': False, 'threat': False, 'insult': False, 'identity_hate': False}}), (' the objectivity of this discussion is doubtful nonexistent 1 as indicated earlier the section on marxist leaders views is misleading a it lays unwarran

In [None]:
# Save processed data to files (optional)
# import json

# with open("train_data_spacy.json", "w") as f:
#     json.dump(train_data_spacy, f)

# with open("val_data_spacy.json", "w") as f:
#     json.dump(val_data_spacy, f)

# print("Preprocessed data saved as JSON files.")

In [8]:
# Create a blank spaCy pipeline for English
nlp = spacy.blank("en")

# Print a confirmation
print("Blank spaCy pipeline created.")


Blank spaCy pipeline created.


In [9]:
config = {
    "threshold": 0.5,  # Classification threshold
    "model": {
        "@architectures": "spacy.TextCatEnsemble.v2",
        "tok2vec": {
            "@architectures": "spacy.Tok2Vec.v2",
            "embed": {
                "@architectures": "spacy.MultiHashEmbed.v2",
                "width": 64,
                "rows": [2000, 2000, 500, 1000, 500],
                "attrs": ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"],
                "include_static_vectors": False,
            },
            "encode": {
                "@architectures": "spacy.MaxoutWindowEncoder.v2",
                "width": 64,
                "window_size": 1,
                "maxout_pieces": 3,
                "depth": 2,
            },
        },
        "linear_model": {
            "@architectures": "spacy.TextCatBOW.v3",
            "exclusive_classes": False,  # Multi-label classification
            "ngram_size": 1,
            "no_output_layer": False,
            "length": 262144,  # Add length explicitly to avoid further errors
        },
    },
}

# Add the multi-label text categorizer (textcat_multilabel)
textcat = nlp.add_pipe("textcat_multilabel", config=config)

analysis = nlp.analyze_pipes(pretty=True)
print(analysis)

# Add labels (categories) to the text categorizer
for label in y_train.columns:  # Assuming y_train.columns contains category names
    textcat.add_label(label)

[1m

#   Component            Assigns    Requires   Scores            Retokenizes
-   ------------------   --------   --------   ---------------   -----------
0   textcat_multilabel   doc.cats              cats_score        False      
                                               cats_score_desc              
                                               cats_micro_p                 
                                               cats_micro_r                 
                                               cats_micro_f                 
                                               cats_macro_p                 
                                               cats_macro_r                 
                                               cats_macro_f                 
                                               cats_macro_auc               
                                               cats_f_per_type              

[38;5;2m✔ No problems found.[0m
{'summary': {'textcat_multilabel': 

In [10]:
# Initialize optimizer
optimizer = nlp.begin_training()

# Training parameters
n_iter = 10  # Number of iterations

In [11]:
# Training loop
for epoch in range(n_iter):
    random.shuffle(train_data_spacy)  # Shuffle training data each epoch
    losses = {}
    
    # Create batches of data
    batches = minibatch(train_data_spacy, size=compounding(4.0, 32.0, 1.001))
    
    for batch in batches:
        examples = []
        for text, annotations in batch:
            # Create Example objects
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, {"cats": annotations["cats"]})  # Multi-label format
            examples.append(example)
        
        # Update the model with the batch of Example objects
        nlp.update(examples, drop=0.5, losses=losses)
    
    print(f"Epoch {epoch + 1}, Loss: {losses['textcat_multilabel']}")

Epoch 1, Loss: 130.03918215052744
Epoch 2, Loss: 77.64193078371909


KeyboardInterrupt: 

In [12]:
# Collect predictions and true labels
true_labels = []
pred_labels = []

for text, annotations in val_data_spacy:  # val_data is a list of (text, annotations)
    # Convert the text and annotations into an Example
    example = spacy.training.Example.from_dict(nlp.make_doc(text), {"cats": annotations["cats"]})
    
    # Process the text with the model
    doc = nlp(example.text)
    
    # Collect the predictions
    pred_labels.append({label: doc.cats[label] for label in doc.cats})
    
    # Collect the true labels
    true_labels.append(annotations["cats"])

# Convert predictions to binary based on threshold
threshold = 0.5
pred_binary = [
    {label: int(score >= threshold) for label, score in pred.items()}
    for pred in pred_labels
]

# Extract the keys (categories/labels) in a consistent order
categories = list(true_labels[0].keys())

# Convert dictionaries to 2D arrays for sklearn
true_array = np.array([[label_dict[cat] for cat in categories] for label_dict in true_labels])
pred_array = np.array([[label_dict[cat] for cat in categories] for label_dict in pred_binary])

# Evaluate using sklearn's classification report
print(classification_report(true_array, pred_array, target_names=categories))

               precision    recall  f1-score   support

        toxic       0.83      0.74      0.78      3056
      obscene       0.86      0.77      0.81      1715
       threat       0.00      0.00      0.00        74
       insult       0.77      0.65      0.70      1614
identity_hate       0.80      0.11      0.20       294

    micro avg       0.83      0.69      0.75      6753
    macro avg       0.66      0.45      0.50      6753
 weighted avg       0.82      0.69      0.74      6753
  samples avg       0.07      0.06      0.06      6753



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
