In [None]:
# Install spaCy (if not already installed)
!pip install spacy

In [None]:
# Install pandas and scikit-learn for data handling and evaluation
!pip install pandas scikit-learn

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from spacy.pipeline.textcat import Config, single_label_cnn_config, single_label_bow_config, single_label_default_config
from spacy.training import Example
from spacy.util import minibatch, compounding
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
import numpy as np
import random
import re

In [2]:
# Load train.csv
train_data = pd.read_csv("train.csv")

# Display the first few rows to understand the structure
train_data.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Check for missing values and overall dataset structure
# print("Missing values in the dataset:")
# print(train_data.isnull().sum())
# train_data.info()
# train_data.describe()

In [3]:
# Keep only the 'toxic' label for single-label classification
train_data = train_data[['id', 'comment_text', 'toxic']]
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            159571 non-null  object
 1   comment_text  159571 non-null  object
 2   toxic         159571 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [4]:
# Define input (comments) and output (labels)
X = train_data['comment_text']
y = train_data['toxic']  # Single label for toxicity

# Inspect label distribution
print("Label distribution:")
print(y.value_counts())  # Check class counts

# Split the data (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

Label distribution:
toxic
0    144277
1     15294
Name: count, dtype: int64
Training samples: 127656, Validation samples: 31915


In [5]:
# Function to transform the data into spaCy format
def prepare_spacy_data(comments, labels):
    spacy_data = []
    for comment, label in zip(comments, labels.values):
        # Create a dictionary of labels with their binary values
        cats = {"toxic": bool(label), "non_toxic": not bool(label)}
        spacy_data.append((comment, {'cats': cats}))
    return spacy_data

# Prepare training and validation data
train_data_spacy = prepare_spacy_data(X_train, y_train)
val_data_spacy = prepare_spacy_data(X_val, y_val)

# Print a sample of the processed data
print("Sample processed data (spaCy format):")
print(train_data_spacy[:3])

Sample processed data (spaCy format):
[('Grandma Terri Should Burn in Trash \nGrandma Terri is trash. I hate Grandma Terri. F%%K her to HELL! 71.74.76.40', {'cats': {'toxic': True, 'non_toxic': False}}), (', 9 May 2009 (UTC)\nIt would be easiest if you were to admit to being a member of the involved Portuguese Lodge, and then there would be no requirement to acknowledge whether you had a previous account (Carlos Botelho did not have a good record) or not and I would then remove the sockpuppet template as irrelevant. WP:COI permits people to edit those articles, such as MSJapan does, but just means you have to be more careful in ensuring that references back your edits and that NPOV is upheld.   20:29', {'cats': {'toxic': False, 'non_toxic': True}}), ('"\n\nThe Objectivity of this Discussion is doubtful (non-existent)\n\n(1) As indicated earlier, the section on Marxist leaders’ views is misleading:\n\n(a) it lays unwarranted and excessive emphasis on Trotsky, creating the misleading imp

In [6]:
# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespaces
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text

# Apply preprocessing to training and validation data
X_train_cleaned = X_train.apply(preprocess_text)
X_val_cleaned = X_val.apply(preprocess_text)

# Prepare spaCy data again with cleaned text
train_data_spacy = prepare_spacy_data(X_train_cleaned, y_train)
val_data_spacy = prepare_spacy_data(X_val_cleaned, y_val)

print("Sample cleaned data (spaCy format):")
print(train_data_spacy[:3])

Sample cleaned data (spaCy format):
[('grandma terri should burn in trash grandma terri is trash i hate grandma terri fk her to hell 71747640', {'cats': {'toxic': True, 'non_toxic': False}}), (' 9 may 2009 utc it would be easiest if you were to admit to being a member of the involved portuguese lodge and then there would be no requirement to acknowledge whether you had a previous account carlos botelho did not have a good record or not and i would then remove the sockpuppet template as irrelevant wpcoi permits people to edit those articles such as msjapan does but just means you have to be more careful in ensuring that references back your edits and that npov is upheld 2029', {'cats': {'toxic': False, 'non_toxic': True}}), (' the objectivity of this discussion is doubtful nonexistent 1 as indicated earlier the section on marxist leaders views is misleading a it lays unwarranted and excessive emphasis on trotsky creating the misleading impression that other prominent marxists marx engel

In [None]:
# Save processed data to files (optional)
# import json

# with open("train_data_spacy.json", "w") as f:
#     json.dump(train_data_spacy, f)

# with open("val_data_spacy.json", "w") as f:
#     json.dump(val_data_spacy, f)

# print("Preprocessed data saved as JSON files.")

In [7]:
# Create a blank spaCy pipeline for English
nlp = spacy.blank("en")

# Print a confirmation
print("Blank spaCy pipeline created.")


Blank spaCy pipeline created.


In [8]:
config = Config().from_str(single_label_bow_config)
textcat = nlp.add_pipe("textcat", config=config)

# Add the 2 oposit labels to the text categorizer
textcat.add_label("toxic")
textcat.add_label("non_toxic")

analysis = nlp.analyze_pipes(pretty=True)
print(analysis)

[1m

#   Component   Assigns    Requires   Scores            Retokenizes
-   ---------   --------   --------   ---------------   -----------
0   textcat     doc.cats              cats_score        False      
                                      cats_score_desc              
                                      cats_micro_p                 
                                      cats_micro_r                 
                                      cats_micro_f                 
                                      cats_macro_p                 
                                      cats_macro_r                 
                                      cats_macro_f                 
                                      cats_macro_auc               
                                      cats_f_per_type              

[38;5;2m✔ No problems found.[0m
{'summary': {'textcat': {'assigns': ['doc.cats'], 'requires': [], 'scores': ['cats_score', 'cats_score_desc', 'cats_micro_p', 'cats_micro_r', '

In [9]:
# Initialize optimizer
optimizer = nlp.begin_training()

# Training parameters
n_iter = 10  # Number of iterations

In [10]:
# Training loop
for epoch in range(n_iter):
    random.shuffle(train_data_spacy)  # Shuffle training data each epoch
    losses = {}
    
    # Create batches of data
    batches = minibatch(train_data_spacy, size=compounding(4.0, 32.0, 1.001))
    
    for batch in batches:
        examples = []
        for text, annotations in batch:
            # Create Example objects
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, {"cats": annotations["cats"]})  # Single-label format
            examples.append(example)
        
        # Update the model with the batch of Example objects
        nlp.update(examples, drop=0.5, losses=losses)
    
    print(f"Epoch {epoch + 1}, Loss: {losses['textcat']}")

Epoch 1, Loss: 265.2737813132777
Epoch 2, Loss: 151.47489542328802


KeyboardInterrupt: 

In [11]:
# Collect predictions and true labels
true_labels = []
pred_labels = []

for text, annotations in val_data_spacy:  # val_data is a list of (text, annotations)
    # Convert the text and annotations into an Example
    example = spacy.training.Example.from_dict(nlp.make_doc(text), {"cats": annotations["cats"]})
    
    # Process the text with the model
    doc = nlp(example.text)
    
    # Collect the predictions
    pred_labels.append({label: doc.cats[label] for label in doc.cats})
    
    # Collect the true labels
    true_labels.append(annotations["cats"])

# Convert predictions to binary based on threshold
threshold = 0.5
pred_binary = [
    {label: int(score >= threshold) for label, score in pred.items()}
    for pred in pred_labels
]

# Extract the keys (categories/labels) in a consistent order
categories = list(true_labels[0].keys())

# Convert dictionaries to 2D arrays for sklearn
true_array = np.array([[label_dict[cat] for cat in categories] for label_dict in true_labels])
pred_array = np.array([[label_dict[cat] for cat in categories] for label_dict in pred_binary])

# Evaluate using sklearn's classification report
print(classification_report(true_array, pred_array, target_names=categories))

              precision    recall  f1-score   support

       toxic       0.87      0.68      0.76      3056
   non_toxic       0.97      0.99      0.98     28859

   micro avg       0.96      0.96      0.96     31915
   macro avg       0.92      0.83      0.87     31915
weighted avg       0.96      0.96      0.96     31915
 samples avg       0.96      0.96      0.96     31915



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
