In [None]:
# Install spaCy (if not already installed)
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [None]:
# Install pandas and scikit-learn for data handling and evaluation
#!pip install pandas scikit-learn

In [None]:
# Install nltk
# !pip install nltk
import nltk

# Fetch the NLTK resources referenced by the imports and preprocessing code.
# Newer NLTK releases make word_tokenize look up "punkt_tab" instead of the
# pickled "punkt" model, so both are requested to work on either version.
nltk.download("punkt")  # Tokenization (older NLTK releases)
nltk.download("punkt_tab")  # Tokenization (newer NLTK releases)
nltk.download("stopwords")  # Stop words
nltk.download("vader_lexicon")  # Sentiment analysis
nltk.download("wordnet")  # Lemmatization


In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from spacy.pipeline.textcat import Config, single_label_cnn_config, single_label_bow_config, single_label_default_config
from spacy.training import Example
from spacy.util import minibatch, compounding
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
import numpy as np
import random
import re
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Read the training set from train.csv (path is relative to the working directory)
train_data = pd.read_csv(filepath_or_buffer="train.csv")

# Preview the first five rows to understand the structure
train_data.head(n=5)


In [None]:
# Report per-column null counts, then the frame's dtypes/memory summary,
# and finish with descriptive statistics as the cell's displayed value.
print("Missing values in the dataset:", train_data.isna().sum(), sep="\n")
train_data.info()
train_data.describe()

In [None]:
# Disclaimer: this cell takes 22 minutes to run on my machine.
# Load stop words and spaCy model.
# preprocess_text only reads token.lemma_ from this pipeline, so the named-entity
# recognizer is disabled to avoid running it on every comment.
# NOTE(review): the parser is left enabled on purpose - spaCy's English pipelines
# may use dependency information when assigning POS, which feeds the lemmatizer.
stop_words = set(stopwords.words("english"))
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Function to clean and preprocess text
def preprocess_text(text):
    """Clean and preprocess a single comment.

    Steps, in order: lowercase, collapse whitespace, strip URLs, drop
    punctuation except "!" and "?", squeeze runs of three or more identical
    characters down to two, tokenize with NLTK, remove English stop words,
    and lemmatize with the spaCy pipeline.

    Relies on the module-level ``stop_words`` set and ``nlp`` pipeline.

    Args:
        text: Raw comment. Non-string values (for example ``None`` or the NaN
            pandas uses for a missing cell) are treated as an empty comment.

    Returns:
        str: The lemmas of the remaining tokens joined by single spaces.
    """
    # A missing cell reaches this function as a float NaN / None, which has no
    # .lower(); return an empty comment instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespaces
    text = re.sub(r"\s+", " ", text)

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Remove punctuation, but keep "!" and "?" for context
    text = re.sub(r"[^\w\s!?]", "", text)

    # Normalize repeated characters ("soooo" -> "soo")
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens (the surviving tokens are re-joined and re-tokenized by spaCy)
    tokens = [token.lemma_ for token in nlp(" ".join(tokens))]

    # Reconstruct text from processed tokens
    return " ".join(tokens)

# Function to transform the data into spacy format
def prepare_spacy_data(comments, labels):
    """Pair each comment with a ``{"cats": {...}}`` annotation dict.

    Args:
        comments: Iterable of comment texts.
        labels: DataFrame whose columns are the label names; its rows are
            walked in step with ``comments``.

    Returns:
        list: ``(comment, {"cats": {label_name: bool}})`` tuples, one per
        zipped (comment, row) pair.
    """
    label_names = list(labels.columns)
    return [
        (text, {"cats": {name: bool(flag) for name, flag in zip(label_names, row)}})
        for text, row in zip(comments, labels.values)
    ]

# Separate the comment text (input) from the remaining columns (labels)
X = train_data["comment_text"]
y = train_data.drop(["id", "comment_text"], axis=1)

# Run preprocess_text over every comment
X_cleaned = X.apply(preprocess_text)

# Peek at the first three cleaned comments
print("Sample cleaned comments:", X_cleaned.iloc[:3], sep="\n")

# Positive count per label column, to check for imbalance
print("Label distribution:", y.sum(), sep="\n")

# Hold out 20% of the rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X_cleaned,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

# Convert both splits into (text, {"cats": ...}) pairs
train_data_spacy, val_data_spacy = (
    prepare_spacy_data(texts, targets)
    for texts, targets in ((X_train, y_train), (X_val, y_val))
)

# Show the first three training pairs
print("Sample processed data (spaCy format):", train_data_spacy[:3], sep="\n")


In [None]:
# Drop the 'severe_toxic' class because of redundancy with 'toxic'.
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop() would raise KeyError.
train_data = train_data.drop(columns=['severe_toxic'], errors="ignore")
train_data.head()

In [None]:
# Features are the comment strings; targets are every column other than id/text
X = train_data['comment_text']
y = train_data.drop(['id', 'comment_text'], axis=1)

# Positive count per label column, to check for imbalance
print("Label distribution:", y.sum(), sep="\n")

# Split the data (80% train, 20% validation) with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

In [None]:
# Function to transform the data into spaCy format
def prepare_spacy_data(comments, labels):
    """Build ``(comment, {'cats': {...}})`` pairs from texts and a label frame.

    Each row of ``labels`` is zipped with the corresponding comment, and every
    cell value is converted to ``bool`` under its column name.
    """
    column_names = tuple(labels.columns)

    def to_cats(row):
        # Map one row of label values to {column name: bool}
        return {name: bool(flag) for name, flag in zip(column_names, row)}

    return [(text, {'cats': to_cats(row)}) for text, row in zip(comments, labels.values)]

# Convert the training and validation splits into spaCy-format pairs
train_data_spacy, val_data_spacy = (
    prepare_spacy_data(texts, targets)
    for texts, targets in ((X_train, y_train), (X_val, y_val))
)

# Show the first three training pairs
print("Sample processed data (spaCy format):", train_data_spacy[:3], sep="\n")

In [None]:
# Function to clean text
def preprocess_text(text):
    """Lowercase a comment and strip URLs, punctuation and surplus whitespace.

    Args:
        text: Raw comment. Non-string values (for example ``None`` or the NaN
            pandas uses for a missing cell) are treated as an empty comment.

    Returns:
        str: The cleaned comment, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Missing cells arrive as None / float NaN, which have no .lower()
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    # Collapse whitespace last: the removals above leave gaps behind, so doing
    # this first (as before) let double and edge spaces survive into the output.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean the training and validation comments with preprocess_text
X_train_cleaned, X_val_cleaned = (
    split.apply(preprocess_text) for split in (X_train, X_val)
)

# Rebuild the spaCy-format pairs from the cleaned text
train_data_spacy, val_data_spacy = (
    prepare_spacy_data(texts, targets)
    for texts, targets in ((X_train_cleaned, y_train), (X_val_cleaned, y_val))
)

# Show the first three cleaned training pairs
print("Sample cleaned data (spaCy format):", train_data_spacy[:3], sep="\n")

In [None]:
# Save processed data to files (optional)
# import json

# with open("train_data_spacy.json", "w") as f:
#     json.dump(train_data_spacy, f)

# with open("val_data_spacy.json", "w") as f:
#     json.dump(val_data_spacy, f)

# print("Preprocessed data saved as JSON files.")

In [None]:
# Bind `nlp` to a blank English spaCy pipeline
nlp = spacy.blank(name="en")

# Confirm that the pipeline object was created
print("Blank spaCy pipeline created.")


In [None]:
# Settings passed to the "textcat_multilabel" component below. The model is the
# "spacy.TextCatEnsemble.v2" architecture, configured with two sub-models:
# a "tok2vec" network (hash embeddings + maxout window encoder) and a
# bag-of-words "linear_model".
config = {
    "threshold": 0.5,  # Classification threshold
    "model": {
        "@architectures": "spacy.TextCatEnsemble.v2",
        "tok2vec": {
            "@architectures": "spacy.Tok2Vec.v2",
            # Embedding layer: "rows" has one entry per attribute listed in "attrs"
            "embed": {
                "@architectures": "spacy.MultiHashEmbed.v2",
                "width": 64,
                "rows": [2000, 2000, 500, 1000, 500],
                "attrs": ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"],
                "include_static_vectors": False,
            },
            # Encoder: its width (64) matches the embedding width above
            "encode": {
                "@architectures": "spacy.MaxoutWindowEncoder.v2",
                "width": 64,
                "window_size": 1,
                "maxout_pieces": 3,
                "depth": 2,
            },
        },
        "linear_model": {
            "@architectures": "spacy.TextCatBOW.v3",
            "exclusive_classes": False,  # Multi-label classification
            "ngram_size": 1,
            "no_output_layer": False,
            "length": 262144,  # Add length explicitly to avoid further errors
        },
    },
}

# Add the multi-label text categorizer
textcat = nlp.add_pipe("textcat_multilabel", config=config)

# Add labels (categories) to the text categorizer
# One label is registered per column of y_train, in column order.
for label in y_train.columns:  # Assuming y_train.columns contains category names
    textcat.add_label(label)

In [None]:
# Initialize optimizer
optimizer = nlp.begin_training()

# Training parameters
n_iter = 1  # Number of iterations

In [None]:
# Training loop
for epoch in range(n_iter):
    random.shuffle(train_data_spacy)  # Shuffle training data each epoch
    losses = {}
    
    # Create batches of data
    batches = minibatch(train_data_spacy, size=compounding(4.0, 32.0, 1.001))
    
    for batch in batches:
        examples = []
        for text, annotations in batch:
            # Create Example objects
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, {"cats": annotations["cats"]})  # Multi-label format
            examples.append(example)
        
        # Update the model with the batch of Example objects
        nlp.update(examples, drop=0.5, losses=losses)
    
    print(f"Epoch {epoch + 1}, Loss: {losses['textcat_multilabel']}")

In [None]:
# Collect predictions and true labels
true_labels = []
pred_labels = []

for text, annotations in val_data_spacy:  # val_data_spacy is a list of (text, annotations)
    # Score the text with the trained pipeline. No Example object is needed
    # here: only the predicted doc.cats and the gold dict are used.
    doc = nlp(text)

    # Collect the predicted score per label
    pred_labels.append(dict(doc.cats))

    # Collect the true labels
    true_labels.append(annotations["cats"])

# Convert predictions to binary based on threshold
threshold = 0.5
pred_binary = [
    {label: int(score >= threshold) for label, score in pred.items()}
    for pred in pred_labels
]

# Keep only labels present in every gold dict (avoids KeyError), listed in the
# key order of the first gold dict so the report rows come out in the same
# order on every run (iterating a set of strings does not guarantee that).
shared_labels = set.intersection(*(set(label_dict) for label_dict in true_labels))
categories = [cat for cat in true_labels[0] if cat in shared_labels]

# Convert dictionaries to 2D integer arrays for sklearn (gold values are bools,
# predictions are ints; cast both so the two arrays share a dtype)
true_array = np.array([[int(label_dict.get(cat, 0)) for cat in categories] for label_dict in true_labels])
pred_array = np.array([[int(label_dict.get(cat, 0)) for cat in categories] for label_dict in pred_binary])

# Evaluate using sklearn's classification report
print(classification_report(true_array, pred_array, target_names=categories))

Yigit's stuff