# Setup

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import re 
import scipy
from scipy import sparse
from matplotlib import pyplot as plt
from xgboost.callback import TrainingCallback

In [None]:
import time
import scipy.optimize as optimize
import warnings
# Notebook display tuning: show long text cells and wide frames in full.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 300

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
import xgboost

In [None]:
# ========== unify random seeds ==========
import os, random, torch
import transformers
from transformers.trainer_utils import set_seed           # HuggingFace exclusive
import xgboost                             

# Single seed value shared by every random number generator seeded below.
SEED = 42

# Environment variable read by Python for hash randomization.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Python's built-in generator and NumPy's global generator.
random.seed(SEED)
np.random.seed(SEED)

# PyTorch: CPU generator, then every CUDA device.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# cuDNN: disable the autotuner and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# HuggingFace's helper is imported above but intentionally left disabled:
# set_seed(SEED)

# XGBoost has no global seed; these keyword arguments carry the seed under
# both the sklearn-style name and XGBoost's own name.
xgb_rand_params = dict(random_state=SEED, seed=SEED)

# Data preparation

In [None]:
# `train`: labelled comments from the toxic-comment-classification dataset.
# `comm_score`: the severity-rating competition comments that must be scored.
train = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv")
comm_score = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
train.head()

In [None]:
train.shape

In [None]:
# Turn the toxicity label columns into one regression target: every label
# column is scaled by a fixed weight, then the weighted columns are summed
# per comment so the task becomes regression instead of classification.
#   obscene 0.16 | toxic 0.32 | insult 0.64
#   threat 1.5   | severe_toxic 1.5 | identity_hate 1.5
label_score = dict(obscene=0.16, toxic=0.32, threat=1.5,
                   insult=0.64, severe_toxic=1.5, identity_hate=1.5)

# NOTE: the label columns are overwritten with their weighted values, so
# running this cell twice applies the weights twice.
for category, weight in label_score.items():
    train[category] = train[category] * weight

# Row-wise total over the column slice 'toxic' .. 'identity_hate'.
train['score'] = train.loc[:, 'toxic':'identity_hate'].sum(axis=1)

# 'y' is the name the modelling cells use for the target.
train['y'] = train['score']

train.head()

In [None]:
train.shape

In [None]:
# Use the shorter column name 'comment' for the raw text from here on.
train = train.rename(columns={'comment_text':'comment'})

In [None]:
# Patterns are compiled once here instead of on every call to text_cleaning.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
# NOTE(review): the last range (U+24C2..U+1F251) is far wider than emoji and
# also covers e.g. CJK characters; kept unchanged to preserve existing output.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    """Strip links, HTML markup, emojis and punctuation from a comment.

    The cleaning steps run in this order:
    1. Remove website links (``http(s)://...`` and ``www....``).
    2. Parse the remainder with BeautifulSoup and keep only the text.
    3. Remove characters matched by the emoji pattern.
    4. Replace every character that is not a letter a-z/A-Z or a digit
       with a space.
    5. Collapse runs of spaces and trim both ends.

    Arguments:
        text: str : The input text to be cleaned. ``None`` and NaN are
            treated as empty text; other non-string values are converted
            with ``str()``.
    Returns:
        str : The cleaned text.
    """
    # Missing values (None, or float NaN as pandas uses for missing cells)
    # would otherwise raise TypeError inside re.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    if not text:
        # Nothing to clean; skip the comparatively expensive HTML parse.
        return ""

    text = _URL_PATTERN.sub(r'', text)  # remove website links

    text = BeautifulSoup(text, 'lxml').get_text()  # remove HTML tags

    text = _EMOJI_PATTERN.sub(r'', text)

    text = _NON_ALNUM_PATTERN.sub(" ", text)  # remove special characters
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # remove extra spaces

    # remove spaces at the beginning and at the end of the string
    return text.strip()

In [None]:
# Clean every training comment in place with text_cleaning.
train['comment'] = train['comment'].apply(text_cleaning)

In [None]:
# Work on a copy so later cells do not modify `train`.
df = train.copy()

In [None]:
df['y'].value_counts()

In [None]:
df.head()

# BERT Encoding Method

In [None]:

# Shuffle with a fixed seed and hold out 20% of the comments for validation.
# Texts are passed as a plain list, targets as a NumPy array.
train_texts, val_texts, train_targets, val_targets = train_test_split(
    df['comment'].tolist(), df['y'].values,
    shuffle=True, random_state=42, test_size=0.2,
)

In [None]:
len(train_texts)

## Feature Engineering Improvements
Use richer text vectors.
Baseline: TF-IDF (a sparse, bag-of-words representation).

Improvement direction:

Use the [CLS] vector or mean pooling of Transformer representations (e.g. BERT, RoBERTa), which often works much better than TF-IDF.

In [None]:
# Checkpoint identifier for the pretrained BERT model.
model_name = 'bert-base-uncased'
# Directory under /kaggle/input holding a BERT checkpoint — presumably a local
# copy of the same model for offline runs; TODO confirm.
model_path = '/kaggle/input/bert_base_uncased/pytorch/default/1/bert_base_uncased'

In [None]:
# Prefer the local checkpoint directory when it is mounted, so loading works
# without network access; otherwise fall back to the hub identifier.
bert_source = model_path if os.path.isdir(model_path) else model_name

tokenizer = transformers.AutoTokenizer.from_pretrained(bert_source)
bert = transformers.AutoModel.from_pretrained(bert_source)
# Inference only: switch off dropout and other train-time behaviour.
bert.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert.to(device)

In [None]:
device

In [None]:
# extract [CLS] embedding function
def get_cls_embeddings(texts, batch_size=16, max_length=128):
    """Encode texts with BERT and return the hidden state at position 0.

    Texts are tokenized in batches (padded to the longest item of the batch,
    truncated to ``max_length`` tokens) and run through ``bert`` under
    ``torch.no_grad()``. For every text the last hidden state at sequence
    position 0 (the [CLS] token) is kept.

    Relies on the notebook-level globals ``tokenizer``, ``bert`` and
    ``device``.

    Arguments:
        texts: a single string or an iterable of strings.
        batch_size: number of texts per forward pass.
        max_length: truncation length passed to the tokenizer.
    Returns:
        np.ndarray with one row per input text. An empty input yields an
        array with zero rows instead of raising.
    """
    # Accept a bare string or any iterable; slicing below needs a list.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # np.vstack([]) raises ValueError, so return an empty matrix.
        # float32 assumes the model runs in its default precision.
        return np.empty((0, bert.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[start:start + batch_size]
            encoding = tokenizer(
                batch_texts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=max_length
            ).to(device)

            outputs = bert(**encoding)
            # Position 0 of every sequence is the [CLS] token.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            # Move to host memory so results do not accumulate on the device.
            all_embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(all_embeddings)

In [None]:
# Encode the training and validation comments into [CLS] feature matrices.
X_train = get_cls_embeddings(train_texts)
X_val = get_cls_embeddings(val_texts)

## Hyperparameter selection
The code cell below runs a grid search with 5-fold cross-validation.


In [None]:
from sklearn.model_selection import GridSearchCV
import os
# Presumably silences the tokenizers fork warning once worker processes are
# spawned by the grid search — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# define the XGBoost regressor with BERT embeddings
# `device='cuda'` already selects GPU training and prediction; the obsolete
# `predictor='gpu_predictor'` argument is therefore not passed.
xgb_regressor_with_bert = xgboost.XGBRegressor(
    n_estimators=1500,  # overridden by the grid below
    objective='reg:squarederror',
    tree_method='hist',
    device='cuda',
    random_state=42,
    reg_lambda=5.0,
    reg_alpha=5.0,
)

# define the parameter grid
param_grid = {
    'n_estimators': [300, 500],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.7, 0.9]
}

# define the grid search
# NOTE(review): n_jobs=-1 runs several fits at once; with device='cuda' they
# presumably share one GPU — lower n_jobs if memory errors appear.
grid_search = GridSearchCV(
    estimator=xgb_regressor_with_bert,
    param_grid=param_grid,
    cv=5,  # 5 folds cross-validation
    scoring='neg_root_mean_squared_error',
    verbose=2,
    n_jobs=-1  # use all available cores
)

# execute the grid search
grid_search.fit(X_train, train_targets)

# output the best parameters and score
# The score is the negated RMSE (see `scoring` above), not the negated MSE.
print("best parameters:", grid_search.best_params_)
print("best score(neg RMSE):", grid_search.best_score_)
# Result recorded from an earlier run:
# best parameters: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 500, 'subsample': 0.9}
# best score(neg RMSE): -0.41994513168325326

## Training

In [None]:
class RMSERecorder(TrainingCallback):
    """Record the RMSE of the training and validation sets after each round.

    Attributes are filled while an XGBoost model trains with this callback:
    - train_rmse: latest RMSE of the first eval set ('validation_0') per round.
    - val_rmse: latest RMSE of the second eval set ('validation_1') per round.
    """

    def __init__(self):
        super().__init__()
        self.train_rmse = []
        self.val_rmse = []

    def after_iteration(self, model, epoch, evals_log):
        """Append the newest RMSE of each eval set, if present in evals_log.

        Returns False so that training continues.
        """
        for eval_name, history in (('validation_0', self.train_rmse),
                                   ('validation_1', self.val_rmse)):
            if eval_name in evals_log and 'rmse' in evals_log[eval_name]:
                history.append(evals_log[eval_name]['rmse'][-1])
        return False  # continue training


In [None]:

recorder = RMSERecorder()

# define the model: XGBoost regressor with BERT embeddings
# eval_metric, early_stopping_rounds and callbacks are estimator parameters;
# passing them to fit() is deprecated and rejected by newer XGBoost releases.
xgb_regressor_with_bert = xgboost.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.7,
    reg_alpha=5.0,
    reg_lambda=5.0,
    tree_method='hist',
    device='cuda',
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=30,
    callbacks=[recorder],
)

# Train and record RMSE
# The training set comes first ('validation_0') and the hold-out set second
# ('validation_1'), which is the order RMSERecorder reads them in.
xgb_regressor_with_bert.fit(
    X_train, train_targets,
    eval_set=[(X_train, train_targets), (X_val, val_targets)],
    verbose=False,
)

# Visualize the RMSE training curves
plt.figure(figsize=(8,5))
plt.plot(recorder.train_rmse, label='Train RMSE')
plt.plot(recorder.val_rmse, label='Validation RMSE')
plt.xlabel("Boosting Round")
plt.ylabel("RMSE")
plt.title("Train vs Validation RMSE Curve (Bert Features)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Report the boosting round that early stopping selected for the BERT model.
best_iter = xgb_regressor_with_bert.best_iteration
print(f"Best iter at: {best_iter}")

## Validation

In [None]:
# Pairwise validation data with 'less_toxic' / 'more_toxic' comment columns.
# The path previously read "..input/..." (missing the slash after ".."),
# which does not resolve to the input directory used by the other reads.
val_data = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
val_data.head()

In [None]:
# Apply the same cleaning routine to both sides of every comparison pair.
for side in ('less_toxic', 'more_toxic'):
    val_data[side] = val_data[side].apply(text_cleaning)

In [None]:
# Embed both sides of each pair (less toxic first, then more toxic) ...
X_less, X_more = (
    get_cls_embeddings(val_data[side].tolist())
    for side in ('less_toxic', 'more_toxic')
)

# ... and score them with the BERT-feature regressor, in the same order.
p_less, p_more = (
    xgb_regressor_with_bert.predict(features)
    for features in (X_less, X_more)
)

In [None]:
# Share of pairs where the 'more_toxic' comment received the higher score.
ranked_correctly = p_less < p_more
accuracy = ranked_correctly.mean()
print(f"BERT + XGBoost Pairwise Accuracy: {accuracy:.4f}")

# TF-IDF (sparse, bag-of-words model)

Now we will use TF-IDF features

In [None]:
# TF-IDF vectorizer: sublinear term frequency, vocabulary limited to 50000
# features, with terms filtered by min_df=3 and max_df=0.5.
vec = TfidfVectorizer(
    max_features=50000,
    min_df=3,
    max_df=0.5,
    sublinear_tf=True,
)
# Learn the vocabulary on the training texts only, then reuse it unchanged
# for the validation texts.
X_train_tfidf = vec.fit_transform(train_texts)
X_val_tfidf = vec.transform(val_texts)

## Training

In [None]:

recorder = RMSERecorder()

# define the model: XGBoost regressor with TF-IDF features
# eval_metric, early_stopping_rounds and callbacks are estimator parameters;
# passing them to fit() is deprecated and rejected by newer XGBoost releases.
xgb_regressor_with_tf_idf = xgboost.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.7,
    reg_alpha=5.0,
    reg_lambda=5.0,
    tree_method='hist',
    device='cuda',
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=30,
    callbacks=[recorder],
)

# Train and Record RMSE
# The training set comes first ('validation_0') and the hold-out set second
# ('validation_1'), which is the order RMSERecorder reads them in.
xgb_regressor_with_tf_idf.fit(
    X_train_tfidf, train_targets,
    eval_set=[(X_train_tfidf, train_targets), (X_val_tfidf, val_targets)],
    verbose=False,
)

# visualize the RMSE training curves
plt.figure(figsize=(8,5))
plt.plot(recorder.train_rmse, label='Train RMSE')
plt.plot(recorder.val_rmse, label='Validation RMSE')
plt.xlabel("Boosting Round")
plt.ylabel("RMSE")
# Title previously read "IF-TDF", a typo for TF-IDF.
plt.title("Train vs Validation RMSE Curve (TF-IDF features)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Report the boosting round that early stopping selected for the TF-IDF model.
best_iter = xgb_regressor_with_tf_idf.best_iteration
print(f"Best iter at: {best_iter}")

## Validation

In [None]:
# Vectorize both sides of each pair with the vocabulary fitted on the
# training texts (less toxic first, then more toxic).
X_less_toxic, X_more_toxic = (
    vec.transform(val_data[side]) for side in ('less_toxic', 'more_toxic')
)

In [None]:
# Score both sides of each pair with the TF-IDF-feature regressor.
p_less, p_more = (
    xgb_regressor_with_tf_idf.predict(features)
    for features in (X_less_toxic, X_more_toxic)
)

In [None]:
# Share of pairs ranked correctly, i.e. where the 'more_toxic' comment
# received a strictly higher predicted score than the 'less_toxic' one.
ranked_correctly = p_less < p_more
accuracy = ranked_correctly.mean()
print(f"Pairwise Accuracy: {accuracy:.4f}")

# Test and submission

In [None]:
# Clean the comments to be scored with the same routine used for training text.
comm_score['text'] = comm_score['text'].apply(text_cleaning)

In [None]:
# X_test = vec.transform(comm_score['text'].tolist())

# p_test = xgb_regressor_with_tf_idf.predict(X_test)

In [None]:
# Final predictions use the BERT-feature model: embed the cleaned comments ...
X_test = get_cls_embeddings(comm_score['text'].tolist())

# ... and predict one severity score per comment.
p_test = xgb_regressor_with_bert.predict(X_test)

In [None]:
# Store the predictions next to the comments they belong to.
comm_score['score'] = p_test

In [None]:
comm_score['score'].count()

In [None]:
# Write only the id and score columns, without the DataFrame index.
comm_score[['comment_id', 'score']].to_csv("submission.csv", index=False)

In [None]:
comm_score.head()