# Install packages and import dependencies

In [1]:
# !rm -rf /kaggle/working/*

In [2]:
# %pip install --upgrade scikit-learn

In [3]:
from tqdm import tqdm
import os
import time
import re
import joblib
import torch
import logging
import nltk
import seaborn as sns
from imblearn.over_sampling import SMOTE
from nltk import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, log_loss, hinge_loss
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from collections import Counter
from scipy.stats import randint


# Helper and training functions

In [None]:
# ===========================================================================================
class MyLogger:
    """
    Thin wrapper around the root logger that writes INFO-level messages to
    both a log file and the console.

    Each (re)initialization replaces every handler currently attached to the
    root logger, so only the most recently configured log file receives
    messages.
    """
    def __init__(self, log_file='app.log'):
        """
        Initialize the logger with a log file.
        Parameters:
        - log_file (str): Path to the log file. Default is 'app.log'.
        """
        self.log_file = log_file
        self._initialize_logger()

    def _initialize_logger(self):
        """
        Attach a file handler and a console handler to the root logger.

        The log file is opened in append mode, so an existing file is extended
        and a missing file is created. Handlers that were previously attached
        to the root logger are removed AND closed, so switching log files does
        not leak open file descriptors.
        """
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # Build the new handlers first: if opening the file fails, the
        # previously installed handlers are left untouched.
        file_handler = logging.FileHandler(self.log_file, mode='a', encoding='utf-8')
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)

        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)

        # Detach and close old handlers to prevent duplicate output and to
        # release the file handle of the previous log file.
        for old_handler in list(self.logger.handlers):
            self.logger.removeHandler(old_handler)
            old_handler.close()

        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)

    def log_message(self, message):
        """Log an informational message."""
        self.logger.info(message)

    def change_log_file(self, new_log_file):
        """
        Change the log file and reinitialize the logger.
        Parameters:
        - new_log_file (str): Path to the new log file.
        """
        self.log_file = new_log_file
        self._initialize_logger()

# ============================================================================================
# Global variables
# Shared logger instance used by the functions below (writes to the default 'app.log' and the console)
logger = MyLogger()

# Explicitly set the tokenizers parallelism flag (here: enabled) to avoid the warning
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# ============================================================================================
# Processing functions
# ============================================================================================
# Load the English stop words into a set for fast membership tests
# NOTE(review): assumes the NLTK 'stopwords' corpus is already available; no nltk.download call is visible here - confirm
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """
    Drop every token whose lowercase form is in the module-level `stop_words` set.
    Parameters:
    - text (str): The text to process.
    Returns:
    - (str): The remaining tokens, in their original casing, joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        # Compare case-insensitively but keep the token as written
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def clean_text(text):
    """
    Clean and preprocess text by removing URLs, bracketed spans, HTML-like tags,
    control characters, tokens containing digits, and punctuation.

    Separators (tags, newlines, tabs, carriage returns, form feeds, backspaces)
    are replaced by a space rather than deleted, so the words on either side of
    them are not glued together (e.g. "hello\\nworld" -> "hello world").

    NOTE(review): if the training CSV was produced with the previous version of
    this function, re-check that train and inference preprocessing still agree.

    Parameters:
    - text (str): The text to clean.
    Returns:
    - (str): Lowercased, cleaned text with tokens separated by single spaces.
    """
    # Remove URLs ('http\S+' also covers the 'https://' form)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove square-bracketed spans such as "[1]"
    text = re.sub(r'\[.*?\]', '', text)
    # Replace HTML-like tags with a space so neighbouring words stay separate
    text = re.sub(r'<.*?>+', ' ', text)
    # Replace control characters with a space. '\x08' is the backspace
    # character; a bare '\b' in a regex is a zero-width word boundary.
    text = re.sub(r'[\n\t\r\f\x08]', ' ', text)
    # Remove every token that contains a digit
    text = re.sub(r'\w*\d\w*', '', text)
    # Collapse runs of non-word characters into single spaces
    text = re.sub(r'\W+', ' ', text).lower().strip()
    tokens = word_tokenize(text)
    return ' '.join(tokens)

# Load the dataset, balance it when that is possible, and split it
def load_data(file_path):
    """
    Load the dataset, check for class imbalance, and split it into train,
    validation, and test sets.

    The dataset counts as imbalanced when the smallest class has fewer than
    half as many rows as the largest class. In that case SMOTE is applied, but
    only when every feature column is numeric: SMOTE interpolates feature
    values and raises on text columns. For frames with non-numeric features
    the data is left unchanged and a message is logged.

    NOTE(review): when SMOTE does run here it runs before the split, so
    synthetic rows derived from test rows can end up in the training set.
    The splits are also not stratified. Both are left unchanged so that
    existing splits stay reproducible.

    Parameters:
    - file_path (str): Path to the CSV file. It must contain a 'label' column.
    Returns:
    - (tuple): DataFrames for the (possibly balanced) dataset, train, test, and validation sets.
    """
    df_ds = pd.read_csv(file_path)
    label_counts = df_ds['label'].value_counts()

    imbalance_threshold = 0.5
    minority_class_ratio = label_counts.min() / label_counts.max()

    df_ds_balanced = df_ds
    if minority_class_ratio < imbalance_threshold:
        X = df_ds.drop(columns=['label'])
        y = df_ds['label']
        features_are_numeric = X.shape[1] > 0 and X.select_dtypes(exclude='number').empty
        if features_are_numeric:
            smote = SMOTE(random_state=42)
            X_balanced, y_balanced = smote.fit_resample(X, y)
            df_ds_balanced = pd.DataFrame(X_balanced, columns=X.columns)
            df_ds_balanced['label'] = y_balanced
        else:
            # SMOTE cannot handle raw text; leave the rows untouched
            logger.log_message(
                f"Dataset is imbalanced (minority/majority ratio {minority_class_ratio:.3f}) but has "
                "non-numeric feature columns; skipping SMOTE in load_data."
            )

    # 80/20 train/test split, then 67/33 train/validation split of the remainder
    train_essays, test_essays = train_test_split(df_ds_balanced, test_size=0.2, random_state=42)
    train_essays, val_essays = train_test_split(train_essays, test_size=0.33, random_state=42)

    return df_ds_balanced, train_essays, test_essays, val_essays

def compute_metrics(preds, labels):
    """
    Calculate evaluation metrics for binary classification.

    Probabilities are thresholded at 0.5 to obtain hard predictions for
    accuracy, precision, recall, and F1; ROC AUC uses the raw probabilities.
    Precision, recall, and F1 all use `zero_division=0`, so an undefined
    score is reported as 0 without a warning.

    Parameters:
    - preds (array-like): Predicted probabilities of the positive class.
    - labels (array-like): Actual labels.
    Returns:
    - (dict): Dictionary of metrics (ROC AUC, accuracy, precision, recall, F1).
    """
    # Accept plain lists as well as numpy arrays
    preds = np.asarray(preds)
    binary_preds = (preds >= 0.5).astype(int)
    return {
        "roc_auc": roc_auc_score(labels, preds),
        "accuracy": accuracy_score(labels, binary_preds),
        "precision": precision_score(labels, binary_preds, zero_division=0),
        "recall": recall_score(labels, binary_preds, zero_division=0),
        "f1": f1_score(labels, binary_preds, zero_division=0),
    }

def sent2vec(s, glove_model):
    """
    Convert a sentence to a unit-length vector by summing word embeddings.

    The sentence is lowercased and tokenized; stop words, non-alphabetic
    tokens, and tokens missing from `glove_model` are ignored.

    Parameters:
    - s (str): Input sentence (any object is accepted and passed through `str`).
    - glove_model: Mapping from token to embedding vector that raises KeyError
      for unknown tokens (e.g. a gensim KeyedVectors or a dict).
    Returns:
    - (array): L2-normalized sum of the word vectors. A zero vector is returned
      when no token has an embedding or when the summed vector has zero norm.
      The zero vector's length is `glove_model.vector_size` when that attribute
      exists, otherwise 300.
    """
    tokens = word_tokenize(str(s).lower())  # This requires the 'punkt' tokenizer

    vectors = []
    for token in tokens:
        if token in stop_words or not token.isalpha():
            continue
        try:
            vectors.append(glove_model[token])  # Lookup word in GloVe model
        except KeyError:
            continue

    if not vectors:
        # Match the model's dimensionality so the stacked embeddings stay rectangular
        return np.zeros(getattr(glove_model, 'vector_size', 300))

    summed = np.array(vectors).sum(axis=0)
    norm = np.sqrt((summed ** 2).sum())
    if norm == 0:
        # Avoid dividing by zero (which would yield NaNs) when the vectors cancel out
        return np.zeros_like(summed)
    return summed / norm

# ============================================================================================
def binary_cross_entropy_loss(y_true, y_pred):
    """
    Mean binary cross-entropy between true labels and predicted probabilities.

    Probabilities are clipped to [1e-15, 1 - 1e-15] before taking logarithms
    so that predictions of exactly 0 or 1 do not produce infinities.

    Parameters:
    - y_true (array): True labels.
    - y_pred (array): Predicted probabilities.
    Returns:
    - (float): Binary cross-entropy loss.
    """
    tiny = 1e-15
    clipped = np.clip(y_pred, tiny, 1 - tiny)
    # Contribution of the positive and the negative class, per sample
    positive_term = y_true * np.log(clipped)
    negative_term = (1 - y_true) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)

# ============================================================================================
def shuffle_and_split(X, y, batch_size, random_state=None):
    """
    Shuffle the data and split it into batches.

    The number of batches is `len(y) // batch_size`, but never less than one,
    so inputs smaller than `batch_size` yield a single batch instead of an
    error. Because the samples are spread evenly over that many batches, a
    batch can hold more than `batch_size` samples.

    Parameters:
    - X (array): Feature data, indexable with an integer index array.
    - y (array): Labels, same length as X.
    - batch_size (int): Target batch size; must be positive.
    - random_state (int or None, optional): Seed for a reproducible shuffle.
      When None (default) numpy's global random state is used.
    Returns:
    - (tuple): List of batches for X and y, shuffled with the same permutation.
    Raises:
    - ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_samples = len(y)
    if random_state is None:
        indices = np.random.permutation(n_samples)
    else:
        indices = np.random.default_rng(random_state).permutation(n_samples)

    # Apply the same permutation to both arrays so rows and labels stay aligned
    X_shuffled = X[indices]
    y_shuffled = y[indices]

    # np.array_split requires at least one section
    num_batches = max(1, n_samples // batch_size)

    X_batches = np.array_split(X_shuffled, num_batches)
    y_batches = np.array_split(y_shuffled, num_batches)

    return X_batches, y_batches

# ============================================================================================
def train_logistic_regression(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path, n_epochs=20, batch_size=32):
    """
    Train (or load) a Logistic Regression model fitted with mini-batch SGD.

    If a file exists at `model_file_path`, the model is loaded from it and only
    evaluated on the training and validation sets; no training takes place.
    Otherwise a new `SGDClassifier` with logistic loss is trained incrementally
    with `partial_fit` on shuffled mini-batches. Class imbalance is handled by
    per-sample weights derived from 'balanced' class weights. The trained model
    is then saved to `model_file_path`, so later runs can load it instead of
    retraining.

    Parameters:
    - X_train_embeddings (numpy array): Feature embeddings for the training set.
    - y_train (numpy array): Labels for the training set.
    - X_val_embeddings (numpy array): Feature embeddings for the validation set
      (only used when a saved model is evaluated).
    - y_val (numpy array): Labels for the validation set
      (only used when a saved model is evaluated).
    - model_file_path (str): Path to save the trained model or load a pre-existing one.
    - out_base_path (str): Base path for saving output files (not used in this function).
    - n_epochs (int, optional): Number of passes over the training data. Default is 20.
    - batch_size (int, optional): Target mini-batch size. Default is 32.

    Returns:
    - model (SGDClassifier): The trained or loaded Logistic Regression model.
    """
    logger.log_message(f"Training a Logistic Regression model for {n_epochs} epochs...")

    # Check if model already exists
    if os.path.exists(model_file_path):
        print("Loading model from file...")
        # joblib.load unpickles the file: only load model files you created yourself
        model = joblib.load(model_file_path)
        print("Model loaded. Skipping training and evaluating on train/validation sets.")

        # Evaluate the loaded model without retraining
        preds_train = model.predict_proba(X_train_embeddings)[:, 1]
        preds_val = model.predict_proba(X_val_embeddings)[:, 1]

        train_loss = binary_cross_entropy_loss(y_train, preds_train)
        val_loss = binary_cross_entropy_loss(y_val, preds_val)

        train_metrics = compute_metrics(preds_train, y_train)
        val_metrics = compute_metrics(preds_val, y_val)

        print(f"Train Loss: {train_loss:.4f}, Train Metrics: {train_metrics}")
        print(f"Validation Loss: {val_loss:.4f}, Validation Metrics: {val_metrics}")

        return model

    # If model does not exist, train a new one
    print("Model not found. Training a new model...")
    model = SGDClassifier(loss='log_loss', max_iter=1, warm_start=True)

    # Calculate class weights for handling class imbalance
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weight_dict = {i: weight for i, weight in zip(classes, class_weights)}

    # Training loop for the specified number of epochs
    for epoch in tqdm(range(n_epochs), desc="Epochs", unit="epoch"):
        logger.log_message(f"Epoch {epoch + 1}/{n_epochs} - Shuffling and batching data")

        # Shuffle and split the training data into mini-batches
        X_train_batches, y_train_batches = shuffle_and_split(X_train_embeddings, y_train, batch_size)

        # Incrementally train the model using mini-batches
        for X_batch, y_batch in zip(X_train_batches, y_train_batches):
            # Calculate sample weights for each batch based on class weights
            sample_weights = np.array([class_weight_dict[label] for label in y_batch])
            model.partial_fit(X_batch, y_batch, classes=classes, sample_weight=sample_weights)

    # Save the trained model so the load branch above can reuse it on later runs
    joblib.dump(model, model_file_path)
    print(f"Model saved to {model_file_path}")
    logger.log_message(f"Model saved to {model_file_path}")

    return model

# ============================================
def train_xgboost(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path, n_estimators=100, learning_rate=0.01):
    """
    Train (or load) an XGBoost model for binary classification with early stopping.

    If a file exists at `model_file_path`, the model is loaded from it and
    returned without training. Otherwise a new `XGBClassifier` is trained with
    the training and validation sets as evaluation sets, stopping after 20
    rounds without improvement, and the fitted model is saved to
    `model_file_path`. Class imbalance is handled through `scale_pos_weight`
    (negative count / positive count in `y_train`).

    Early stopping is configured on the estimator itself, because passing
    `early_stopping_rounds` to `fit()` is deprecated since XGBoost 1.6 and no
    longer accepted by newer releases. As a consequence, re-fitting the
    returned estimator requires an `eval_set` again.
    NOTE(review): per the XGBoost docs, the last metric of `eval_metric` on the
    last `eval_set` entry (here 'error' on the validation set) drives early
    stopping - confirm against the installed version.

    Parameters:
    - X_train_embeddings (numpy array): Feature embeddings for the training set.
    - y_train (numpy array): Labels for the training set (labels 0 and 1).
    - X_val_embeddings (numpy array): Feature embeddings for the validation set.
    - y_val (numpy array): Labels for the validation set.
    - model_file_path (str): Path to save the trained model or load an existing one.
    - out_base_path (str): Base path for saving output files (not used in this function).
    - n_estimators (int, optional): Maximum number of boosting rounds. Default is 100.
    - learning_rate (float, optional): Learning rate (shrinkage factor). Default is 0.01.

    Returns:
    - model (XGBClassifier): The trained or loaded XGBoost model.

    Raises:
    - ValueError: If a new model must be trained and `y_train` has no samples with label 1.
    """
    logger.log_message(f"Training an XGBoost model with {n_estimators} boosting rounds and learning rate {learning_rate}...")

    # Check if model already exists
    if os.path.exists(model_file_path):
        print("Loading model from file...")
        logger.log_message("Loading model from file...")
        # joblib.load unpickles the file: only load model files you created yourself
        return joblib.load(model_file_path)

    print("Model not found. Training a new model...")
    logger.log_message("Model not found. Training a new model...")

    # Calculate scale_pos_weight for imbalanced dataset
    counter = Counter(y_train)
    if counter[1] == 0:
        raise ValueError("y_train contains no samples with label 1; cannot compute scale_pos_weight")
    scale_pos_weight = counter[0] / counter[1]
    logger.log_message(f"Calculated scale_pos_weight: {scale_pos_weight}")

    # Define the XGBoost model with supported evaluation metrics
    model = xgb.XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        eval_metric=['auc', 'logloss', 'error'],  # Supported metrics
        early_stopping_rounds=20,
        objective='binary:logistic',
        scale_pos_weight=scale_pos_weight,
        max_delta_step=1
    )

    # Train the model with evaluation on the training and validation sets
    model.fit(
        X_train_embeddings, y_train,
        eval_set=[(X_train_embeddings, y_train), (X_val_embeddings, y_val)],
        verbose=True
    )

    # Save the trained model
    joblib.dump(model, model_file_path)
    print(f"Model saved to {model_file_path}")
    logger.log_message(f"Model saved to {model_file_path}")

    return model

# ============================================
def train_random_forest(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path):
    """
    Train (or load) a Random Forest model tuned with RandomizedSearchCV.

    If a file exists at `model_file_path`, the model is loaded from it and
    returned without tuning or evaluation. Otherwise 5 random combinations of
    `n_estimators` (50-499) and `max_depth` (1-19) are scored by 5-fold
    cross-validated ROC AUC. The best estimator, which the search refits on
    the full training set by default, is evaluated on the training and
    validation sets and saved to `model_file_path`.

    Parameters:
    - X_train_embeddings (numpy array): Feature embeddings for the training set.
    - y_train (numpy array): Labels for the training set.
    - X_val_embeddings (numpy array): Feature embeddings for the validation set.
    - y_val (numpy array): Labels for the validation set.
    - model_file_path (str): Path to save the trained model or load an existing one.
    - out_base_path (str): Base path for saving output files (not used in this function).

    Returns:
    - model (RandomForestClassifier): The trained or loaded Random Forest model.
    """
    logger.log_message("Training a Random Forest model with hyperparameter tuning...")

    # Check if model file already exists
    if os.path.exists(model_file_path):
        print("Loading model from file...")
        # joblib.load unpickles the file: only load model files you created yourself
        model = joblib.load(model_file_path)
        print("Model loaded. Skipping hyperparameter tuning and metric plotting.")
        return model

    print("Model not found. Performing hyperparameter tuning...")
    logger.log_message("Model not found. Performing hyperparameter tuning...")

    # Define the parameter distribution for RandomizedSearchCV
    # (scipy's randint excludes the upper bound)
    param_dist = {
        'n_estimators': randint(50, 500),  # Number of trees in the forest
        'max_depth': randint(1, 20)        # Maximum depth of the trees
    }

    # Create a RandomForestClassifier model
    rf = RandomForestClassifier(random_state=42)

    # Use RandomizedSearchCV to find the best hyperparameters
    rand_search = RandomizedSearchCV(
        rf,
        param_distributions=param_dist,
        n_iter=5,             # Number of random samples to draw from the parameter distribution
        cv=5,                 # 5-fold cross-validation
        scoring='roc_auc',    # Optimize for ROC AUC score
        random_state=42,
        n_jobs=-1             # Use all available CPU cores
    )

    # Fit the random search object to the training data
    rand_search.fit(X_train_embeddings, y_train)

    # With the default refit=True the best estimator is already fitted on the
    # full training data, so no additional fit call is needed.
    model = rand_search.best_estimator_
    print(f"Best parameters found: {rand_search.best_params_}")

    # Evaluate the model on the training set
    preds_train = model.predict_proba(X_train_embeddings)[:, 1]
    train_loss = log_loss(y_train, preds_train)
    train_auc = roc_auc_score(y_train, preds_train)
    train_accuracy = accuracy_score(y_train, (preds_train >= 0.5).astype(int))

    # Evaluate the model on the validation set
    preds_val = model.predict_proba(X_val_embeddings)[:, 1]
    val_loss = log_loss(y_val, preds_val)
    val_auc = roc_auc_score(y_val, preds_val)
    val_accuracy = accuracy_score(y_val, (preds_val >= 0.5).astype(int))

    # Report training and validation metrics
    print(f"Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation AUC: {val_auc:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Save the trained model to disk
    joblib.dump(model, model_file_path)
    print(f"Model saved to {model_file_path}")
    logger.log_message(f"Model saved to {model_file_path}")

    return model

# ============================================
def train_linear_svc(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path):
    """
    Train (or load) a LinearSVC model for binary classification.

    A model file found at `model_file_path` is loaded and returned as is.
    Otherwise a LinearSVC with balanced class weights is fitted on the
    training data and saved to `model_file_path`.

    Parameters:
    - X_train_embeddings (numpy array): Feature embeddings for the training set.
    - y_train (numpy array): Labels for the training set.
    - X_val_embeddings (numpy array): Feature embeddings for the validation set (not used in this function).
    - y_val (numpy array): Labels for the validation set (not used in this function).
    - model_file_path (str): Path to save the trained model or load an existing one.
    - out_base_path (str): Base path for saving output files (not used in this function).

    Returns:
    - model (LinearSVC): The trained or loaded LinearSVC model.
    """
    logger.log_message("Training a LinearSVC model...")

    # Reuse a previously saved model when one is available
    if os.path.exists(model_file_path):
        print("Loading model from file...")
        return joblib.load(model_file_path)

    print("Model not found. Training a new model...")

    svc_settings = dict(
        max_iter=246,             # Maximum number of iterations for convergence
        class_weight='balanced',  # Handle class imbalance
        tol=1e-4,                 # Tolerance for stopping criteria
        C=1.0,                    # Regularization parameter
    )
    model = LinearSVC(**svc_settings)
    model.fit(X_train_embeddings, y_train)

    # Persist the fitted model and report where it was written
    joblib.dump(model, model_file_path)
    saved_message = f"Model saved to {model_file_path}"
    print(saved_message)
    logger.log_message(saved_message)

    return model

# ============================================================================================


In [5]:
# Initialize the number of parallel worker threads (currently disabled)
# max_workers = 1 

# Kaggle input (read-only dataset) and output (working) directories
in_base_path = r"/kaggle/input/dath-pdz/"
out_base_path = r"/kaggle/working/"

# Build the dataset path with os.path.join so the separator is handled correctly
file_name = os.path.join(in_base_path, 'final_dataset_v1_afternb1.csv')  # Dataset CSV inside the input directory

# Start tracking the total run time
t_start_time = time.time()

# Load the dataset and split it into train / test / validation frames
df_ds, train_essays, test_essays, val_essays = load_data(file_name)

# Check the size of each set
print(f'Full set size: {len(df_ds)}')
print(f'Training set size: {len(train_essays)}')
print(f'Validation set size: {len(val_essays)}')
print(f'Test set size: {len(test_essays)}')

# Record the same sizes in the log file (the logger also echoes them to the console)
logger.log_message(f'Full set size: {len(df_ds)}')
logger.log_message(f'Training set size: {len(train_essays)}')
logger.log_message(f'Validation set size: {len(val_essays)}')
logger.log_message(f'Test set size: {len(test_essays)}')

# logger.log_message(f"Finished processing (total) in {t_hours} hours, {t_minutes} minutes, {t_seconds} seconds")     

# ============================================================================================
# Load the GloVe model (text file in word2vec format).
# The file is read directly from the Kaggle input directory. gensim's get_tmpfile() builds
# paths inside a temporary folder and is not needed for an absolute path.
word2vec_output_file = r"/kaggle/input/pdz-dath-ds/output_w2v.txt"
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Prepare train and validation texts and labels
X_train = train_essays['processed_text_swr'].tolist()
X_val = val_essays['processed_text_swr'].tolist()
y_train = train_essays['label'].values
y_val = val_essays['label'].values

# Prepare test data
X_test = test_essays['processed_text_swr'].tolist()
y_test = test_essays['label'].values

# Generate one sentence embedding per essay using the GloVe model
print("Generating embeddings for train, validation, and test sets...")
X_train_embeddings = np.array([sent2vec(sent, glove_model) for sent in X_train])
X_val_embeddings = np.array([sent2vec(sent, glove_model) for sent in X_val])
X_test_embeddings = np.array([sent2vec(sent, glove_model) for sent in X_test])

# ============================================================================================
# Apply SMOTE to balance the training set only (validation and test sets stay untouched).
# SMOTE and Counter are already imported in the notebook's import cell.
# NOTE: this cell overwrites X_train_embeddings and y_train with their balanced versions.

print("\nApplying SMOTE to balance the training set...")
smote = SMOTE(sampling_strategy='not majority', random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_embeddings, y_train)

# Check class distribution after SMOTE
print("Label distribution after SMOTE:")
print(Counter(y_train_balanced))

# Convert the balanced data back to arrays
X_train_embeddings = np.array(X_train_balanced)
y_train = np.array(y_train_balanced)

print("\nDataset sizes after applying SMOTE:")
print(f"Balanced Training set size: {len(X_train_embeddings)}")
print(f"Validation set size: {len(X_val_embeddings)}")
print(f"Test set size: {len(X_test_embeddings)}")

################################
# Competition data
# Read the competition files, then run the competition test essays through
# clean_text ('text' -> 'processed_text') and remove_stop_words
# ('processed_text' -> 'processed_text_swr') before embedding them.
################################

df_competition = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")

test_essays_competition = (
    pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
    .assign(processed_text=lambda frame: frame['text'].apply(clean_text))
    .assign(processed_text_swr=lambda frame: frame['processed_text'].apply(remove_stop_words))
)

X_test_comp = test_essays_competition['processed_text_swr'].tolist()

# One sent2vec vector per competition essay, stacked into a single array
X_test_comp_embeddings = np.array(
    [sent2vec(essay, glove_model) for essay in X_test_comp]
)

################################

# ============================================================================================
# Logistic Regression submission (this is the section that is currently active)
# ============================================================================================

logger.log_message("============================================================================================")

# Model file path handed to train_logistic_regression
logistic_model_file_path = os.path.join(out_base_path, 'logistic_regression_model.pkl')

# Signature, for reference:
# train_logistic_regression(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path, n_epochs=20, batch_size=32)
model = train_logistic_regression(
    X_train_embeddings,
    y_train,
    X_val_embeddings,
    y_val,
    logistic_model_file_path,
    out_base_path,
)

print("Testing Logistic Regression Model")
predictions = model.predict_proba(X_test_comp_embeddings)

# Column 1 of predict_proba is written out as the 'generated' score for each essay id.
output = pd.DataFrame(
    {
        'id': test_essays_competition['id'],
        'generated': predictions[:, 1],
    }
)
output.to_csv('submission.csv', index=False)

print("Submission was successfully saved!")

# ============================================================================================
# If you use XGB to submit, uncomment these lines
# ============================================================================================

# logger.log_message("============================================================================================")

# # Train the XGBoost model using the train and validation sets
# xgboost_model_file_path = os.path.join(out_base_path, 'xgboost_model.pkl')

# model = train_xgboost(X_train_embeddings, y_train, X_val_embeddings, y_val, xgboost_model_file_path, out_base_path)

# print("Testing XGB Model")
# predictions = model.predict_proba(X_test_comp_embeddings)
# output = pd.DataFrame({'id': test_essays_competition.id, 'generated': predictions[:, 1]})
# output.to_csv('submission.csv', index=False)

# print("Submission was successfully saved!")

# ============================================================================================
# If you use RF to submit, uncomment these lines
# ============================================================================================

# logger.log_message("============================================================================================")

# # Train the Random Forest model using the train and validation sets
# randomforest_model_file_path = os.path.join(out_base_path, 'randomforest_model.pkl')

# # def train_random_forest(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path):
# model = train_random_forest(X_train_embeddings, y_train, X_val_embeddings, y_val, randomforest_model_file_path, out_base_path)

# print("Testing RF Model")
# predictions = model.predict_proba(X_test_comp_embeddings)
# output = pd.DataFrame({'id': test_essays_competition.id, 'generated': predictions[:, 1]})
# output.to_csv('submission.csv', index=False)

# print("Submission was successfully saved!")

# ============================================================================================
# LINEAR SVC CANNOT BE SUBMITTED YET (LinearSVC exposes no predict_proba; it is only evaluated on the local test split)
# ============================================================================================

# logger.log_message("============================================================================================")

# # Train the Linear SVC model using the train and validation sets
# linear_svc_model_file_path = os.path.join(out_base_path, 'linearsvc_model.pkl')
# model = train_linear_svc(X_train_embeddings, y_train, X_val_embeddings, y_val, linear_svc_model_file_path, out_base_path)

# # Test the Linear SVC model on the test set
# print("Testing linear SVC Model")
# test_linearsvc(X_test_embeddings, y_test, linear_svc_model_file_path)

# ============================================================================================
# TO SUBMIT THE BERT MODEL INSTEAD, COMMENT OUT THIS CELL AND UNCOMMENT THE FOLLOWING CELL
# ============================================================================================

# Stop the wall-clock timer
t_end_time = time.time()
t_processing_time = t_end_time - t_start_time

# Break the elapsed time (in seconds) down into whole hours, minutes and seconds
_hours_part, _leftover_seconds = divmod(t_processing_time, 3600)
_minutes_part, _leftover_seconds = divmod(_leftover_seconds, 60)
t_hours = int(_hours_part)  # whole hours
t_minutes = int(_minutes_part)  # whole minutes past the hour
t_seconds = int(_leftover_seconds)  # whole seconds past the minute

logger.log_message(f"Finished processing (total) in {t_hours} hours, {t_minutes} minutes, {t_seconds} seconds")


2024-11-13 08:24:03,253 - INFO - Full set size: 73463
2024-11-13 08:24:03,255 - INFO - Training set size: 39375
2024-11-13 08:24:03,256 - INFO - Validation set size: 19395
2024-11-13 08:24:03,257 - INFO - Test set size: 14693
2024-11-13 08:24:03,259 - INFO - loading projection weights from /kaggle/input/pdz-dath-ds/output_w2v.txt


Full set size: 73463
Training set size: 39375
Validation set size: 19395
Test set size: 14693


2024-11-13 08:25:58,246 - INFO - KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /kaggle/input/pdz-dath-ds/output_w2v.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2024-11-13T08:25:58.246584', 'gensim': '4.3.3', 'python': '3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]', 'platform': 'Linux-5.15.154+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


Generating embeddings for train, validation, and test sets...

Applying SMOTE to balance the training set...


2024-11-13 08:29:07,301 - INFO - Training a Logistic Regression model for 20 epochs...


Label distribution after SMOTE:
Counter({0: 24636, 1: 24636})

Dataset sizes after applying SMOTE:
Balanced Training set size: 49272
Validation set size: 19395
Test set size: 14693
Model not found. Training a new model...


Epochs:   0%|          | 0/20 [00:00<?, ?epoch/s]2024-11-13 08:29:07,320 - INFO - Epoch 1/20 - Shuffling and batching data
Epochs:   5%|▌         | 1/20 [00:01<00:19,  1.02s/epoch]2024-11-13 08:29:08,343 - INFO - Epoch 2/20 - Shuffling and batching data
Epochs:  10%|█         | 2/20 [00:02<00:18,  1.01s/epoch]2024-11-13 08:29:09,344 - INFO - Epoch 3/20 - Shuffling and batching data
Epochs:  15%|█▌        | 3/20 [00:03<00:17,  1.01s/epoch]2024-11-13 08:29:10,344 - INFO - Epoch 4/20 - Shuffling and batching data
Epochs:  20%|██        | 4/20 [00:04<00:16,  1.01s/epoch]2024-11-13 08:29:11,355 - INFO - Epoch 5/20 - Shuffling and batching data
Epochs:  25%|██▌       | 5/20 [00:05<00:15,  1.02s/epoch]2024-11-13 08:29:12,405 - INFO - Epoch 6/20 - Shuffling and batching data
Epochs:  30%|███       | 6/20 [00:06<00:14,  1.04s/epoch]2024-11-13 08:29:13,489 - INFO - Epoch 7/20 - Shuffling and batching data
Epochs:  35%|███▌      | 7/20 [00:07<00:13,  1.04s/epoch]2024-11-13 08:29:14,508 - INFO - E

Testing Logistic Regression Model
Submission was successfully saved!


In [6]:
# import os
# import pandas as pd
# import numpy as np
# import tensorflow as tf
# import keras_nlp
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.optimizers import Adam

# # Set paths for Kaggle
# in_base_path = r"/kaggle/input/dath-pdz/"
# out_base_path = r"/kaggle/working/"
# model_input_path = "/kaggle/input/distil_bert/keras/distil_bert_base_en_uncased/2"

# # Load and preprocess data
# file_name = os.path.join(in_base_path, 'final_dataset_v1_afternb1.csv')
# df_ds, train_essays, test_essays, val_essays = load_data(file_name)

# # Load the competition dataset
# df_competition = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
# df_competition['processed_text'] = df_competition['text'].apply(clean_text)
# df_competition['processed_text_swr'] = df_competition['processed_text'].apply(remove_stop_words)
# X_test_comp = df_competition['processed_text_swr'].tolist()

# # Load the DistilBERT model using KerasNLP
# preset = "distil_bert_base_en_uncased"

# def train_and_evaluate_distilbert(train_essays, val_essays, model_file_path, n_epochs=1):
#     # Load the preprocessor and model from KerasNLP
#     preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(preset, sequence_length=160)
#     classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset, preprocessor=preprocessor, num_classes=2)

#     # Compile the model
#     optimizer = Adam(learning_rate=2e-5, weight_decay=1e-5)
#     classifier.compile(
#         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#         optimizer=optimizer,
#         metrics=["accuracy"]
#     )
    
#     # Update model file path with `.keras` extension
#     model_file_with_extension = f"{model_file_path}.keras"

#     # Check if a trained model already exists
#     if os.path.exists(model_file_with_extension):
#         print("Loading pre-trained model...")
#         classifier = tf.keras.models.load_model(model_file_with_extension)
#     else:
#         print("Training a new model...")

#         # Prepare training and validation data
#         X_train = train_essays['processed_text_swr'].tolist()
#         y_train = train_essays['label'].values
#         X_val = val_essays['processed_text_swr'].tolist()
#         y_val = val_essays['label'].values

#         # Train the model using raw text inputs (let KerasNLP handle preprocessing)
#         classifier.fit(
#             x=X_train,
#             y=y_train,
#             batch_size=32,
#             epochs=n_epochs,
#             validation_data=(X_val, y_val)
#         )

#         # Save the trained model with the `.keras` extension
#         classifier.save(f"{model_file_path}.keras")
#         print("Model saved successfully!")

#     return classifier

# # Train or load the model
# # classifier = tf.keras.models.load_model(model_file_path)
# distilbert_model_path = os.path.join(out_base_path, 'distilbert_model')
# classifier = train_and_evaluate_distilbert(train_essays, val_essays, distilbert_model_path)

# # Get predictions on the competition dataset using raw text inputs
# y_pred_logits = classifier.predict(X_test_comp)

# # Convert logits to probabilities using softmax
# y_pred_probs = tf.nn.softmax(y_pred_logits, axis=-1).numpy()

# # Extract probabilities for the second class (AI-generated)
# ai_generated_probs = y_pred_probs[:, 1]

# # Print sample probabilities
# print("Sample Predicted Probabilities:", ai_generated_probs[:10])

# # Prepare the output DataFrame for submission
# output = pd.DataFrame({'id': df_competition['id'], 'generated': ai_generated_probs})
# output.to_csv(os.path.join(out_base_path, 'submission.csv'), index=False)

# print("Submission was successfully saved!")

# Main Function

In [7]:
# Preview only the first rows; the competition test file can be much larger than this sample.
test_essays_competition.head()

Unnamed: 0,id,prompt_id,text,processed_text,processed_text_swr
0,0000aaaa,2,Aaa bbb ccc.,aaa bbb ccc,aaa bbb ccc
1,1111bbbb,3,Bbb ccc ddd.,bbb ccc ddd,bbb ccc ddd
2,2222cccc,4,CCC ddd eee.,ccc ddd eee,ccc ddd eee


In [8]:
# Preview only the first few embedding rows instead of dumping the whole array.
X_test_comp_embeddings[:5]

array([[-3.16240229e-02, -5.43002225e-02, -3.26591767e-02,
         7.19418377e-02, -1.03809163e-01,  3.37816961e-02,
        -1.48627302e-02,  4.55846786e-02, -7.42680058e-02,
         1.14390205e-04, -3.09963105e-03,  3.17860804e-02,
         8.05525575e-03, -1.87187977e-02, -1.64997950e-02,
        -8.53928328e-02, -7.34825358e-02,  1.14354268e-02,
        -4.15860377e-02, -3.43110040e-02, -3.36581767e-02,
        -2.76853051e-02,  5.85033260e-02, -2.82382015e-02,
         1.11086778e-01, -8.38946998e-02,  1.47264050e-02,
         1.31532460e-01, -3.75723243e-02,  1.71150658e-02,
        -6.83340356e-02,  4.04334329e-02,  1.11439683e-01,
        -4.79017161e-02,  7.10884780e-02,  8.10474902e-02,
        -9.55445021e-02, -6.56884164e-02, -1.88885760e-02,
        -7.72663504e-02,  2.02020146e-02,  3.85955907e-02,
        -1.72143225e-02,  1.04575038e-01, -1.05291888e-01,
        -6.40322939e-02,  1.79702658e-02, -6.32659420e-02,
        -4.21437174e-02, -1.15867026e-01,  2.44418122e-0

In [9]:
# !rm -rf /kaggle/working/*

In [10]:
# !zip -r file.zip /kaggle/working

# End