# Import packages

In [None]:
import json
import logging
import os
import re
import string
import time
from collections import Counter

from tqdm import tqdm
import joblib
import torch
import nltk
import seaborn as sns
from imblearn.over_sampling import SMOTE
from nltk import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, log_loss, hinge_loss
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from scipy.stats import randint


# Cute processing functions

In [None]:
# ===========================================================================================
# Create the Logger class
class MyLogger:
    """
    Custom logger that writes INFO-level messages to both a file and the console.

    The class configures the *root* logger (``logging.getLogger()``), so every
    instance shares the same underlying logger object. Each (re)initialisation
    removes all handlers currently attached to the root logger and installs a
    fresh file handler plus a console handler.
    """

    # Format shared by the file and console handlers.
    LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

    def __init__(self, log_file='app.log'):
        """
        Initialize the logger with a log file.

        Parameters:
        - log_file (str): Path to the log file. Default is 'app.log'.
        """
        self.log_file = log_file
        # Handlers created by this instance; tracked so they can be closed
        # when the log file is switched.
        self._owned_handlers = []
        self._initialize_logger()

    def _initialize_logger(self):
        """
        Set up the root logger with one file handler and one console handler.

        The file is opened in append mode, which creates it when it does not
        exist yet and appends to it otherwise. Handlers previously created by
        this instance are closed so the old log file's descriptor is released.
        """
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)

        # Build the new handlers first: if opening the file fails, the
        # existing logging configuration is left untouched.
        formatter = logging.Formatter(self.LOG_FORMAT)

        file_handler = logging.FileHandler(self.log_file, mode='a', encoding='utf-8')
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)

        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)

        # Detach every handler currently on the root logger to prevent
        # duplicated output.
        for attached in list(self.logger.handlers):
            self.logger.removeHandler(attached)

        # Close only the handlers this instance created; removing a handler
        # does not close its stream, so without this the old file stays open.
        for stale in getattr(self, '_owned_handlers', []):
            stale.close()

        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        self._owned_handlers = [file_handler, console_handler]

    def log_message(self, message):
        """Log an informational message."""
        self.logger.info(message)

    def change_log_file(self, new_log_file):
        """
        Change the log file and reinitialize the logger.

        Parameters:
        - new_log_file (str): Path to the new log file.
        """
        self.log_file = new_log_file
        self._initialize_logger()

# ============================================================================================
# Global variables
# Module-wide logger instance used by the training/testing helpers below.
logger = MyLogger()

# Explicitly set the tokenizers parallelism flag (here: enabled) to avoid the warning
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# ============================================================================================
# Processing functions
# ============================================================================================
# Load the English stop words once, as a set, for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stop_words(text):
    """
    Return *text* with English stop words removed.

    The text is tokenized with NLTK's ``word_tokenize``; a token is dropped
    when its lowercase form is in the module-level ``stop_words`` set. The
    surviving tokens keep their original casing and are joined with single
    spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        # Comparison is case-insensitive; the token itself is left untouched.
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def clean_text(text):
    """
    Normalize raw text for downstream tokenization / vectorization.

    Steps, in order:
    1. Remove URLs.
    2. Remove text enclosed in square brackets.
    3. Remove text enclosed in angle brackets (e.g. HTML tags).
    4. Replace newlines, tabs, carriage returns, form feeds and backspace
       characters with a space.
    5. Remove words that contain digits.
    6. Collapse runs of non-word characters into one space, lowercase, strip.
    7. Tokenize with NLTK and re-join the tokens with single spaces.

    Parameters:
    - text (str): The raw input text.

    Returns:
    - str: The cleaned, lowercase, space-separated text.
    """
    # Step 1: Remove URLs
    text = re.sub(r'http\S+|https?://\S+|www\.\S+', '', text)

    # Step 2: Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Step 3: Remove angle brackets and their content
    text = re.sub(r'<.*?>+', '', text)

    # Step 4: Replace control whitespace with a space. Replacing with '' would
    # glue the last word of one line to the first word of the next. Backspace
    # is written as \x08 because \b in a regex is a word-boundary assertion.
    text = re.sub(r'[\n\t\r\f\x08]+', ' ', text)

    # Step 5: Remove words that contain numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Step 6: Replace any run of non-word characters with a space, then lowercase
    text = re.sub(r'\W+', ' ', text).lower().strip()

    # Step 7: Tokenize with NLTK and join the tokens back into a string
    tokens = word_tokenize(text)
    return ' '.join(tokens)

# Fix this load and separate dataset if it is imbalanced
def load_data(file_path):
    """
    Load a labelled CSV, optionally balance it with SMOTE, and split it.

    The CSV must contain a ``label`` column. When the smallest class has fewer
    than 50% of the rows of the largest class, SMOTE (random_state=42) is
    applied to all remaining columns to oversample the minority class.

    The (possibly balanced) frame is split 80/20 into train/test, and the
    train part is split again 67/33 into train/validation
    (both with random_state=42).

    Parameters:
    - file_path (str): Path of the CSV file to read.

    Returns:
    - tuple: (full_frame, train_frame, test_frame, validation_frame)

    NOTE(review): SMOTE runs before the train/test split, so synthetic samples
    derived from test rows may end up in the training set - confirm this is
    intended. SMOTE presumably also needs purely numeric feature columns.
    """
    frame = pd.read_csv(file_path)

    # Report the class distribution of the raw data
    counts = frame['label'].value_counts()
    print("Label distribution before balancing:")
    print(counts)

    # The data counts as imbalanced when minority/majority falls below this ratio
    imbalance_threshold = 0.5
    needs_balancing = (counts.min() / counts.max()) < imbalance_threshold

    if not needs_balancing:
        print("Data is already balanced. Proceeding without SMOTE.")
        df_ds_balanced = frame
    else:
        print("Data is imbalanced. Applying SMOTE to balance the dataset...")

        features = frame.drop(columns=['label'])
        target = frame['label']

        # Oversample the minority class
        resampled_features, resampled_target = SMOTE(random_state=42).fit_resample(features, target)

        # Rebuild a single DataFrame from the resampled parts
        df_ds_balanced = pd.DataFrame(resampled_features, columns=features.columns)
        df_ds_balanced['label'] = resampled_target

        print("Label distribution after balancing:")
        print(df_ds_balanced['label'].value_counts())

    # 80% train+validation / 20% test
    train_and_val, test_essays = train_test_split(df_ds_balanced, test_size=0.2, random_state=42)

    # 67% train / 33% validation of the remaining data
    train_essays, val_essays = train_test_split(train_and_val, test_size=0.33, random_state=42)

    return df_ds_balanced, train_essays, test_essays, val_essays

def compute_metrics(preds, labels):
    """
    Compute binary-classification metrics from positive-class scores.

    Parameters:
    - preds: 1-D array-like of scores for the positive class. Values >= 0.5
      are counted as a positive prediction.
    - labels: 1-D array-like of ground-truth binary labels.

    Returns:
    - dict with keys "roc_auc", "accuracy", "precision", "recall", "f1".
    """
    # Accept lists as well as arrays; the comparison below needs an ndarray.
    preds = np.asarray(preds)

    # Convert scores to hard 0/1 predictions
    binary_preds = (preds >= 0.5).astype(int)

    # ROC AUC uses the raw scores, not the thresholded predictions
    auc = roc_auc_score(labels, preds)

    # zero_division=0 on every ratio metric (recall included, matching the
    # sibling metric helpers) so undefined metrics return 0 without a warning.
    accuracy = accuracy_score(labels, binary_preds)
    precision = precision_score(labels, binary_preds, zero_division=0)
    recall = recall_score(labels, binary_preds, zero_division=0)
    f1 = f1_score(labels, binary_preds, zero_division=0)

    return {"roc_auc": auc, "accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

def compute_metrics_fix_0911(preds, labels):
    """
    Compute binary-classification metrics, accepting 1-D or 2-D predictions.

    Parameters:
    - preds (np.ndarray): Either a 1-D array of positive-class scores, or a
      2-D array whose second column holds the positive-class scores.
    - labels: Ground-truth binary labels.

    Returns:
    - dict with keys 'roc_auc', 'accuracy', 'precision', 'recall', 'f1'.

    NOTE(review): the 0.5 threshold presumes the scores are probabilities;
    callers passing raw logits should convert them first.
    """
    # For 2-D input keep only the positive-class column
    scores = preds[:, 1] if preds.ndim == 2 else preds

    # Hard 0/1 decisions at a 0.5 threshold
    hard_preds = (scores >= 0.5).astype(int)

    metrics = {'roc_auc': roc_auc_score(labels, scores)}
    metrics['accuracy'] = accuracy_score(labels, hard_preds)
    # zero_division=0 makes undefined ratios evaluate to 0 without warnings
    metrics['precision'] = precision_score(labels, hard_preds, zero_division=0)
    metrics['recall'] = recall_score(labels, hard_preds, zero_division=0)
    metrics['f1'] = f1_score(labels, hard_preds, zero_division=0)
    return metrics

def compute_metrics_bert(eval_pred):
    """
    Compute evaluation metrics from a ``(logits, labels)`` pair.

    Parameters:
    - eval_pred: A 2-tuple ``(logits, labels)``. The logits are softmaxed over
      the last axis; column 1 is used as the positive-class probability.

    Returns:
    - dict with keys "roc_auc", "accuracy", "precision", "recall", "f1".
      "roc_auc" is NaN when the labels contain a single class only.
    """
    logits, labels = eval_pred

    # Convert logits to probabilities using softmax
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()

    # Predicted class = class with the highest probability
    preds = np.argmax(probs, axis=-1)

    # AUC is undefined unless both classes are present in the labels
    if len(np.unique(labels)) > 1:
        auc = roc_auc_score(labels, probs[:, 1])
    else:
        auc = float('nan')

    # zero_division=0 on every ratio metric (recall included, matching the
    # sibling metric helpers) so undefined metrics return 0 without a warning.
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, zero_division=0)
    recall = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)

    return {"roc_auc": auc, "accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

def plot_tsne(model, num):
    """
    Plot a 2-D t-SNE projection of up to ``num`` word vectors.

    Parameters:
    - model: Word-vector model exposing ``key_to_index`` (iterable of words)
      and ``model[word]`` vector lookup.
    - num (int): Maximum number of non-stop-word vectors to plot, taken in
      ``key_to_index`` iteration order.

    Raises:
    - ValueError: If no word vectors are available to plot.
    """
    labels = []
    vectors = []
    for word in model.key_to_index:
        # Stop once enough words are collected instead of materialising the
        # whole vocabulary and slicing afterwards.
        if len(vectors) >= num:
            break
        if word not in stop_words:
            vectors.append(np.array(model[word]))
            labels.append(word)

    if not vectors:
        raise ValueError("No word vectors available to plot.")

    # Cap the perplexity so small vocabularies do not make t-SNE reject it.
    perplexity = min(40, max(1, len(vectors) - 1))

    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
    # confirm against the installed version.
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    data = tsne.fit_transform(np.array(vectors))

    plt.figure(figsize=(10, 10))
    # Iterate over the points actually produced; the vocabulary may hold
    # fewer than `num` usable words.
    for (x_coord, y_coord), label in zip(data, labels):
        plt.scatter(x_coord, y_coord)
        plt.annotate(label,
                     xy=(x_coord, y_coord),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

# Assuming glove_model is already loaded in your environment
# Function to convert a sentence to a vector
def sent2vec(s, glove_model):
    """
    Convert a sentence into a single L2-normalized embedding vector.

    The sentence is lowercased and tokenized; stop words and non-alphabetic
    tokens are dropped. The vectors of the remaining words found in
    ``glove_model`` are summed and the sum is scaled to unit length.

    Parameters:
    - s: The sentence (converted with ``str``).
    - glove_model: Mapping-like word-vector model supporting
      ``glove_model[word]`` and raising ``KeyError`` for unknown words. When
      it exposes ``vector_size`` that dimension is used for the zero-vector
      fallback; otherwise 300 is assumed.

    Returns:
    - np.ndarray: The normalized sentence vector, or a zero vector when no
      word of the sentence has an embedding.
    """
    tokens = word_tokenize(str(s).lower())  # This requires the 'punkt' tokenizer

    vectors = []
    for token in tokens:
        # Skip stop words and non-alphabetic tokens
        if token in stop_words or not token.isalpha():
            continue
        try:
            vectors.append(glove_model[token])
        except KeyError:
            # Out-of-vocabulary word: ignore it
            continue

    if not vectors:
        # No usable word: return a zero vector of the model's dimensionality
        return np.zeros(getattr(glove_model, 'vector_size', 300))

    summed = np.array(vectors).sum(axis=0)
    norm = np.sqrt((summed ** 2).sum())
    if norm == 0:
        # Vectors cancelled out; avoid dividing by zero and producing NaNs
        return np.zeros(summed.shape)
    return summed / norm

# ============================================================================================
def plot_training_validation_curves(train_metrics, val_metrics, save_path, file_name):
    """
    Plot training and validation metrics, save the figure, then display it.

    Supports Accuracy, Loss, AUC, Precision, Recall, and F1-score. Only the
    metrics present in both dictionaries are drawn.

    Parameters:
    - train_metrics (dict): Metric name -> list of values on the training set.
    - val_metrics (dict): Metric name -> list of values on the validation set.
    - save_path (str): Output directory; created if it does not exist.
    - file_name (str): File name of the saved plot inside ``save_path``.

    If the first training metric holds exactly one value, a grouped bar chart
    is drawn; otherwise one line plot per metric is drawn on a 3x2 grid.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    metric_names = {
        'accuracy': 'Accuracy',
        'loss': 'Loss',
        'roc_auc': 'AUC',
        'precision': 'Precision',
        'recall': 'Recall',
        'f1': 'F1 Score'
    }
    shared_metrics = [m for m in metric_names if m in train_metrics and m in val_metrics]

    # Single-point data means only one epoch was run; an empty dict is
    # treated as "not single point" instead of raising StopIteration.
    first_series = next(iter(train_metrics.values()), [])
    single_point = len(first_series) == 1

    if single_point:
        # Bar plot for single-point metrics
        tick_labels = [metric.capitalize() for metric in shared_metrics]
        train_values = [train_metrics[metric][0] for metric in shared_metrics]
        val_values = [val_metrics[metric][0] for metric in shared_metrics]

        x = range(len(tick_labels))
        plt.figure(figsize=(10, 5))
        plt.bar(x, train_values, width=0.4, label='Train', align='center')
        plt.bar([i + 0.4 for i in x], val_values, width=0.4, label='Validation', align='center')
        plt.xticks(x, tick_labels)
        plt.ylabel('Score')
        plt.title('Training and Validation Metrics')
        plt.legend()
        # No plt.show() here: showing before savefig would release the figure
        # and the saved image would be blank.
    else:
        # Line plot for metrics over multiple epochs
        plt.figure(figsize=(15, 10))
        for position, metric in enumerate(metric_names, start=1):
            if metric not in shared_metrics:
                continue
            plt.subplot(3, 2, position)
            plt.plot(train_metrics[metric], label=f'Train {metric_names[metric]}')
            plt.plot(val_metrics[metric], label=f'Validation {metric_names[metric]}')
            plt.title(f'{metric_names[metric]} over epochs')
            plt.xlabel('Epochs')
            plt.ylabel(metric_names[metric])
            plt.legend()
            plt.grid(True)

        plt.tight_layout()

    # Save first, then display
    plot_path = os.path.join(save_path, file_name)
    plt.savefig(plot_path)
    plt.show()

    print(f"Plot saved to: {plot_path}")

# Custom function to calculate binary cross-entropy loss
def binary_cross_entropy_loss(y_true, y_pred):
    """
    Compute the mean binary cross-entropy (log loss).

    Parameters:
    - y_true: Array-like of ground-truth labels (0 or 1).
    - y_pred: Array-like of predicted positive-class probabilities.

    Returns:
    - float: The mean binary cross-entropy over all samples.
    """
    epsilon = 1e-15  # Small constant to avoid log(0)

    # Convert both inputs so plain lists work and the arithmetic is
    # positional (never index-aligned).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), epsilon, 1 - epsilon)

    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

def test_model(X_test_embeddings, y_test, model_file_path):
    """
    Evaluate a persisted probabilistic classifier on the test set.

    The model is loaded with joblib, positive-class probabilities are taken
    from ``predict_proba``, and the binary cross-entropy plus the metrics of
    ``compute_metrics`` are logged and printed.

    Parameters:
    - X_test_embeddings: Test features accepted by the model's ``predict_proba``.
    - y_test: Ground-truth binary labels of the test set.
    - model_file_path (str): Path of the joblib-serialized model.

    Returns:
    - dict: {"loss": ...} merged with the metrics from ``compute_metrics``.
    """
    classifier = joblib.load(model_file_path)

    # Probability of the positive class for every test sample
    positive_probs = classifier.predict_proba(X_test_embeddings)[:, 1]

    test_loss = binary_cross_entropy_loss(y_test, positive_probs)
    test_metrics = compute_metrics(positive_probs, y_test)

    # The same three fragments are logged one by one and printed as one line
    report_lines = (
        f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_metrics['accuracy']:.4f}, ",
        f"Test ROC AUC: {test_metrics['roc_auc']:.4f}, Precision: {test_metrics['precision']:.4f}, ",
        f"Recall: {test_metrics['recall']:.4f}, F1: {test_metrics['f1']:.4f}",
    )

    logger.log_message("Testing Results:")
    for line in report_lines:
        logger.log_message(line)

    print("".join(report_lines))

    summary = {"loss": test_loss}
    summary.update(test_metrics)
    return summary

def test_linearsvc(X_test_embeddings, y_test, model_file_path):
    """
    Evaluate a persisted LinearSVC-style model on the test set.

    The model is loaded with joblib and scored via ``decision_function``.
    Hinge loss and ROC AUC are computed from the continuous decision values;
    accuracy, precision, recall and F1 from the sign of those values.

    Parameters:
    - X_test_embeddings: Test features accepted by ``decision_function``.
    - y_test: Ground-truth binary labels of the test set.
    - model_file_path (str): Path of the joblib-serialized model.

    Returns:
    - dict with keys "loss", "accuracy", "precision", "recall", "f1", "roc_auc".
    """
    # Load the trained model
    model = joblib.load(model_file_path)

    # Decision-function values (signed margins, not probabilities)
    preds_test = model.decision_function(X_test_embeddings)

    # Hinge loss is the natural loss for SVM-style models
    test_loss = hinge_loss(y_test, preds_test)

    # A positive margin means the positive class
    preds_test_binary = (preds_test > 0).astype(int)

    # zero_division=0 keeps undefined ratios at 0 without warnings,
    # consistent with the other metric helpers in this file.
    test_accuracy = accuracy_score(y_test, preds_test_binary)
    test_precision = precision_score(y_test, preds_test_binary, zero_division=0)
    test_recall = recall_score(y_test, preds_test_binary, zero_division=0)
    test_f1 = f1_score(y_test, preds_test_binary, zero_division=0)

    # ROC AUC must use the continuous scores: thresholded 0/1 predictions
    # collapse the ROC curve to a single operating point.
    test_roc_auc = roc_auc_score(y_test, preds_test)

    # Log test metrics
    logger.log_message("Testing Results:")
    logger.log_message(f"Test Loss (Hinge Loss): {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, ")
    logger.log_message(f"Test ROC AUC: {test_roc_auc:.4f}, Precision: {test_precision:.4f}, ")
    logger.log_message(f"Recall: {test_recall:.4f}, F1: {test_f1:.4f}")

    # Print test metrics for console
    print(f"Test Loss (Hinge Loss): {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, "
          f"Test ROC AUC: {test_roc_auc:.4f}, Precision: {test_precision:.4f}, "
          f"Recall: {test_recall:.4f}, F1: {test_f1:.4f}")

    # Return all test metrics
    return {
        "loss": test_loss,
        "accuracy": test_accuracy,
        "precision": test_precision,
        "recall": test_recall,
        "f1": test_f1,
        "roc_auc": test_roc_auc
    }

def test_rnn(X_test, y_test, model_file_path):
    """
    Evaluate a saved TensorFlow/Keras model on the test set.

    The model is loaded from ``model_file_path``, run over the test data in
    batches of 32, and its outputs are passed through a sigmoid before the
    log loss and the metrics of ``compute_metrics`` are computed.

    Parameters:
    - X_test: Test inputs accepted by ``tf.data.Dataset.from_tensor_slices``.
    - y_test: Ground-truth binary labels of the test set.
    - model_file_path (str): Path of the saved Keras model.

    Returns:
    - dict: {"loss": ...} merged with the metrics from ``compute_metrics``.

    Raises:
    - FileNotFoundError: If ``model_file_path`` does not exist.

    NOTE(review): applying a sigmoid assumes the model emits raw logits -
    confirm against the model's output layer.
    """
    # Guard clause: fail early when the model file is missing
    if not os.path.exists(model_file_path):
        raise FileNotFoundError(f"Model file not found at {model_file_path}")

    print("Loading model from file...")
    model = tf.keras.models.load_model(model_file_path)

    # Batched TensorFlow dataset for inference
    batch_size = 32
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

    # Raw model outputs -> probabilities via sigmoid (binary classification)
    raw_outputs = model.predict(test_dataset)
    preds_test = tf.sigmoid(raw_outputs).numpy().flatten()

    # Cross-entropy loss and the remaining metrics
    test_loss = log_loss(y_test, preds_test)
    test_metrics = compute_metrics(preds_test, y_test)

    # The same three fragments are logged one by one and printed as one line
    report_lines = (
        f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_metrics['accuracy']:.4f}, ",
        f"Test ROC AUC: {test_metrics['roc_auc']:.4f}, Precision: {test_metrics['precision']:.4f}, ",
        f"Recall: {test_metrics['recall']:.4f}, F1: {test_metrics['f1']:.4f}",
    )

    logger.log_message("Testing Results:")
    for line in report_lines:
        logger.log_message(line)

    print("".join(report_lines))

    summary = {"loss": test_loss}
    summary.update(test_metrics)
    return summary

def test_distilbert(X_test, model_file_path):
    """
    Evaluate a saved DistilBERT sequence classifier on the test dataset.

    The model and tokenizer are loaded from ``model_file_path``, the
    ``processed_text_swr`` column of ``X_test`` is tokenized (max length 128),
    and predictions are produced through a ``Trainer``. The returned logits
    are converted to probabilities before metrics are computed.

    Parameters:
    - X_test (pd.DataFrame): Test data with a ``processed_text_swr`` text
      column. NOTE(review): a label column is presumably required so that
      ``Trainer.predict`` returns ``label_ids`` - confirm the column name.
    - model_file_path (str): Directory holding the saved model and tokenizer.

    Returns:
    - dict with keys 'roc_auc', 'accuracy', 'precision', 'recall', 'f1'.

    Raises:
    - FileNotFoundError: If ``model_file_path`` does not exist.
    - ValueError: If the predictions carry no labels.
    """
    # Load the trained model and tokenizer
    if os.path.exists(model_file_path):
        print("Loading model and tokenizer from file...")
        model = AutoModelForSequenceClassification.from_pretrained(model_file_path)
        tokenizer = AutoTokenizer.from_pretrained(model_file_path)
    else:
        raise FileNotFoundError(f"Model file not found at {model_file_path}")

    # Create the test dataset
    test_dataset = Dataset.from_pandas(X_test)

    # Tokenize the test dataset
    tokenized_test = test_dataset.map(lambda examples: tokenizer(examples['processed_text_swr'], max_length=128, padding=True, truncation=True), batched=True)

    # Initialize the Trainer (no training arguments needed for testing)
    trainer = Trainer(model=model, tokenizer=tokenizer)

    # Perform predictions on the test set
    predictions = trainer.predict(tokenized_test)

    # Extract the raw logits and labels
    logits_test = predictions.predictions
    if isinstance(logits_test, tuple):
        # Some models return extra outputs; the logits come first
        logits_test = logits_test[0]
    labels_test = predictions.label_ids
    if labels_test is None:
        raise ValueError("Test dataset has no labels; cannot compute metrics.")

    # The metric helper thresholds column 1 at 0.5, which is only meaningful
    # for probabilities, so the logits are softmaxed first.
    probs_test = torch.nn.functional.softmax(torch.tensor(logits_test), dim=-1).numpy()

    # Compute metrics for the test set
    test_metrics = compute_metrics_fix_0911(probs_test, labels_test)

    # Log and print the test metrics
    logger.log_message("Testing Results:")
    logger.log_message(f"Test Accuracy: {test_metrics['accuracy']:.4f}, Test ROC AUC: {test_metrics['roc_auc']:.4f}, "
                       f"Precision: {test_metrics['precision']:.4f}, Recall: {test_metrics['recall']:.4f}, F1: {test_metrics['f1']:.4f}")

    print(f"Test Accuracy: {test_metrics['accuracy']:.4f}, Test ROC AUC: {test_metrics['roc_auc']:.4f}, "
          f"Precision: {test_metrics['precision']:.4f}, Recall: {test_metrics['recall']:.4f}, F1: {test_metrics['f1']:.4f}")

    return test_metrics

# ============================================================================================
def shuffle_and_split(X, y, batch_size):
    """
    Shuffle the data and split it into batches.

    Rows of ``X`` and entries of ``y`` are permuted with the same random
    permutation (``np.random.permutation``) and split into
    ``max(1, len(y) // batch_size)`` nearly equal batches.

    Parameters:
    - X: Feature rows; ndarray, pandas object, list or tuple.
    - y: Labels aligned with ``X``; ndarray, pandas object, list or tuple.
    - batch_size (int): Target batch size; must be positive.

    Returns:
    - tuple: (X_batches, y_batches), two lists of equal length. Both lists
      are empty when there are no samples.

    Raises:
    - ValueError: If ``batch_size`` is not positive.
    """
    def _positional(data):
        # pandas objects index by label with [], which misaligns X and y
        # after an earlier shuffle/split; work on the underlying array.
        if hasattr(data, 'to_numpy'):
            return data.to_numpy()
        if isinstance(data, (list, tuple)):
            return np.asarray(data)
        return data

    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    X = _positional(X)
    y = _positional(y)

    n_samples = len(y)
    if n_samples == 0:
        return [], []

    indices = np.random.permutation(n_samples)  # Shuffle the indices
    X_shuffled = X[indices]
    y_shuffled = y[indices]

    # At least one batch, even when there are fewer samples than batch_size
    # (np.array_split rejects 0 sections).
    num_batches = max(1, n_samples // batch_size)
    X_batches = np.array_split(X_shuffled, num_batches)
    y_batches = np.array_split(y_shuffled, num_batches)

    return X_batches, y_batches
# ============================================================================================
def train_logistic_regression(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path, n_epochs=20, batch_size=32):
    """
    Train (or load) a logistic-regression classifier fitted with mini-batch SGD.

    If ``model_file_path`` exists, the model is loaded with joblib, evaluated
    on the train and validation sets (results are printed) and returned
    without training. Otherwise an ``SGDClassifier`` with log loss is trained
    for ``n_epochs`` epochs via ``partial_fit`` on freshly shuffled batches,
    weighting samples by 'balanced' class weights.

    Parameters:
    - X_train_embeddings, y_train: Training features and labels.
    - X_val_embeddings, y_val: Validation features and labels (used only when
      an existing model is loaded).
    - model_file_path (str): Path of the joblib-serialized model to look for.
    - out_base_path (str): Currently unused in this function.
    - n_epochs (int): Number of passes over the training data.
    - batch_size (int): Target batch size passed to ``shuffle_and_split``.

    Returns:
    - The loaded or newly trained model.

    Note: per-epoch evaluation, saving the trained model and plotting the
    curves are currently disabled here; a newly trained model is NOT written
    to ``model_file_path``.
    """
    logger.log_message(f"Training a Logistic Regression model for {n_epochs} epochs...")

    if os.path.exists(model_file_path):
        # A persisted model exists: load and evaluate it, do not retrain.
        print("Loading model from file...")
        model = joblib.load(model_file_path)
        print("Model loaded. Skipping training and evaluating on train/validation sets.")

        train_probs = model.predict_proba(X_train_embeddings)[:, 1]
        val_probs = model.predict_proba(X_val_embeddings)[:, 1]

        train_loss = binary_cross_entropy_loss(y_train, train_probs)
        val_loss = binary_cross_entropy_loss(y_val, val_probs)

        train_metrics = compute_metrics(train_probs, y_train)
        val_metrics = compute_metrics(val_probs, y_val)

        print(f"Train Loss: {train_loss:.4f}, Train Metrics: {train_metrics}")
        print(f"Validation Loss: {val_loss:.4f}, Validation Metrics: {val_metrics}")

        return model

    # No persisted model: train a new one
    print("Model not found. Training a new model...")
    model = SGDClassifier(loss='log_loss', max_iter=1, warm_start=True)

    # 'balanced' class weights, looked up per sample below
    classes = np.unique(y_train)
    weight_by_class = dict(zip(classes, compute_class_weight('balanced', classes=classes, y=y_train)))

    for epoch_number in tqdm(range(1, n_epochs + 1), desc="Epochs", unit="epoch"):
        logger.log_message(f"Epoch {epoch_number}/{n_epochs} - Shuffling and batching data")

        batches = shuffle_and_split(X_train_embeddings, y_train, batch_size)

        for features, targets in zip(*batches):
            # Per-sample weights derived from the class weights
            weights = np.array([weight_by_class[target] for target in targets])
            model.partial_fit(features, targets, classes=classes, sample_weight=weights)

    # Disabled in the original implementation and left disabled here:
    #   - per-epoch train/validation loss and metric tracking
    #   - joblib.dump(model, model_file_path)
    #   - plot_training_validation_curves(..., "LogisticRegressionAnalysis.png")

    return model

# ============================================
def train_xgboost(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path, n_estimators=100, learning_rate=0.01):
    """
    Train (or load) an XGBoost binary classifier.

    If ``model_file_path`` exists, the model is loaded with joblib and
    returned. Otherwise an ``XGBClassifier`` is trained with early stopping
    (20 rounds) evaluated on the train and validation sets, then saved with
    joblib.

    Parameters:
    - X_train_embeddings, y_train: Training features and labels.
    - X_val_embeddings, y_val: Validation features and labels (early stopping).
    - model_file_path (str): Where the model is loaded from / saved to.
    - out_base_path (str): Currently unused in this function.
    - n_estimators (int): Maximum number of boosting rounds.
    - learning_rate (float): Boosting learning rate.

    Returns:
    - The loaded or newly trained model.

    Raises:
    - ZeroDivisionError: If ``y_train`` contains no sample labelled 1.
    """
    logger.log_message(f"Training an XGBoost model with {n_estimators} boosting rounds and learning rate {learning_rate}...")

    # scale_pos_weight = negatives / positives compensates class imbalance.
    # Assumes labels are encoded as 0 and 1.
    counter = Counter(y_train)
    scale_pos_weight = counter[0] / counter[1]
    logger.log_message(f"Calculated scale_pos_weight: {scale_pos_weight}")

    # Reuse a persisted model when available
    if os.path.exists(model_file_path):
        print("Loading model from file...")
        logger.log_message("Loading model from file...")
        return joblib.load(model_file_path)

    print("Model not found. Training a new model...")
    logger.log_message("Model not found. Training a new model...")

    # early_stopping_rounds is given to the constructor, next to eval_metric:
    # recent XGBoost releases no longer accept it as a fit() argument.
    model = xgb.XGBClassifier(
        use_label_encoder=False,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        eval_metric=['auc', 'logloss', 'error'],  # Supported metrics
        objective='binary:logistic',
        scale_pos_weight=scale_pos_weight,
        max_delta_step=1,
        early_stopping_rounds=20,
    )

    # Early stopping uses the last entry of eval_set (the validation set)
    model.fit(
        X_train_embeddings, y_train,
        eval_set=[(X_train_embeddings, y_train), (X_val_embeddings, y_val)],
        verbose=True
    )

    # Save the trained model
    joblib.dump(model, model_file_path)
    print(f"Model saved to {model_file_path}")
    logger.log_message(f"Model saved to {model_file_path}")

    return model

# ============================================
def train_random_forest(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path):
    """
    Train (or load) a Random Forest classifier with randomized hyperparameter search.

    If ``model_file_path`` exists, the model is loaded with joblib and
    returned. Otherwise ``RandomizedSearchCV`` (5 candidates, 5-fold CV,
    ROC AUC scoring) tunes ``n_estimators`` and ``max_depth``; the best
    estimator is evaluated on the train and validation sets and saved.

    Parameters:
    - X_train_embeddings, y_train: Training features and labels.
    - X_val_embeddings, y_val: Validation features and labels.
    - model_file_path (str): Where the model is loaded from / saved to.
    - out_base_path (str): Currently unused in this function.

    Returns:
    - The loaded or newly trained model.
    """
    logger.log_message("Training a Random Forest model with hyperparameter tuning...")

    if os.path.exists(model_file_path):
        print("Loading model from file...")
        model = joblib.load(model_file_path)
        print("Model loaded. Skipping hyperparameter tuning and metric plotting.")
        return model

    print("Model not found. Performing hyperparameter tuning...")

    # Search space for the randomized search
    param_dist = {
        'n_estimators': randint(50, 500),
        'max_depth': randint(1, 20)
    }

    rand_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=param_dist,
        n_iter=5,
        cv=5,
        scoring='roc_auc',
        random_state=42,
        n_jobs=-1
    )
    rand_search.fit(X_train_embeddings, y_train)

    # With the default refit=True, best_estimator_ is already fitted on the
    # full training data; fitting it again would only repeat that work.
    model = rand_search.best_estimator_
    print(f"Best parameters found: {rand_search.best_params_}")

    # Evaluate the model on the training and validation sets
    preds_train = model.predict_proba(X_train_embeddings)[:, 1]
    preds_val = model.predict_proba(X_val_embeddings)[:, 1]

    train_loss = log_loss(y_train, preds_train)
    train_auc = roc_auc_score(y_train, preds_train)
    train_accuracy = accuracy_score(y_train, (preds_train >= 0.5).astype(int))
    val_loss = log_loss(y_val, preds_val)
    val_auc = roc_auc_score(y_val, preds_val)
    val_accuracy = accuracy_score(y_val, (preds_val >= 0.5).astype(int))

    print(f"Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}, Val Accuracy: {val_accuracy:.4f}")

    joblib.dump(model, model_file_path)
    print(f"Model saved to {model_file_path}")
    logger.log_message(f"Model saved to {model_file_path}")

    return model

# ============================================
def train_linear_svc(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path):
    """
    Return a LinearSVC classifier, loading it from disk when a saved copy exists.

    Parameters:
    - X_train_embeddings: Feature matrix the classifier is fitted on.
    - y_train: Training labels.
    - X_val_embeddings: Accepted for interface parity; not read by this function.
    - y_val: Accepted for interface parity; not read by this function.
    - model_file_path (str): joblib file used both as cache and as save target.
    - out_base_path (str): Accepted for interface parity; not read by this function.

    Returns:
    - The loaded or freshly fitted LinearSVC model.
    """
    logger.log_message("Training a LinearSVC model...")

    # A previously saved model wins over retraining.
    if os.path.exists(model_file_path):
        print("Loading model from file...")
        return joblib.load(model_file_path)

    print("Model not found. Training a new model...")
    svc_settings = {"max_iter": 246, "class_weight": 'balanced', "tol": 1e-4, "C": 1.0}
    classifier = LinearSVC(**svc_settings)
    classifier.fit(X_train_embeddings, y_train)

    # Validation metrics and learning/validation-curve plots are currently
    # disabled, which is why the validation arguments go unused.

    joblib.dump(classifier, model_file_path)
    saved_message = f"Model saved to {model_file_path}"
    print(saved_message)
    logger.log_message(saved_message)

    return classifier

# ============================================
def f1_score_cal(precision, recall):
    """
    Compute the F1 score (harmonic mean) from precision and recall.

    A small constant (1e-7) is added to the denominator, so the result is 0
    rather than a division error when both inputs are zero.
    """
    overlap = precision * recall
    denominator = precision + recall + 1e-7
    return 2 * overlap / denominator

def train_rnn(X_train, y_train, X_val, y_val, model_file_path, plot_file_path, n_epochs=20, batch_size=32):
    """
    Train a Bidirectional LSTM text classifier with TensorFlow/Keras, or load it from disk.

    If `model_file_path` exists, the saved model is loaded and returned without
    any training. Otherwise a TextVectorization -> Embedding -> BiLSTM -> Dense
    model is built, trained, saved to `model_file_path`, and its per-epoch
    metrics are printed and logged.

    Parameters:
    - X_train: Training inputs; they are fed to a TextVectorization layer, so raw strings are expected.
    - y_train: Training labels (binary; the loss is binary cross-entropy).
    - X_val: Validation inputs, same format as X_train.
    - y_val: Validation labels.
    - model_file_path (str): Location the model is loaded from / saved to.
    - plot_file_path (str): Currently unused (plotting is disabled); kept so existing callers keep working.
    - n_epochs (int): Number of training epochs. Default is 20.
    - batch_size (int): Batch size of the tf.data pipelines. Default is 32.

    Returns:
    - The loaded or newly trained Keras model. Its output is a raw logit, not a probability.
    """
    # A saved model short-circuits everything else, so check for it before
    # spending time on building input pipelines.
    if os.path.exists(model_file_path):
        print("Loading model from file...")
        logger.log_message("Loading model from file...")
        return tf.keras.models.load_model(model_file_path)

    print("Model not found. Training a new model...")
    logger.log_message("Model not found. Training a new model...")

    # Create TensorFlow datasets from the input data
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

    # Text vectorization layer; the vocabulary is learned from the training texts only
    VOCAB_SIZE = 1000
    encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
    encoder.adapt(train_dataset.map(lambda text, label: text))

    # Define the Bidirectional LSTM RNN model
    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)  # No sigmoid activation here, using logits
    ])

    # The model emits logits, so every metric has to interpret them as such:
    # AUC applies the sigmoid itself via `from_logits=True`, and a logit
    # threshold of 0.0 is the same decision boundary as probability 0.5.
    # NOTE(review): the 'accuracy' alias is left as-is; it appears to threshold
    # the raw logit at 0.5 rather than 0.0 - confirm against the installed
    # Keras version before relying on the reported accuracy.
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=[
            'accuracy',
            tf.keras.metrics.AUC(name='auc', from_logits=True),
            tf.keras.metrics.Precision(name='precision', thresholds=0.0),
            tf.keras.metrics.Recall(name='recall', thresholds=0.0),
        ]
    )

    # Train the model
    history = model.fit(train_dataset, epochs=n_epochs, validation_data=val_dataset)

    # Save the model
    model.save(model_file_path)
    print(f"Model saved to {model_file_path}")
    logger.log_message(f"Model saved to {model_file_path}")

    def report(message):
        """Send one line to both the console and the log file."""
        print(message)
        logger.log_message(message)

    # Log and print the metrics
    report("\n--- Training Metrics ---")
    records = history.history
    # Use the number of epochs actually recorded so a shortened run cannot
    # cause an IndexError.
    for epoch in range(len(records['loss'])):
        report(f"Epoch {epoch + 1}/{n_epochs}")

        precision = records['precision'][epoch]
        val_precision = records['val_precision'][epoch]
        recall = records['recall'][epoch]
        val_recall = records['val_recall'][epoch]
        f1 = f1_score_cal(precision, recall)
        val_f1 = f1_score_cal(val_precision, val_recall)

        report(f"Accuracy: {records['accuracy'][epoch]:.4f}, Validation Accuracy: {records['val_accuracy'][epoch]:.4f}")
        report(f"Loss: {records['loss'][epoch]:.4f}, Validation Loss: {records['val_loss'][epoch]:.4f}")
        report(f"AUC: {records['auc'][epoch]:.4f}, Validation AUC: {records['val_auc'][epoch]:.4f}")
        report(f"Precision: {precision:.4f}, Validation Precision: {val_precision:.4f}")
        report(f"Recall: {recall:.4f}, Validation Recall: {val_recall:.4f}")
        report(f"F1 Score: {f1:.4f}, Validation F1 Score: {val_f1:.4f}")
        print()

    return model

# ============================================
def preprocess_function(examples, tokenizer):
    """
    Tokenize the "processed_text_swr" field of a batch of examples.

    The tokenizer is called with max_length=128, padding and truncation
    enabled, and its result is returned unchanged.
    """
    texts = examples["processed_text_swr"]
    tokenizer_options = {"max_length": 128, "padding": True, "truncation": True}
    return tokenizer(texts, **tokenizer_options)
    
def plot_graphs(log_history, metric):
    """
    Plot the training and evaluation curves of one metric against epochs.

    Parameters:
    - log_history: Iterable of dict-like log entries. Entries containing the
      key `metric` feed the evaluation curve, entries containing
      `train_<metric>` feed the training curve. Every such entry must also
      carry an 'epoch' key.
    - metric (str): Name of the metric to plot.

    The curves are drawn on the current matplotlib axes; nothing is shown or saved.
    """
    def collect(key):
        """Return (epochs, values) taken from the entries that contain `key`."""
        # Each value is paired with the epoch of its own entry. Slicing a
        # combined epoch list would misplace points when train and eval
        # entries are interleaved.
        matching = [entry for entry in log_history if key in entry]
        return [entry['epoch'] for entry in matching], [entry[key] for entry in matching]

    train_epochs, train_metric_values = collect(f"train_{metric}")
    eval_epochs, eval_metric_values = collect(metric)

    # Plot both training and evaluation metrics
    plt.plot(train_epochs, train_metric_values, label=f"Train {metric}")
    plt.plot(eval_epochs, eval_metric_values, label=f"Eval {metric}")
    plt.xlabel("Epochs")
    plt.ylabel(metric.capitalize())
    plt.title(f"{metric.capitalize()} over Epochs")
    plt.legend()
    plt.grid(True)

import os
import pandas as pd
import tensorflow as tf
import keras_nlp
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

def train_distilbert(df_ds, model_file_path, n_epochs=20):
    """
    Load a saved DistilBERT classifier or fine-tune a new one, then report test accuracy.

    The data is split 80/20 into (train + validation) / test, and the first
    part is split again 67/33 into train / validation, both with
    random_state=42. The test split is evaluated whether the model was loaded
    or trained.

    Parameters:
    - df_ds: DataFrame with a 'text' column (raw essays) and a 'label' column.
    - model_file_path (str): Path the model is loaded from / saved to.
    - n_epochs (int): Number of fine-tuning epochs. Default is 20.

    Returns:
    - The loaded or newly trained classifier.
    """
    # Split up front: the test split is needed by the final evaluation on
    # both the "load" and the "train" path.
    train_essays, test_essays = train_test_split(df_ds, test_size=0.2, random_state=42)
    train_essays, val_essays = train_test_split(train_essays, test_size=0.33, random_state=42)

    def texts_and_labels(dataframe):
        """Return the raw texts and the label array of one split."""
        # Raw strings are passed on purpose: the classifier carries its own
        # preprocessor, so tokenizing here as well would preprocess twice.
        return dataframe['text'].tolist(), dataframe['label'].values

    X_test, y_test = texts_and_labels(test_essays)

    if os.path.exists(model_file_path):
        print("Loading model from file...")
        classifier = tf.keras.models.load_model(model_file_path)
        print("Model loaded successfully!")
    else:
        print("Model not found. Training a new model...")

        # The preset model is only built when training is actually required.
        preset = "distil_bert_base_en_uncased"
        preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
            preset,
            sequence_length=160,
            name="preprocessor_for_distilbert"
        )
        classifier = keras_nlp.models.DistilBertClassifier.from_preset(
            preset,
            preprocessor=preprocessor,
            num_classes=2  # Adjust this based on your dataset
        )

        optimizer = Adam(learning_rate=2e-5, weight_decay=1e-5)
        classifier.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=optimizer,
            metrics=["accuracy"]
        )

        X_train, y_train = texts_and_labels(train_essays)
        X_val, y_val = texts_and_labels(val_essays)

        classifier.fit(
            x=X_train,
            y=y_train,
            batch_size=32,
            epochs=n_epochs,
            validation_data=(X_val, y_val)
        )

        # NOTE(review): newer Keras versions may require a ".keras"/".h5"
        # extension here - confirm that callers pass a suitable path.
        classifier.save(model_file_path)
        print("Model saved successfully!")

    # Evaluate the model on the test set
    test_loss, test_accuracy = classifier.evaluate(X_test, y_test)
    print(f"Test Accuracy: {test_accuracy:.4f}")

    return classifier

def evaluate_trained_model(model_file_path, df_ds):
    """
    Evaluate a saved Hugging Face sequence-classification model on the validation split.

    Parameters:
    - model_file_path (str): Directory both the model and its tokenizer are loaded from.
    - df_ds: DataFrame holding the full dataset; it must contain the
      'processed_text_swr' column read by `preprocess_function`.

    Returns:
    - dict: The metrics returned by `Trainer.evaluate()`.
    """
    # Load model and tokenizer from the saved path
    model = AutoModelForSequenceClassification.from_pretrained(model_file_path)
    tokenizer = AutoTokenizer.from_pretrained(model_file_path)

    # Reproduce the two-stage split (random_state=42) that train_distilbert
    # uses. Only the validation part is evaluated, so the other parts are not
    # converted or tokenized.
    train_essays, _ = train_test_split(df_ds, test_size=0.2, random_state=42)
    _, val_essays = train_test_split(train_essays, test_size=0.33, random_state=42)

    val_essay_dataset = Dataset.from_pandas(val_essays)
    tokenized_val_essays = val_essay_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer), batched=True
    )

    # Set up the Trainer
    training_args = TrainingArguments(
        output_dir="/kaggle/working/",
        per_device_eval_batch_size=16,
        no_cuda=False,  # Set True if you are not using GPU
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_val_essays,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_bert  # defined elsewhere in this file
    )

    # Evaluate the model
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)

    return eval_results

# ============================================================================================


# Main Function

In [3]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import keras_nlp
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

# Kaggle input/output locations and the KerasNLP preset used by this cell
in_base_path = r"/kaggle/input/dath-pdz/"
out_base_path = r"/kaggle/working/"
model_input_path = "/kaggle/input/distil_bert/keras/distil_bert_base_en_uncased/2"
preset = "distil_bert_base_en_uncased"

# Read the project dataset together with its train/test/validation splits
file_name = os.path.join(in_base_path, 'final_dataset_v1_afternb1.csv')
df_ds, train_essays, test_essays, val_essays = load_data(file_name)

# Read the competition essays, clean them, then strip stop words
df_competition = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
cleaned_text = df_competition['text'].apply(clean_text)
df_competition['processed_text'] = cleaned_text
df_competition['processed_text_swr'] = cleaned_text.apply(remove_stop_words)

# Plain Python list of the texts that will be scored for the submission
X_test_comp = df_competition['processed_text_swr'].tolist()

def train_and_evaluate_distilbert(train_essays, val_essays, model_file_path, n_epochs=20):
    """
    Load a previously saved DistilBERT classifier, or fine-tune and save a new one.

    Parameters:
    - train_essays: DataFrame with 'processed_text_swr' and 'label' columns used for training.
    - val_essays: DataFrame with the same columns, used as validation data during fitting.
    - model_file_path (str): Model path WITHOUT extension; ".keras" is appended.
    - n_epochs (int): Number of fine-tuning epochs. Default is 20.

    Returns:
    - The loaded or newly trained classifier.
    """
    model_file_with_extension = f"{model_file_path}.keras"

    # Reuse an existing model without first downloading and compiling the
    # preset, which would be thrown away immediately.
    if os.path.exists(model_file_with_extension):
        print("Loading pre-trained model...")
        return tf.keras.models.load_model(model_file_with_extension)

    print("Training a new model...")

    # Build the preprocessor and classifier from the module-level `preset`
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(preset, sequence_length=160)
    classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset, preprocessor=preprocessor, num_classes=2)

    # Compile the model; the classifier outputs logits for the two classes
    optimizer = Adam(learning_rate=2e-5, weight_decay=1e-5)
    classifier.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizer,
        metrics=["accuracy"]
    )

    # Prepare training and validation data
    X_train = train_essays['processed_text_swr'].tolist()
    y_train = train_essays['label'].values
    X_val = val_essays['processed_text_swr'].tolist()
    y_val = val_essays['label'].values

    # Train on raw text inputs; the attached preprocessor handles tokenization
    classifier.fit(
        x=X_train,
        y=y_train,
        batch_size=32,
        epochs=n_epochs,
        validation_data=(X_val, y_val)
    )

    # Save under the same path that the existence check above looks for
    classifier.save(model_file_with_extension)
    print("Model saved successfully!")

    return classifier

# Obtain the classifier: loaded from the working directory when a saved copy
# exists there, trained from scratch otherwise
distilbert_model_path = os.path.join(out_base_path, 'distilbert_model')
classifier = train_and_evaluate_distilbert(
    train_essays,
    val_essays,
    distilbert_model_path,
)

# Score the competition essays (raw text in, per-class logits out)
y_pred_logits = classifier.predict(X_test_comp)

# Softmax over the class axis turns the logits into probabilities
y_pred_probs = tf.nn.softmax(y_pred_logits, axis=-1).numpy()

# Column 1 holds the probability of the second class (AI-generated)
ai_generated_probs = y_pred_probs[:, 1]

print("Sample Predicted Probabilities:", ai_generated_probs[:10])

# Assemble the submission table and write it next to the other outputs
submission_columns = {'id': df_competition['id'], 'generated': ai_generated_probs}
output = pd.DataFrame(submission_columns)
submission_path = os.path.join(out_base_path, 'submission.csv')
output.to_csv(submission_path, index=False)

print("Submission was successfully saved!")

Label distribution before balancing:
label
0    45990
1    27473
Name: count, dtype: int64
Data is already balanced. Proceeding without SMOTE.
Training a new model...
Epoch 1/20


I0000 00:00:1731296562.510783      69 service.cc:145] XLA service 0x7f88a0003100 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731296562.510844      69 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1731296562.510849      69 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5



I0000 00:00:1731296617.048185      69 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1230/1231[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 447ms/step - accuracy: 0.9364 - loss: 0.1535





[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487ms/step - accuracy: 0.9365 - loss: 0.1534





[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m774s[0m 569ms/step - accuracy: 0.9365 - loss: 0.1534 - val_accuracy: 0.9816 - val_loss: 0.0474
Epoch 2/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m654s[0m 531ms/step - accuracy: 0.9932 - loss: 0.0233 - val_accuracy: 0.9931 - val_loss: 0.0228
Epoch 3/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m652s[0m 530ms/step - accuracy: 0.9962 - loss: 0.0126 - val_accuracy: 0.9926 - val_loss: 0.0233
Epoch 4/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 530ms/step - accuracy: 0.9978 - loss: 0.0078 - val_accuracy: 0.9932 - val_loss: 0.0235
Epoch 5/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 530ms/step - accuracy: 0.9988 - loss: 0.0044 - val_accuracy: 0.9942 - val_loss: 0.0228
Epoch 6/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m652s[0m 529ms/step - accuracy: 0.9988 - loss: 0.0028 - val_accuracy: 0.9872 - val_loss: 0.0438
Epo

In [4]:
# print(X_test_comp)
# print(y_pred_logits)

In [5]:
# # Initialize the number of parallel worker threads
# # max_workers = 1 

# # kaggle 
# in_base_path = r"/kaggle/input/dath-pdz/"
# out_base_path = r"/kaggle/working/"

# # in_base_path = r"E:\2_LEARNING_BKU\2_File_2\K22_HK241\CO3101_Do_an_Tri_tue_nhan_tao\Main\Dataset"
# # out_base_path = r"E:\2_LEARNING_BKU\2_File_2\K22_HK241\CO3101_Do_an_Tri_tue_nhan_tao\Output"   # root path of the output folder

# # Fix the file path by adding the missing backslash or using os.path.join
# file_name = os.path.join(in_base_path, 'final_dataset_v1_afternb1.csv')  # Correct file path

# # Start timing the run
# t_start_time = time.time()

# # Load and preprocess data
# df_ds, train_essays, test_essays, val_essays = load_data(file_name)

# # Check the size of each set
# print(f'Full set size: {len(df_ds)}')
# print(f'Training set size: {len(train_essays)}')
# print(f'Validation set size: {len(val_essays)}')
# print(f'Test set size: {len(test_essays)}')

# logger.log_message(f'Full set size: {len(df_ds)}')
# logger.log_message(f'Training set size: {len(train_essays)}')
# logger.log_message(f'Validation set size: {len(val_essays)}')
# logger.log_message(f'Test set size: {len(test_essays)}')

# # logger.log_message(f"Finished processing (total) in {t_hours} hours, {t_minutes} minutes, {t_seconds} seconds")     

# # ============================================================================================
# # Load the glove model
# word2vec_output_file = get_tmpfile(r"/kaggle/input/pdz-dath-ds/output_w2v.txt")
# # word2vec_output_file = get_tmpfile(r"E:\2_LEARNING_BKU\2_File_2\K22_HK241\CO3101_Do_an_Tri_tue_nhan_tao\Main\Dataset\output_w2v.txt")
# glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# # Prepare train and validation embeddings
# X_train = train_essays['processed_text_swr'].tolist()
# X_val = val_essays['processed_text_swr'].tolist()
# y_train = train_essays['label'].values
# y_val = val_essays['label'].values

# # Prepare test data
# X_test = test_essays['processed_text_swr'].tolist()
# y_test = test_essays['label'].values

# # Embed the train, validation and test texts with the GloVe vectors
# X_train_embeddings = np.array([sent2vec(sent, glove_model) for sent in X_train])
# X_val_embeddings = np.array([sent2vec(sent, glove_model) for sent in X_val])

# X_test_embeddings = np.array([sent2vec(sent, glove_model) for sent in X_test])

# ################################
# # Preprocess the newly read competition data
# # Apply the clean_text function to the 'text' column and create a new 'processed_text' column
# # Add this for reading data from competition

# df_competition = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
# test_essays_competition = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

# test_essays_competition['processed_text'] = test_essays_competition['text'].apply(clean_text)

# test_essays_competition['processed_text_swr'] = test_essays_competition['processed_text'].apply(remove_stop_words)

# X_test_comp = test_essays_competition['processed_text_swr'].tolist()
# # y_test = test_essays['label'].values

# X_test_comp_embeddings = np.array([sent2vec(sent, glove_model) for sent in X_test_comp])

# ################################


# # ============================================================================================

# # logger.log_message("============================================================================================")

# # Train the Logistic Regression model using the train and validation sets
# # logistic_model_file_path = os.path.join(out_base_path, 'logistic_regression_model.pkl')

# # def train_logistic_regression(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path, n_epochs=20, batch_size=32):
# # model = train_logistic_regression(X_train_embeddings, y_train, X_val_embeddings, y_val, logistic_model_file_path, out_base_path)



# # Test the Logistic Regression model on the test set
# # print("Testing Logistic Regression Model")
# # test_model(X_test_embeddings, y_test, logistic_model_file_path)

# # ============================================================================================

# # logger.log_message("============================================================================================")

# # # Train the XGBoost model using the train and validation sets
# # xgboost_model_file_path = os.path.join(out_base_path, 'xgboost_model.pkl')

# # # def train_xgboost(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path, n_estimators=100, learning_rate=0.01):
# # model = train_xgboost(X_train_embeddings, y_train, X_val_embeddings, y_val, xgboost_model_file_path, out_base_path)

# # # Test the XGBoost model on the test set
# # print("Testing XGBoost Model")
# # test_model(X_test_embeddings, y_test, xgboost_model_file_path)

# # ============================================================================================

# # logger.log_message("============================================================================================")

# # Train the Random Forest model using the train and validation sets
# # randomforest_model_file_path = os.path.join(out_base_path, 'randomforest_model.pkl')

# # def train_random_forest(X_train_embeddings, y_train, X_val_embeddings, y_val, model_file_path, out_base_path):
# # model = train_random_forest(X_train_embeddings, y_train, X_val_embeddings, y_val, randomforest_model_file_path, out_base_path)

# # # Test the Random Forest model on the test set
# # print("Testing Random Forest Model")
# # test_model(X_test_embeddings, y_test, randomforest_model_file_path)

# # ============================================================================================

# # logger.log_message("============================================================================================")

# # # Train the Linear SVC model using the train and validation sets
# # linear_svc_model_file_path = os.path.join(out_base_path, 'linearsvc_model.pkl')
# # model = train_linear_svc(X_train_embeddings, y_train, X_val_embeddings, y_val, linear_svc_model_file_path, out_base_path)

# # # Test the Linear SVC model on the test set
# # print("Testing linear SVC Model")
# # test_linearsvc(X_test_embeddings, y_test, linear_svc_model_file_path)

# # ============================================================================================

# # logger.log_message("============================================================================================")

# # # Train the Bidirectional LSTM RNN model using the train and validation sets
# # rnn_model_file_path = os.path.join(out_base_path, 'tf_lstm_rnn.keras')
# # rnn_plot_file_path = os.path.join(out_base_path, 'tf_lstm_rnn.png')

# # print("Training RNN Model")
# # model = train_rnn(X_train, y_train, X_val, y_val, rnn_model_file_path, rnn_plot_file_path, n_epochs=20, batch_size=32)

# # # Test the RNN model on the test set
# # print("Testing RNN Model")
# # test_rnn(X_test, y_test, rnn_model_file_path)

# # ============================================================================================

# distilbert_model_path = os.path.join(out_base_path, 'distilbert_model')  # Use folder path without file extension

# print("Training DistilBERT Model")
# # Example usage:
# distilbert_model_path = "./distilbert_model"
# model = train_distilbert(df_ds, distilbert_model_path, n_epochs=2)

# # Load the model and tokenizer from your custom path
# # model = tf.keras.models.load_model(distilbert_model_path)
# tokenizer = load_tokenizer(os.path.join(model_input_path, "tokenizer"))
# print("Model and tokenizer loaded successfully!")

# # Create the test dataset
# test_dataset = Dataset.from_pandas(X_test_comp)

# # Tokenize the test dataset
# tokenized_test = test_dataset.map(lambda examples: tokenizer(examples['processed_text_swr'], max_length=128, padding=True, truncation=True), batched=True)


# # Get predictions on the test set
# y_pred_logits = classifier.predict(X_test)
# y_pred_probs = tf.sigmoid(y_pred_logits).numpy().flatten()

# # Print the first 10 predicted probabilities
# print("Sample Predicted Probabilities:", y_pred_probs[:10])

# # Create the DataFrame and save to CSV
# output = pd.DataFrame({'id': labels_test, 'generated': preds_test})
# output.to_csv('submission.csv', index=False)

# print("Submission was successfully saved!")

# # # Test the DistilBERT model on the test set
# # print("Testing DistilBERT Model")
# # test_distilbert(test_essays, distilbert_model_path)

# # ============================================================================================

# # Stop timing the run
# t_end_time = time.time()
# t_processing_time = t_end_time - t_start_time

# # Break the elapsed seconds down into hours, minutes and seconds
# t_hours = int(t_processing_time // 3600)  # Whole hours
# t_minutes = int((t_processing_time % 3600) // 60)  # Remaining minutes
# t_seconds = int(t_processing_time % 60)  # Remaining seconds

# logger.log_message(f"Finished processing (total) in {t_hours} hours, {t_minutes} minutes, {t_seconds} seconds")       


In [6]:
# test_essays_competition

In [7]:
# X_test_comp_embeddings

In [8]:
# !rm -rf /kaggle/working/*

In [9]:
# !zip -r file.zip /kaggle/working

# End