In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv
/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv


## Detection of AI generated Text using Sentence peice construction

### Inspiration and Credits 🙌
This notebook is inspired by the work of s3nh, available at [this Kaggle project]( https://www.kaggle.com/code/hubert101/0-960-phrases-are-keys). I extend my gratitude to s3nh for sharing their insights and code.

---

### 🚀 How the Notebook Works:

- **Data Loading:** Initial cell loads essential libraries and imports data from various CSV files.
  
- **Text Tokenization:** Utilizes Byte-Pair Encoding (BPE) for tokenization, creating a robust representation of text.

- **TF-IDF Vectorization:** Implements TF-IDF vectorization on the tokenized texts, capturing important features.

- **Model Training:** Constructs an ensemble of machine learning models (Multinomial Naive Bayes, SGD, LightGBM, CatBoost) to achieve optimal predictions.

- **Submission Generation:** Generates predictions and outputs a submission file ('submission.csv').

### All the best!

In [2]:
import sys
import gc

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier



Read the data into train and test sets

In [3]:
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

Drop out duplicates from the trainset as well reset the indices

In [4]:
train = train.drop_duplicates(subset=['text'])
train.reset_index(drop=True, inplace=True)

Make the text-processing case sensitive and define the VOCAB_SIZE as 30522

In [5]:
LOWERCASE = False
VOCAB_SIZE = 30522

## Byte Pair Encoding Tokenizer Training

We will use the Byte-Pair Encoding tokenizer to tokenize the train and test data

In [6]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter(): 
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []

for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))






  if _pandas_api.is_sparse(col):


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

## TF-IDF Vectorization

We will use TF-IDF Vectorization technique to extract important features from the tokenized train and test data. 

We first configure the vectorizer on the test data to obtain a vocabulary and then we reconfigure the vectorizer using the vobcabulary to generate. Tf-Idf features from the tokens of the train data.

NOTE: The dummy function is defined as a placeholder for the TF-Idf Vectorizer

In [7]:
def dummy(text):
    return text

In [8]:
vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, analyzer = 'word',
    tokenizer = dummy,
    preprocessor = dummy,
    token_pattern = None, strip_accents='unicode')

vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

print(vocab)

vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, vocabulary=vocab,
                            analyzer = 'word',
                            tokenizer = dummy,
                            preprocessor = dummy,
                            token_pattern = None, strip_accents='unicode'
                            )

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


53

Extract the train labels

In [9]:
y_train = train['label'].values

## Model Training and Prediction

This cell involves training a combination of machine learning models and generating predictions. 

1. **Checking Test Data Size:**
   ```python
   if len(test.text.values) <= 5:
       sub.to_csv('submission.csv', index=False)
   ```
   - Checks if the size of the test data is less than or equal to 5. If true, it writes the existing submission (`sub`) to a CSV file named 'submission.csv' and exits the cell.
   - This condition may be added as a safeguard or for special handling when dealing with very small datasets.

2. **Machine Learning Models Configuration and Training:**
   ```python
   else:
       clf = MultinomialNB(alpha=0.02)
       sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 
       p6 = {'n_iter': 2500, 'verbose': -1, 'objective': 'cross_entropy', 'metric': 'auc',
             'learning_rate': 0.05073909898961407, 'colsample_bytree': 0.726023996436955,
             'colsample_bynode': 0.5803681307354022, 'lambda_l1': 8.562963348932286, 
             'lambda_l2': 4.893256185259296, 'min_data_in_leaf': 115, 'max_depth': 23, 'max_bin': 898}
       lgb = LGBMClassifier(**p6)
       cat = CatBoostClassifier(iterations=2000,
                                verbose=0,
                                l2_leaf_reg=6.6591278779517808,
                                learning_rate=0.005689066836106983,
                                subsample=0.4,
                                allow_const_label=True, loss_function='CrossEntropy')
   ```
   - Configures three machine learning models: Multinomial Naive Bayes (`clf`), Stochastic Gradient Descent (`sgd_model`), LightGBM (`lgb`), and CatBoost (`cat`).
   - The parameters for LightGBM (`p6`) are specified separately.

3. **Configuring Ensemble Model:**
   ```python
       weights = [0.07, 0.31, 0.31, 0.31]
       ensemble = VotingClassifier(estimators=[('mnb', clf),
                                               ('sgd', sgd_model),
                                               ('lgb', lgb), 
                                               ('cat', cat)],
                                   weights=weights, voting='soft', n_jobs=-1)
   ```
   - Creates an ensemble model (`ensemble`) using a soft voting strategy, where each model contributes its probability estimates.
   - The ensemble consists of the Multinomial Naive Bayes, Stochastic Gradient Descent, LightGBM, and CatBoost classifiers, with specified weights.

4. **Training Ensemble Model:**
   ```python
       ensemble.fit(tf_train, y_train)
   ```
   - Fits the ensemble model on the TF-IDF transformed train data (`tf_train`) with the corresponding target labels (`y_train`).

5. **Generating Predictions:**
   ```python
       gc.collect()
       final_preds = ensemble.predict_proba(tf_test)[:, 1]
       sub['generated'] = final_preds
   ```
   - Performs garbage collection to free up memory.
   - Generates final predictions using the trained ensemble model on the TF-IDF transformed test data (`tf_test`).
   - Adds the generated predictions to the 'generated' column in the `sub` DataFrame.

6. **Saving Submission File:**
   ```python
       sub.to_csv('submission.csv', index=False)
       sub
   ```
   - Writes the updated `sub` DataFrame, including the generated predictions, to a CSV file named 'submission.csv'.
   - Prints the `sub` DataFrame, possibly for inspection.



In [10]:
if len(test.text.values) <= 5:
    sub.to_csv('submission.csv', index=False)
else:
    clf = MultinomialNB(alpha=0.02)
#     clf2 = MultinomialNB(alpha=0.01)
    sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 
    p6={'n_iter': 2500,'verbose': -1,'objective': 'cross_entropy','metric': 'auc',
        'learning_rate': 0.05081909898961407, 'colsample_bytree': 0.726023996436955,
        'colsample_bynode': 0.5803681307354022, 'lambda_l1': 8.562963348932286, 
        'lambda_l2': 4.893256185259296, 'min_data_in_leaf': 115, 'max_depth': 23, 'max_bin': 898}
    lgb=LGBMClassifier(**p6)
    cat=CatBoostClassifier(iterations=2000,
                           verbose=0,
                           l2_leaf_reg=6.6591278779517808,
                           learning_rate=0.005599066836106983,
                           subsample = 0.4,
                           allow_const_label=True,loss_function = 'CrossEntropy')
    weights = [0.07,0.31,0.31,0.31]
 
    ensemble = VotingClassifier(estimators=[('mnb',clf),
                                            ('sgd', sgd_model),
                                            ('lgb',lgb), 
                                            ('cat', cat)
                                           ],
                                weights=weights, voting='soft', n_jobs=-1)
    ensemble.fit(tf_train, y_train)
    gc.collect()
    final_preds = ensemble.predict_proba(tf_test)[:,1]
    sub['generated'] = final_preds
    sub.to_csv('submission.csv', index=False)
    sub

In [11]:
sub

Unnamed: 0,id,generated
0,0000aaaa,0.1
1,1111bbbb,0.9
2,2222cccc,0.4
