In [1]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

### This notebook is based upon the work presented in this amazing notebook by ERTUĞRUL DEMIR
### LGBM Parameters taken from : https://www.kaggle.com/code/siddhvr/llm-daigt-sub
### Inspired by VLADIMIR  https://www.kaggle.com/code/verracodeguacas/sentencepiece-constructions

In [2]:
# Competition files: the official training essays (kept as `val`), the test
# essays to score, and the submission template.
val = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

# External DAIGT v2 corpus, used as the actual training data for the models.
train = pd.read_csv('/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv')

In [3]:
# Keep one row per distinct essay text, then display the resulting (rows, columns).
train = train.drop_duplicates(subset='text')
train.shape

(44868, 5)

In [4]:
# Same de-duplication on the validation frame: one row per distinct essay text.
val = val.drop_duplicates(subset='text')
val.shape

(1378, 4)

In [5]:
# Align the validation frame's target column name with the `label` column of `train`.
val = val.rename(columns={'generated': 'label'})

In [6]:
# Show the first five validation rows.
val.head(5)

Unnamed: 0,id,prompt_id,text,label
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [7]:
# Class balance of the training labels.
train['label'].value_counts()

label
0    27371
1    17497
Name: count, dtype: int64

In [8]:
# Class balance of the validation labels.
val['label'].value_counts()

label
0    1375
1       3
Name: count, dtype: int64

In [9]:
# Raw test essay texts as an array.
test['text'].values

array(['Aaa bbb ccc.', 'Bbb ccc ddd.', 'CCC ddd eee.'], dtype=object)

In [10]:
# Tokenizer settings read by the tokenizer-building cells further down.
LOWERCASE = False     # when True, a Lowercase normalizer is added to the pipeline
VOCAB_SIZE = 30_522   # vocabulary size handed to trainers.BpeTrainer

In [11]:
from tokenizers import SentencePieceBPETokenizer

# --- SentencePiece-style BPE tokenizer, trained on the test essays ---
raw_tokenizer = SentencePieceBPETokenizer()

# NFC normalisation is always applied; lowercasing only when LOWERCASE is set.
# The conditional has to be parenthesised: without the parentheses the
# expression parses as `([NFC] + [Lowercase]) if LOWERCASE else []`, which
# silently drops NFC whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special-token names. They are NOT passed to train_from_iterator in this cell,
# so training uses the library defaults; the names are only registered on the
# PreTrainedTokenizerFast wrapper created below.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Hugging Face dataset holding only the test essay texts.
dataset = Dataset.from_pandas(test[['text']])


def train_corp_iter():
    """Yield the texts of `dataset` in consecutive batches of up to 1000."""
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]


# REMEMBER: the tokenizer is trained on the test set, not on the training data.
raw_tokenizer.train_from_iterator(train_corp_iter())

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize the test essays, then the training essays, with the new tokenizer.
tokenized_texts_test = [
    tokenizer.tokenize(text) for text in tqdm(test['text'].tolist())
]
tokenized_texts_train = [
    tokenizer.tokenize(text) for text in tqdm(train['text'].tolist())
]






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [12]:
# Peek at the token list produced for the second test essay (index 1).
tokenized_texts_test[1]

['Ġ', 'B', 'bb', 'Ġccc', 'Ġddd', '.']

In [13]:
def dummy(text):
    """
    Identity pass-through: hand back the argument unchanged.

    Supplied to TfidfVectorizer as both `tokenizer` and `preprocessor`, because
    the documents given to the vectorizer are already lists of tokens.
    """
    return text

In [14]:
# Pass 1: learn the 3- to 5-gram vocabulary from the tokenized test essays.
# `dummy` stands in for both tokenizer and preprocessor because every document
# is already a list of tokens.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
)
vectorizer.fit(tokenized_texts_test)

# n-gram -> column index mapping learned from the test essays.
vocab = vectorizer.vocabulary_
print(vocab)

# Pass 2: refit on the training essays, restricted to the test-derived
# vocabulary, so only n-grams that occur in the test set become features.
# NOTE(review): strip_accents is set only on this second vectorizer; with a
# custom preprocessor it likely has no effect -- confirm against the
# scikit-learn documentation.
vectorizer = TfidfVectorizer(
    vocabulary=vocab,
    analyzer='word',
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is not needed once both matrices exist.
del vectorizer
gc.collect()

{'Ġ A a': 24, 'A a a': 0, 'a a Ġ': 12, 'a Ġ bb': 15, 'Ġ bb b': 33, 'bb b Ġccc': 19, 'b Ġccc .': 18, 'Ġ A a a': 25, 'A a a Ġ': 1, 'a a Ġ bb': 13, 'a Ġ bb b': 16, 'Ġ bb b Ġccc': 34, 'bb b Ġccc .': 20, 'Ġ A a a Ġ': 26, 'A a a Ġ bb': 2, 'a a Ġ bb b': 14, 'a Ġ bb b Ġccc': 17, 'Ġ bb b Ġccc .': 35, 'Ġ B bb': 27, 'B bb Ġccc': 3, 'bb Ġccc Ġddd': 21, 'Ġccc Ġddd .': 38, 'Ġ B bb Ġccc': 28, 'B bb Ġccc Ġddd': 4, 'bb Ġccc Ġddd .': 22, 'Ġ B bb Ġccc Ġddd': 29, 'B bb Ġccc Ġddd .': 5, 'Ġ CC C': 30, 'CC C Ġddd': 9, 'C Ġddd Ġ': 6, 'Ġddd Ġ ee': 39, 'Ġ ee e': 36, 'ee e .': 23, 'Ġ CC C Ġddd': 31, 'CC C Ġddd Ġ': 10, 'C Ġddd Ġ ee': 7, 'Ġddd Ġ ee e': 40, 'Ġ ee e .': 37, 'Ġ CC C Ġddd Ġ': 32, 'CC C Ġddd Ġ ee': 11, 'C Ġddd Ġ ee e': 8, 'Ġddd Ġ ee e .': 41}


23

In [15]:
# Training targets as an array; validation targets as a list of Python floats.
y_train = train['label'].to_numpy()
y_val = list(map(float, val['label'].to_numpy()))

In [16]:
# Dimensions of the training TF-IDF matrix: (number of training texts, vocabulary size).
tf_train.shape

(44868, 42)

Just some sanity checks...

## LGBM Parameters taken from 
https://www.kaggle.com/code/siddhvr/llm-daigt-sub

In [17]:
# Candidate weights for the MultinomialNB member of the ensemble.
w1_range = [0.1]

# Intended to hold a score per weight; nothing is written to it in this cell
# because no validation scoring is performed here.
accuracy_map = {}

for w in w1_range:
    clf = MultinomialNB(alpha=0.02)
    sgd_model = SGDClassifier(loss="modified_huber", max_iter=8000, tol=1e-4)
    p6 = {
        'n_iter': 1500,
        'verbose': -1,
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05073909898961407,
        'colsample_bytree': 0.726023996436955,
        'colsample_bynode': 0.5803681307354022,
        'lambda_l1': 8.562963348932286,
        'lambda_l2': 4.893256185259296,
        'min_data_in_leaf': 115,
        'max_depth': 23,
        'max_bin': 898,
    }
    lgb = LGBMClassifier(**p6)

    # Soft-voting ensemble: weight `w` for naive Bayes, the remainder split
    # evenly between the SGD and LightGBM members.
    ensemble = VotingClassifier(
        estimators=[('mnb', clf), ('sgd', sgd_model), ('lgb', lgb)],
        weights=[w, (1 - w) / 2, (1 - w) / 2],
        voting='soft',
        n_jobs=-1,
    )

    # Train on the SentencePiece TF-IDF features and keep column 1 of the
    # predicted probabilities for the test essays.
    ensemble.fit(tf_train, y_train)
    final_preds_sentencePiece = ensemble.predict_proba(tf_test)[:, 1]

    # Release memory held by the fitted members before the next iteration.
    gc.collect()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



## Byte-Pair Encoding Tokenizer

In [18]:
# --- Byte-level BPE tokenizer, trained on the test essays ---
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# NFC normalisation is always applied; lowercasing only when LOWERCASE is set.
# The conditional has to be parenthesised: without the parentheses the
# expression parses as `([NFC] + [Lowercase]) if LOWERCASE else []`, which
# silently drops NFC whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens and the trainer that reserves them in the vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Hugging Face dataset holding only the test essay texts.
dataset = Dataset.from_pandas(test[['text']])


def train_corp_iter():
    """Yield the texts of `dataset` in consecutive batches of up to 1000."""
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]


# The tokenizer is trained on the test set, not on the training data.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)







In [19]:
# Re-tokenize both corpora with the byte-pair tokenizer built in the previous
# cell; tqdm shows one progress bar per corpus.
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [20]:
# Pass 1: learn the 3- to 5-gram vocabulary from the tokenized test essays.
# `dummy` stands in for both tokenizer and preprocessor because every document
# is already a list of tokens.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)
vectorizer.fit(tokenized_texts_test)

# n-gram -> column index mapping learned from the test essays.
vocab = vectorizer.vocabulary_
print(vocab)

# Pass 2: refit on the training essays, restricted to the test-derived
# vocabulary, so only n-grams that occur in the test set become features.
vectorizer = TfidfVectorizer(
    vocabulary=vocab,
    analyzer='word',
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is not needed once both matrices exist.
del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


23

In [21]:
# Ensemble members for the byte-pair features.
clf = MultinomialNB(alpha=0.02)
sgd_model = SGDClassifier(loss="modified_huber", max_iter=8000, tol=1e-4)
p6 = {
    'n_iter': 1500,
    'verbose': -1,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05073909898961407,
    'colsample_bytree': 0.726023996436955,
    'colsample_bynode': 0.5803681307354022,
    'lambda_l1': 8.562963348932286,
    'lambda_l2': 4.893256185259296,
    'min_data_in_leaf': 115,
    'max_depth': 23,
    'max_bin': 898,
}
lgb = LGBMClassifier(**p6)

# Soft-voting ensemble with fixed weights: 0.1 naive Bayes, 0.45 SGD, 0.45 LightGBM.
ensemble_bytePair = VotingClassifier(
    estimators=[('mnb', clf), ('sgd', sgd_model), ('lgb', lgb)],
    weights=[0.1, 0.45, 0.45],
    voting='soft',
    n_jobs=-1,
)
ensemble_bytePair.fit(tf_train, y_train)

gc.collect()





55

In [22]:
# Score the test TF-IDF matrix with the byte-pair ensemble and keep column 1
# of the predicted probabilities.
final_preds_bytePair = ensemble_bytePair.predict_proba(tf_test)[:,1]

In [23]:
# Equal-weight blend of the SentencePiece and byte-pair ensemble predictions.
final_preds = 0.5 * (final_preds_sentencePiece + final_preds_bytePair)

In [24]:
# Put the blended predictions into the `generated` column, save the submission
# file without the index, and display the frame.
sub = sub.assign(generated=final_preds)
sub.to_csv('submission.csv', index=False)
sub

Unnamed: 0,id,generated
0,0000aaaa,0.39366
1,1111bbbb,0.420107
2,2222cccc,0.420111
