In [1]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [2]:
# Competition inputs: the essays to score, the submission template, and the
# competition's own train_essays.csv.
# NOTE(review): `org_train` is not referenced again in the visible cells.
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

# External DAIGT v2 dataset; this is the frame the model is trained on below.
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

In [3]:
# Keep one row per distinct essay text, then renumber the index from 0 so that
# positional and label-based indexing agree again.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [4]:
# Peek at the raw test essay texts (displayed as a NumPy object array).
test.text.values

array(['Aaa bbb ccc.', 'Bbb ccc ddd.', 'CCC ddd eee.'], dtype=object)

In [5]:
# Settings for the BPE tokenizer built in the next cell.
LOWERCASE = False  # when True, a Lowercase normalizer is added to the tokenizer
VOCAB_SIZE = 30522  # vocabulary size handed to BpeTrainer

In [6]:
# Build a Byte-Pair Encoding tokenizer from scratch; unknown pieces map to "[UNK]".
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalization: always apply Unicode NFC, and additionally lowercase when
# LOWERCASE is set. The conditional has to be parenthesized: a conditional
# expression binds more loosely than `+`, so the unparenthesized form
# `[NFC()] + [Lowercase()] if LOWERCASE else []` evaluates to `[]` (no NFC at
# all) whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
# Byte-level pre-tokenization, so any input string can be represented.
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens reserved in the vocabulary, and the trainer that learns the merges.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Hugging Face dataset holding only the `text` column of the test essays;
# this is the corpus the tokenizer is trained on.
dataset = Dataset.from_pandas(test[['text']])

def train_corp_iter():
    """
    Yield the ``text`` column of the module-level ``dataset`` in consecutive
    batches of at most 1000 rows, in order.
    """
    batch_rows = 1000
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        stop = start + batch_rows
        yield dataset[start:stop]["text"]
        start = stop

# Learn the BPE merges from the batches produced by train_corp_iter().
# REMEMBER: the corpus here is the *test* set, not the training data.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface,
# declaring which vocabulary entries play the special-token roles.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    mask_token="[MASK]",
    sep_token="[SEP]",
    cls_token="[CLS]",
    pad_token="[PAD]",
    unk_token="[UNK]",
)

# Token sequences for every test essay, produced by the freshly trained tokenizer.
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

# Token sequences for every training essay, using the same tokenizer.
tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [7]:
# Inspect the BPE tokens of the second test essay. In the recorded output the
# 'Ġ' prefix appears on tokens that begin a word (byte-level marker for a space).
tokenized_texts_test[1]

['ĠB', 'b', 'b', 'Ġc', 'cc', 'Ġd', 'dd', '.']

In [8]:
def dummy(text):
    """
    Identity pass-through, handed to TfidfVectorizer as both ``tokenizer`` and
    ``preprocessor``.

    The documents are already lists of tokens, so the argument is returned
    unchanged (the very same object, not a copy).
    """
    return text

In [9]:
# Step 1: fit a TfidfVectorizer on the *test* tokens only to discover which
# 3- to 5-token n-grams occur there, and keep that n-gram -> column mapping.
vocab = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    strip_accents='unicode',
).fit(tokenized_texts_test).vocabulary_

print(vocab)

# Step 2: build the real vectorizer restricted to that test-derived vocabulary.
# Its IDF weights are fitted on the training tokens; the test tokens are then
# transformed with the same weights and column layout.
vectorizer = TfidfVectorizer(
    vocabulary=vocab,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    strip_accents='unicode',
)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is no longer needed once both matrices exist; free its memory.
del vectorizer
gc.collect()

{'ĠA aa Ġb': 21, 'aa Ġb b': 6, 'Ġb b b': 30, 'b b Ġc': 9, 'b Ġc cc': 13, 'Ġc cc .': 33, 'ĠA aa Ġb b': 22, 'aa Ġb b b': 7, 'Ġb b b Ġc': 31, 'b b Ġc cc': 10, 'b Ġc cc .': 14, 'ĠA aa Ġb b b': 23, 'aa Ġb b b Ġc': 8, 'Ġb b b Ġc cc': 32, 'b b Ġc cc .': 11, 'ĠB b b': 24, 'Ġc cc Ġd': 34, 'cc Ġd dd': 17, 'Ġd dd .': 37, 'ĠB b b Ġc': 25, 'b Ġc cc Ġd': 15, 'Ġc cc Ġd dd': 35, 'cc Ġd dd .': 18, 'ĠB b b Ġc cc': 26, 'b b Ġc cc Ġd': 12, 'b Ġc cc Ġd dd': 16, 'Ġc cc Ġd dd .': 36, 'ĠC C C': 27, 'C C Ġd': 0, 'C Ġd dd': 3, 'Ġd dd Ġe': 38, 'dd Ġe ee': 19, 'Ġe ee .': 41, 'ĠC C C Ġd': 28, 'C C Ġd dd': 1, 'C Ġd dd Ġe': 4, 'Ġd dd Ġe ee': 39, 'dd Ġe ee .': 20, 'ĠC C C Ġd dd': 29, 'C C Ġd dd Ġe': 2, 'C Ġd dd Ġe ee': 5, 'Ġd dd Ġe ee .': 40}


23

In [10]:
# Target vector taken from the `label` column of the de-duplicated `train`
# frame, in the same row order used to build tokenized_texts_train / tf_train.
y_train = train['label'].values

Sanity checks: inspect the TF-IDF matrices and their shapes.

In [11]:
# Show the sparse training matrix summary. In the recorded run it has 0 stored
# elements: no training essay contains any n-gram from the vocabulary that was
# derived from the three placeholder test essays.
tf_train

<44868x42 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [12]:
# (number of training essays, number of n-grams in the test-derived vocabulary)
tf_train.shape

(44868, 42)

In [13]:
# Must have the same number of columns as tf_train, since both come from the same vectorizer.
tf_test.shape

(3, 42)

### A basic classifier pipeline with minimal tweaks from public notebooks.

In [14]:
# Multinomial Naive Bayes on the TF-IDF features, with a small additive-smoothing alpha.
bayes_model = MultinomialNB(alpha=0.02)

# Linear classifier trained with SGD. The "modified_huber" loss is one of the
# losses for which SGDClassifier provides predict_proba, which soft voting needs.
# SGDClassifier shuffles the training data between epochs, so random_state is
# fixed here; without it every run yields slightly different predictions.
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", random_state=42)

# Soft-voting ensemble: weighted average of the predicted class probabilities
# (0.7 for SGD, 0.3 for Naive Bayes); n_jobs=-1 fits the estimators in parallel.
ensemble = VotingClassifier(estimators=[('sgd', sgd_model), ('nb', bayes_model)],
                            weights=[0.7, 0.3], voting='soft', n_jobs=-1)
ensemble.fit(tf_train, y_train)

# Reclaim memory left over from fitting.
gc.collect()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

31

In [15]:
# Ensemble probabilities for the test essays; column 1 is the second entry of
# ensemble.classes_ (presumably label 1 = AI-generated — confirm against the labels).
final_preds = ensemble.predict_proba(tf_test)[:,1]

In [16]:
# Put the predictions into the sample-submission frame and save it as CSV.
# NOTE(review): final_preds is assigned by position, so this assumes the rows
# of `sub` are in the same order as the rows of `test` — confirm.
sub['generated'] = final_preds
sub.to_csv('submission.csv', index=False)
sub  # display the final submission table

Unnamed: 0,id,generated
0,0000aaaa,0.390007
1,1111bbbb,0.390007
2,2222cccc,0.390007
