In [None]:
import sys
import os
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    SentencePieceBPETokenizer
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [None]:
# When KAGGLE_IS_COMPETITION_RERUN is unset or empty, write the sample
# submission out unchanged as submission.csv.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
    sub.to_csv('submission.csv', index=False)


In [None]:
# Files from the competition dataset: submission template, test essays, train essays.
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
org_test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

# External datasets attached to the notebook.
daigt_train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")
aug_train = pd.read_csv('/kaggle/input/augmented-data-for-llm-detect-ai-generated-text/final_train.csv')
aug_test = pd.read_csv('/kaggle/input/augmented-data-for-llm-detect-ai-generated-text/final_test.csv')

# The pipeline below trains on the DAIGT frame and predicts on the competition test essays.
train, test = daigt_train, org_test

In [None]:
# Keep the first row for each distinct essay text, renumber the rows from 0,
# then pull the label column out as an array for the classifiers.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)
y_train = train['label'].values

In [None]:
# Whether the tokenizer's normalizer should lowercase text.
LOWERCASE = False
# Target vocabulary size.
# NOTE(review): not referenced anywhere in the visible notebook cells — confirm it is still needed.
VOCAB_SIZE = 30_522

In [None]:
# Create the (untrained) SentencePiece-style Byte-Pair Encoding tokenizer.
raw_tokenizer = SentencePieceBPETokenizer()

# Normalization: always apply NFC, and add lowercasing only when LOWERCASE is set.
# The conditional must be parenthesised. Without the parentheses
# `[NFC] + [Lowercase] if LOWERCASE else []` parses as
# `([NFC] + [Lowercase]) if LOWERCASE else []`, which yields an empty sequence
# (no NFC at all) whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special-token strings. The same strings are registered on the
# PreTrainedTokenizerFast wrapper further down; this list itself is not passed
# to the training call in this notebook.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Wrap the test essays' text column in a Hugging Face Dataset so it can be
# sliced in batches while training the tokenizer.
dataset = Dataset.from_pandas(test[['text']])

def train_corp_iter():
    """
    Yield the "text" column of the module-level `dataset` in consecutive
    slices of at most 300 rows each.
    """
    chunk_size = 300
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        yield dataset[start : start + chunk_size]["text"]
        start += chunk_size

# Train the BPE tokenizer. Note that `dataset` wraps the *test* essays, so the
# subword vocabulary is learned from the test set on purpose.
raw_tokenizer.train_from_iterator(train_corp_iter())

# Expose the trained tokenizer through the transformers fast-tokenizer API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# One list of subword tokens per test essay, in dataframe order.
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

# Same for the training essays, using the tokenizer learned on the test set.
tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]

In [None]:
def dummy(text):
    """
    Identity pass-through: hand back the argument untouched.

    Supplied to TfidfVectorizer as both `tokenizer` and `preprocessor`, because
    the documents given to the vectorizer are already lists of tokens.
    """
    return text

In [None]:
# TF-IDF features whose n-gram vocabulary is learned from the first argument
def fitting_vectorizer_on_train(a, b):
    """
    Build TF-IDF matrices for `a` and `b` over the 3- to 5-gram vocabulary of `a`.

    Both arguments are collections of already-tokenized documents (lists of
    tokens); the commented-out call site passes the train token lists as `a`
    and the test token lists as `b`.

    Steps:
      1. Fit a vectorizer on `a` only to collect its n-gram vocabulary.
      2. Build a second vectorizer restricted to that vocabulary, fit its IDF
         weights on `b`, and transform `a` with those weights.

    NOTE(review): the IDF statistics come from `b`, not from `a` — confirm
    that this mirror image of `fitting_vectorizer_on_test` is intended.

    Returns:
        (tf_train, tf_test): the matrix for `a` and the matrix for `b`.
    """
    shared_kwargs = dict(
        ngram_range=(3, 5),
        lowercase=False,
        sublinear_tf=True,
        analyzer='word',
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None,
    )

    # Pass 1: vocabulary of `a`.
    learned_vocab = TfidfVectorizer(**shared_kwargs).fit(a).vocabulary_

    # Pass 2: fixed vocabulary, IDF fit on `b`, then applied to `a`.
    weighted = TfidfVectorizer(vocabulary=learned_vocab, **shared_kwargs)
    tf_test = weighted.fit_transform(b)
    tf_train = weighted.transform(a)

    # Release the fitted vectorizer before returning.
    del weighted
    gc.collect()
    return tf_train, tf_test

In [None]:
# TF-IDF features whose n-gram vocabulary is learned from the second argument
def fitting_vectorizer_on_test(a, b):
    """
    Build TF-IDF matrices for `a` and `b` over the 3- to 5-gram vocabulary of `b`.

    Both arguments are collections of already-tokenized documents (lists of
    tokens); the call site passes the train token lists as `a` and the test
    token lists as `b`.

    Steps:
      1. Fit a vectorizer on `b` only to collect its n-gram vocabulary, so the
         feature space is limited to n-grams that occur in `b`.
      2. Build a second vectorizer restricted to that vocabulary, fit its IDF
         weights on `a`, and transform `b` with those weights.

    Returns:
        (tf_train, tf_test): the matrix for `a` and the matrix for `b`.
    """
    shared_kwargs = dict(
        ngram_range=(3, 5),
        lowercase=False,
        sublinear_tf=True,
        analyzer='word',
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None,
    )

    # Pass 1: vocabulary of `b`.
    learned_vocab = TfidfVectorizer(**shared_kwargs).fit(b).vocabulary_

    # Pass 2: fixed vocabulary, IDF fit on `a`, then applied to `b`.
    weighted = TfidfVectorizer(vocabulary=learned_vocab, **shared_kwargs)
    tf_train = weighted.fit_transform(a)
    tf_test = weighted.transform(b)

    # Release the fitted vectorizer before returning.
    del weighted
    gc.collect()
    return tf_train, tf_test

In [None]:
def calculate_voting(tf_train, tf_test, y_train, random_state=None):
    """
    Fit a soft-voting ensemble on the training features and score the test features.

    The ensemble combines three classifiers with equal weight:
    MultinomialNB, SGDClassifier (modified Huber loss) and CatBoostClassifier.

    Args:
        tf_train: feature matrix used for fitting (the call site passes the
            TF-IDF matrix of the training essays).
        tf_test: feature matrix to score, with the same columns as `tf_train`.
        y_train: labels aligned with the rows of `tf_train`.
        random_state: optional integer seed forwarded to the SGD and CatBoost
            models so a run can be reproduced. The default `None` leaves both
            libraries on their own defaults, exactly as before.

    Returns:
        1-D array holding column 1 of the ensemble's `predict_proba` output for
        each row of `tf_test` (presumably the probability of label 1 — confirm
        against the label encoding of `y_train`).
    """
    mnb = MultinomialNB(alpha=0.02)

    # modified_huber is used so the SGD model exposes predict_proba, which
    # soft voting requires.
    sgd_model = SGDClassifier(
        max_iter=6000,
        tol=1e-4,
        loss="modified_huber",
        random_state=random_state,
    )

    cat = CatBoostClassifier(
        iterations=2000,
        verbose=0,
        l2_leaf_reg=6.6591278779517808,
        learning_rate=0.005689066836106983,
        allow_const_label=True,
        subsample=0.4,
        loss_function='CrossEntropy',
        random_seed=random_state,
    )

    # Relative weights, one per estimator in the order listed below; they are
    # normalised to sum to 1 before being handed to the ensemble.
    weights = [50, 50, 50]

    ensemble = VotingClassifier(
        estimators=[
            ('mnb', mnb),
            ('sgd', sgd_model),
            ('cat', cat),
        ],
        weights=[w / sum(weights) for w in weights],
        voting='soft',
        n_jobs=-1,
    )

    ensemble.fit(tf_train, y_train)
    final_preds = ensemble.predict_proba(tf_test)[:, 1]

    gc.collect()
    return final_preds

In [None]:
print('fitting!')
# The n-gram vocabulary is taken from the test token lists here;
# fitting_vectorizer_on_train(tokenized_texts_train, tokenized_texts_test)
# is the alternative that takes it from the train token lists instead.
tf_train, tf_test = fitting_vectorizer_on_test(
    tokenized_texts_train,
    tokenized_texts_test,
)

print('voting!')
final_preds_submission = calculate_voting(tf_train, tf_test, y_train)

# Assign the result so the cell does not display gc.collect()'s return value.
_ = gc.collect()

In [None]:
# Put the ensemble scores into the `generated` column of the submission frame,
# write it to submission.csv, and display the frame as the cell output.
final_preds = sub['generated'] = final_preds_submission
sub.to_csv('submission.csv', index=False)
sub