In [None]:
import pandas as pd
import json


In [None]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [None]:
# External training corpus, used in place of the competition's own train_essays.csv.
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")

# Competition files: the submission template and the essays to score.
sub = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv")
test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")


We load three data files for this competition: the test essays, the sample submission, and a training set. Since the training data given in the competition was insufficient, we used a training dataset collected by other users.

In [None]:
# Keep only the first occurrence of each essay text, then renumber the rows from 0.
train = train.drop_duplicates(subset=["text"]).reset_index(drop=True)

<div style="color: black;
            background-color: #800000; /* Doğru bordo tonu */
            border-radius: 10px; /* Köşeleri yuvarlamak için */
            padding: 20px; /* Kenar boşluğu eklemek için */
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.3); /* Hafif gölgelendirme eklemek için */">
    <p style="color: white; /* Beyaz yazı için */
              font-family: Verdana, sans-serif;
              letter-spacing: 0.5px;
              text-align: center; /* Metni ortalamak için */
              margin: 0;">
Kod Blok No:1 / Code Block No.1    </p>
</div>


In [None]:
# Tokenizer settings.
VOCAB_SIZE = 14_000_000  # vocab_size handed to the BPE trainer
LOWERCASE = False  # when True, a Lowercase normalizer is added to the tokenizer

These two variables are configuration parameters, possibly specified for use in a natural language processing (NLP) or text mining application.

* LOWERCASE = False: This variable usually determines whether text data should be converted to lower case. If LOWERCASE is True, the text data will be converted to lower case; if False, it will not be converted and the original case structure will be preserved.

* VOCAB_SIZE = 14000000: This variable sets the maximum vocabulary size of the tokenizer, i.e. the upper limit on the number of tokens (subword units, not only whole words) it may learn during training. Especially in NLP applications, a larger vocabulary lets the tokenizer keep more distinct tokens, but may require more memory and processing power. Here the value is passed to the BPE trainer as vocab_size.

<div style="color: black;
            background-color: #800000; /* Doğru bordo tonu */
            border-radius: 10px; /* Köşeleri yuvarlamak için */
            padding: 20px; /* Kenar boşluğu eklemek için */
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.3); /* Hafif gölgelendirme eklemek için */">
    <p style="color: white; /* Beyaz yazı için */
              font-family: Verdana, sans-serif;
              letter-spacing: 0.5px;
              text-align: center; /* Metni ortalamak için */
              margin: 0;">
Kod Blok No:2 / Code Block No.2     </p>
</div>


In [None]:
# Create a Byte-Pair Encoding tokenizer; "[UNK]" is its unknown token.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalization: always apply Unicode NFC, and lowercase only when LOWERCASE is set.
# The inner parentheses are required: a conditional expression binds more loosely
# than `+`, so without them the whole list (NFC included) is replaced by [] whenever
# LOWERCASE is False and no normalization is applied at all.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)

# Byte-level pre-tokenization.
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens and the trainer that will learn the BPE vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Wrap the training texts in a Hugging Face Dataset.
dataset = Dataset.from_pandas(train[['text']])


def train_corp_iter():
    """Yield the training texts in consecutive batches of up to 1000 essays."""
    total = len(dataset)
    start = 0
    while start < total:
        stop = start + 1000
        yield dataset[start:stop]["text"]
        start = stop


# Train the tokenizer on the batched corpus.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer class,
# declaring which strings play the special-token roles.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
# Tokenize every essay with the trained tokenizer: test set first, then train.
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]

This code trains a Byte-Pair Encoding (BPE) based tokenizer using Hugging Face's tokenizers and datasets libraries, then uses it to tokenize the training and test texts.

* raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]")): Creating a BPE based tokenizer. The special unknown token [UNK] is defined in this tokenizer.

* raw_tokenizer.normalizer and raw_tokenizer.pre_tokenizer: Normalization and pre-processing steps are added to the tokenizer. The normalization steps include Unicode normalization and optional lowercase conversion. The lowercase conversion decision is made by checking the LOWERCASE variable.

* trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens): Trainer contains the settings for training the tokenizer. vocab_size contains the size of a given vocabulary and special_tokens contains the special tokens.

* dataset = Dataset.from_pandas(train[['text']]): Creating a dataset from the text column of the training data using Hugging Face's datasets library.

* def train_corp_iter(): ...: Define a function that creates an iterator by splitting the training data into small chunks.

* raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer): The tokenizer is trained on the training data.

* tokenizer = PreTrainedTokenizerFast(...): Creating a huggingface tokenizer using Hugging Face's PreTrainedTokenizerFast class. This can then be used for model training or tasks such as text attribute extraction.

* tokenized_texts_test and tokenized_texts_train: Using the trained tokenizer, texts from test and training data are tokenized.

In [None]:
# Show the token list produced for the second test essay (index 1).
tokenized_texts_test[1]

This cell displays the tokens of the second item (index 1) in the tokenized_texts_test list.

<div style="color: black;
            background-color: #800000; /* Doğru bordo tonu */
            border-radius: 10px; /* Köşeleri yuvarlamak için */
            padding: 20px; /* Kenar boşluğu eklemek için */
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.3); /* Hafif gölgelendirme eklemek için */">
    <p style="color: white; /* Beyaz yazı için */
              font-family: Verdana, sans-serif;
              letter-spacing: 0.5px;
              text-align: center; /* Metni ortalamak için */
              margin: 0;">
Kod Blok No:4 / Code Block No.4     </p>
</div>


In [None]:
def dummy(text):
    """Identity function: hand back ``text`` exactly as it was received.

    It is supplied to ``TfidfVectorizer`` as both ``tokenizer`` and
    ``preprocessor`` so the already-tokenized essays pass through untouched.
    """
    return text

In [None]:
# `dummy` (the identity function defined in the previous cell) is passed as both
# tokenizer and preprocessor: the essays are already lists of BPE tokens, so the
# vectorizer must use them as they are. It is not redefined here, to avoid a
# duplicate definition silently shadowing the first one.
tfidf_kwargs = dict(
    ngram_range=(1, 1),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit on the test essays only, to collect the tokens that occur in the test set.
vectorizer = TfidfVectorizer(**tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)

# Getting vocab (token -> column index)
vocab = vectorizer.vocabulary_

# Report only the size: printing the whole dict floods the cell output.
print(f"TF-IDF vocabulary size: {len(vocab)}")

# Pass 2: rebuild the vectorizer with the vocabulary fixed to the one from pass 1,
# fit it on the training essays, and transform both sets into the same feature space.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is no longer needed once both matrices exist; release its memory.
del vectorizer
gc.collect()

This script is designed to convert text data to vectors using TfidfVectorizer. Below is a step-by-step explanation of the code:

* def dummy(text): return text: This is an identity function that returns its input unchanged. The texts have already been tokenized, so it is given to TfidfVectorizer as both the tokenizer and the preprocessor so that the vectorizer uses the existing tokens as they are.

* vectorizer = TfidfVectorizer(...): Creating a TfidfVectorizer object. This vectorizer is used to convert text data into vectors with the TF-IDF (Term Frequency-Inverse Document Frequency) representation. The expression ngram_range=(1, 1) indicates that it will use unigrams (single words).

* vectorizer.fit(tokenized_texts_test): Fitting the vectorizer on the test data. This step means training the vectorizer and collecting the statistics needed to represent the test data.

* vocab = vectorizer.vocabulary_: The vocabulary_ property is used to get the vocabulary of the vectorizer. This returns a dictionary containing the vocabulary that the vectorizer has learned.

* vectorizer = TfidfVectorizer(..., vocabulary=vocab): This time, the vocabulary obtained in the previous step is specified using the vocabulary parameter. This will be used to convert the training and test data into vectors using the same vocabulary.

* tf_train = vectorizer.fit_transform(tokenized_texts_train): The vectorizer is fitted on the training data, and the training data is transformed into its TF-IDF representation using the specified vocabulary.

* tf_test = vectorizer.transform(tokenized_texts_test): The test data is transformed into its TF-IDF representation with the same fitted vectorizer and vocabulary, without fitting it again.

* del vectorizer and gc.collect(): Deleting the vectorizer from memory and clearing memory. These steps are done to reduce unnecessary memory usage.

<div style="color: black;
            background-color: #800000; /* Doğru bordo tonu */
            border-radius: 10px; /* Köşeleri yuvarlamak için */
            padding: 20px; /* Kenar boşluğu eklemek için */
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.3); /* Hafif gölgelendirme eklemek için */">
    <p style="color: white; /* Beyaz yazı için */
              font-family: Verdana, sans-serif;
              letter-spacing: 0.5px;
              text-align: center; /* Metni ortalamak için */
              margin: 0;">
Kod Blok No:5 / Code Block No.5     </p>
</div>


In [None]:
# Target vector: the `label` column of the training frame as a NumPy array.
y_train = train["label"].to_numpy()

In [None]:
def get_model():
    """Build the unfitted ensemble used for the final predictions.

    Returns:
        VotingClassifier: a weighted soft-voting ensemble of MultinomialNB,
        SGDClassifier, LGBMClassifier and CatBoostClassifier.
    """
    # catboost is imported locally; it is not part of the notebook's import cell.
    from catboost import CatBoostClassifier

    naive_bayes = MultinomialNB(alpha=0.0225)

    sgd = SGDClassifier(max_iter=9000, tol=1e-4, loss="modified_huber", random_state=6743)

    lgb_params = {
        'n_iter': 3000,
        'verbose': -1,
        'objective': 'cross_entropy',
        'metric': 'auc',
        'learning_rate': 0.00581909898961407,
        'colsample_bytree': 0.78,
        'colsample_bynode': 0.8,
        'random_state': 6743,
    }
    lightgbm_clf = LGBMClassifier(**lgb_params)

    catboost_clf = CatBoostClassifier(
        iterations=3000,
        verbose=0,
        random_seed=6543,
        learning_rate=0.005599066836106983,
        subsample=0.35,
        allow_const_label=True,
        loss_function='CrossEntropy',
    )

    # (name, estimator, voting weight) for each ensemble member, kept side by side
    # so a weight cannot drift away from the estimator it belongs to.
    members = [
        ('mnb', naive_bayes, 0.2),
        ('sgd', sgd, 0.31),
        ('lgb', lightgbm_clf, 0.31),
        ('cat', catboost_clf, 0.46),
    ]

    return VotingClassifier(
        estimators=[(name, estimator) for name, estimator, _ in members],
        weights=[weight for _, _, weight in members],
        voting='soft',
        n_jobs=-1,
    )

model = get_model()
print(model)

if len(test) <= 5:
    # 5 or fewer test essays: skip training and write the sample submission unchanged.
    sub.to_csv('submission.csv', index=False)
else:
    # Otherwise fit the ensemble on the training TF-IDF features.
    model.fit(tf_train, y_train)

    gc.collect()

    # Keep the second column of predict_proba as the submitted score.
    final_preds = model.predict_proba(tf_test)[:, 1]
    sub['generated'] = final_preds
    sub.to_csv('submission.csv', index=False)

This code block creates an ensemble model and uses it to classify or predict the probability of text data. 

* get_model() function: This function creates and returns an ensemble model. The ensemble model consists of the MultinomialNB (Naive Bayes), SGDClassifier (Stochastic Gradient Descent), LGBMClassifier (LightGBM), and CatBoostClassifier models. VotingClassifier is used to combine the predictions of these models.

* model = get_model(): The ensemble model is created by calling the get_model function.

* if len(test.text.values) <= 5:: If the test dataset contains 5 or fewer samples, the model is not trained and the sample submission is saved unchanged as the submission file (submission.csv).

* else:: If the number of samples in the test dataset is greater than 5, the model is trained and prediction is performed on the test data. The prediction results are saved in the submission.csv file.

* Note: In the else block, the line model.fit(tf_train, y_train) performs the training of the model. tf_train is the TF-IDF representation of the training data and y_train is the array of training labels. The prediction results are then retrieved using model.predict_proba(tf_test)[:,1] and added to the sub['generated'] column. Finally, these predictions are saved in the submission.csv file.