In [1]:
# Ignore every Python warning raised from this point on (no category or
# module restriction is given, so nothing is exempt).
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


from transformers import PreTrainedTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


from datasets import Dataset
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import sys
import gc

In [3]:
# Unlike the online submission, which uses absolute paths, this notebook
# reads its inputs from relative paths for convenience.

train = pd.read_csv('daigt-v2-train-dataset/train_v2_drcat_02.csv', sep=',')
test = pd.read_csv('llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('llm-detect-ai-generated-text/sample_submission.csv')

In [4]:
# Keep one row per distinct essay text and renumber the rows from zero.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)
train.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [5]:
# Prompts whose essays are removed from the training data.
excluded_prompt_name_list = [
    'Distance learning',
    'Grades for extracurricular activities',
    'Summer projects',
]

# Drop the excluded prompts, de-duplicate on the essay text again and
# renumber the remaining rows from zero.
train = (
    train.loc[~train['prompt_name'].isin(excluded_prompt_name_list)]
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [6]:
# Display the raw essay texts of the test set.
test.text.values

array(["The time for the automobile is slowly passing. More and more people prefer carpooling and walking then driving a car. Why is that? It could be due to the unemployment rate skyrocketing or less people wanting a car. There are many benefits to driving a car. Being able to get to point A to point B, faster travel time, and it is more private than a bus. But what are the downsides? Pollution, more antisocial humans, and the cacophony of horns and engines. Giving up the car should be on every countries todo list.\n\nSome nations, such as Paris, France, find out about the downside of cars the hardway. Pollution is a very real problem in todays society. In 2014 the city had a nearrecord high of pollution, rivaling Beijing, China, the most polluted city in the world source 2. After debating, the French ruling party decided to ban driving for odd and even licence plate numbers on opposite days until the smog count went down. It took five days, but they managed to have most of the smog c

In [7]:
# Whether the tokenizer's normalizer should lowercase the text.
LOWERCASE = False
# Vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 305220

In [8]:
# Build a byte-pair-encoding (BPE) tokenizer; unknown pieces map to "[UNK]".
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))


# Normalization: NFC is always applied, lowercasing only when LOWERCASE is set.
# The parentheses around the conditional are required: without them Python
# parses `a + b if c else []` as `(a + b) if c else []`, which silently drops
# the NFC step whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()]
    + ([normalizers.Lowercase()] if LOWERCASE else [])
)


# Split the text with the byte-level pre-tokenizer.
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens registered with the trainer; VOCAB_SIZE is the requested
# vocabulary size.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=special_tokens,
)


# Wrap the test essays in a Hugging Face Dataset so they can be sliced in
# chunks while training the tokenizer.
dataset = Dataset.from_pandas(test[['text']])

def train_corp_iter(corpus=None, batch_size=1000):
    """
    Yield the "text" column of a dataset in consecutive chunks.

    Parameters
    ----------
    corpus : optional
        Object supporting ``len()`` and slice indexing, where a slice exposes
        a ``"text"`` entry. When omitted, the module-level ``dataset`` is
        used, so the original zero-argument call keeps working.
    batch_size : int, default 1000
        Number of rows per chunk; must be at least 1.

    Yields
    ------
    The ``"text"`` values of each chunk; the last chunk may be shorter.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (raised when iteration starts,
        because this is a generator).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if corpus is None:
        # Fall back to the notebook-level dataset built from the test essays.
        corpus = dataset
    for start in range(0, len(corpus), batch_size):
        yield corpus[start : start + batch_size]["text"]

# Learn the BPE vocabulary from the chunk generator.
# REMEMBER: the corpus being trained on here is the test set.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface,
# declaring the same special tokens that were given to the trainer.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize every test essay with the new tokenizer.
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

# Tokenize every training essay with the same tokenizer.
tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/34497 [00:00<?, ?it/s]

In [9]:
# Show the token lists of the second and third test essays, separated by a blank line.
print(tokenized_texts_test[1], '', tokenized_texts_test[2], sep='\n')

['ĠCars', 'Ġhave', 'Ġbeen', 'Ġpart', 'Ġof', 'Ġour', 'Ġdaily', 'Ġlives', 'Ġfor', 'Ġdecades', 'Ġand', 'Ġhave', 'Ġbecome', 'Ġsomewhat', 'Ġidolized', 'Ġin', 'ĠAmerican', 'Ġculture', '.', 'ĠBut', 'Ġcars', 'Ġare', 'Ġslowly', 'Ġkilling', 'Ġour', 'Ġplanet', 'Ġand', 'Ġare', 'Ġspeeding', 'Ġup', 'Ġthe', 'ĠGreenhouse', 'Ġeffect', 'Ġtaking', 'Ġplace', 'Ġwithin', 'Ġour', 'ĠEarths', 'Ġatmosphere', '.', 'ĠWe', "'ve", 'Ġbecome', 'Ġentirely', 'Ġtoo', 'Ġdependent', 'Ġon', 'Ġcars', 'Ġto', 'Ġget', 'Ġus', 'Ġwhere', 'Ġwe', 'Ġwish', 'Ġto', 'Ġgo', ',', 'Ġmaking', 'Ġmany', 'Ġpeople', 'Ġlazy', '.', 'ĠCars', 'Ġalso', 'Ġemit', 'Ġgasses', 'Ġthat', 'Ġcan', 'Ġharm', 'Ġour', 'Ġinternal', 'Ġbody', 'Ġstructures', ',', 'Ġcausing', 'Ġproblems', 'Ġwith', 'Ġlung', 'Ġdiseases', 'Ġand', 'Ġjust', 'Ġthe', 'Ġsimple', 'Ġact', 'Ġof', 'Ġbreathing', '.', 'ĠMany', 'Ġcities', 'Ġfrom', 'Ġall', 'Ġover', 'Ġthe', 'Ġworld', 'Ġhave', 'Ġtaken', 'Ġmeasures', 'Ġto', 'Ġlimit', 'Ġthe', 'Ġuse', 'Ġof', 'Ġcars', 'Ġin', 'Ġdaily', 'Ġlife', ',', 'Ġmak

In [10]:
def dummy(text):
    """
    Identity pass-through: hand back the argument untouched.

    Used as both the `tokenizer` and the `preprocessor` of TfidfVectorizer,
    because the essays have already been tokenized before vectorization.
    """
    return text



# Settings shared by both vectorizers below. The essays are already token
# lists, so `dummy` is plugged in as tokenizer and preprocessor to pass them
# through unchanged.
tfidf_kwargs = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit a TfidfVectorizer on the test tokens to collect the n-gram vocabulary.
vectorizer = TfidfVectorizer(**tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_
print(vocab)

# Pass 2: a fresh vectorizer restricted to the vocabulary from the test fit,
# this time fitted on the train set.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# Release the vectorizer and trigger a garbage-collection pass.
del vectorizer
gc.collect()

{'ĠThe Ġtime Ġfor': 3205, 'Ġtime Ġfor Ġthe': 14130, 'Ġfor Ġthe Ġautomobile': 7482, 'Ġthe Ġautomobile Ġis': 13040, 'Ġautomobile Ġis Ġslowly': 4781, 'Ġis Ġslowly Ġpassing': 8945, 'Ġslowly Ġpassing .': 12269, 'Ġpassing . ĠMore': 11115, '. ĠMore Ġand': 1148, 'ĠMore Ġand Ġmore': 2936, 'Ġand Ġmore Ġpeople': 4320, 'Ġmore Ġpeople Ġprefer': 9934, 'Ġpeople Ġprefer Ġcarpooling': 11252, 'Ġprefer Ġcarpooling Ġand': 11578, 'Ġcarpooling Ġand Ġwalking': 5673, 'Ġand Ġwalking Ġthen': 4430, 'Ġwalking Ġthen Ġdriving': 15228, 'Ġthen Ġdriving Ġa': 13763, 'Ġdriving Ġa Ġcar': 6717, 'Ġa Ġcar .': 3486, 'Ġcar . ĠWhy': 5537, '. ĠWhy Ġis': 1375, 'ĠWhy Ġis Ġthat': 3381, 'Ġis Ġthat ?': 8957, 'Ġthat ? ĠIt': 12816, '? ĠIt Ġcould': 1541, 'ĠIt Ġcould Ġbe': 2867, 'Ġcould Ġbe Ġdue': 6265, 'Ġbe Ġdue Ġto': 4856, 'Ġdue Ġto Ġthe': 6781, 'Ġto Ġthe Ġunemployment': 14605, 'Ġthe Ġunemployment Ġrate': 13572, 'Ġunemployment Ġrate Ġskyrocketing': 14803, 'Ġrate Ġskyrocketing Ġor': 11787, 'Ġskyrocketing Ġor Ġless': 12246, 'Ġor Ġless Ġ

69

In [24]:
# Training targets: the `label` column of the filtered training frame.
y_train = train['label'].values

In [12]:
# Display a summary of the training TF-IDF matrix.
tf_train

<34497x16199 sparse matrix of type '<class 'numpy.float64'>'
	with 1044790 stored elements in Compressed Sparse Row format>

In [13]:
# Shape of the training TF-IDF matrix.
tf_train.shape

(34497, 16199)

In [14]:
# Shape of the test TF-IDF matrix; it was built with the same vectorizer as tf_train.
tf_test.shape

(10, 16199)

In [22]:
def get_model():
    """
    Build the unfitted soft-voting ensemble used for the final predictions.

    The ensemble combines four classifiers (name -- estimator, voting weight):

    * ``mnb`` -- MultinomialNB, 0.1
    * ``sgd`` -- SGDClassifier with modified Huber loss, 0.31
    * ``lgb`` -- LGBMClassifier, 0.28
    * ``cat`` -- CatBoostClassifier, 0.67

    All estimator classes come from the notebook's top-level import cell;
    nothing is imported inside the function.

    Returns
    -------
    VotingClassifier
        Soft-voting ensemble over the four estimators, configured with
        ``n_jobs=-1``.
    """
    naive_bayes = MultinomialNB(alpha=0.0225)

    sgd_model = SGDClassifier(
        max_iter=9000,
        tol=1e-4,
        loss="modified_huber",
        random_state=6743,
    )

    # LightGBM settings, including the seed, kept together in one mapping.
    lgb_params = {
        'n_iter': 3000,
        'verbose': -1,
        'objective': 'cross_entropy',
        'metric': 'auc',
        'learning_rate': 0.00581909898961407,
        'colsample_bytree': 0.78,
        'colsample_bynode': 0.8,
        'random_state': 6743,
    }
    lgb = LGBMClassifier(**lgb_params)

    # CatBoost classifier with specified iterations, learning rate, and subsample
    cat = CatBoostClassifier(
        iterations=3000,
        verbose=0,
        random_seed=6543,
        learning_rate=0.005599066836106983,
        subsample=0.35,
        allow_const_label=True,
        loss_function='CrossEntropy',
    )

    # Voting weights, in the same order as the estimators listed below.
    weights = [0.1, 0.31, 0.28, 0.67]

    # Soft voting: the ensemble combines the estimators' predicted
    # probabilities using the weights above.
    return VotingClassifier(
        estimators=[
            ('mnb', naive_bayes),
            ('sgd', sgd_model),
            ('lgb', lgb),
            ('cat', cat),
        ],
        weights=weights,
        voting='soft',
        n_jobs=-1,
    )

# Build the ensemble model and print its configuration.
model = get_model()
print(model)

VotingClassifier(estimators=[('mnb', MultinomialNB(alpha=0.0225)),
                             ('sgd',
                              SGDClassifier(loss='modified_huber',
                                            max_iter=9000, random_state=6743,
                                            tol=0.0001)),
                             ('lgb',
                              LGBMClassifier(colsample_bynode=0.8,
                                             colsample_bytree=0.78,
                                             learning_rate=0.00581909898961407,
                                             metric='auc', n_iter=3000,
                                             objective='cross_entropy',
                                             random_state=6743, verbose=-1)),
                             ('cat',
                              <catboost.core.CatBoostClassifier object at 0x31964a680>)],
                 n_jobs=-1, voting='soft', weights=[0.1, 0.31, 0.28, 0.67])


In [27]:
# With five or fewer test essays the sample submission is written out
# unchanged; otherwise the ensemble is trained and its predictions fill the
# `generated` column first.
if len(test) > 5:
    # Fit the ensemble model on the training data
    model.fit(tf_train, y_train)

    # Probability of the positive class for every test essay
    final_preds = model.predict_proba(tf_test)[:, 1]
    sub['generated'] = final_preds

    # Quick sanity check: max, min and mean of the predicted probabilities
    generated = sub['generated']
    print(generated.max(), generated.min(), generated.mean())

# Save the submission file without the index column
sub.to_csv('submission.csv', index=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.08836230134556505 0.03208475226377537 0.05951116483354726


In [29]:
# Display the final submission table.
sub

Unnamed: 0,id,generated
0,ca99f3b0,0.054075
1,cab59ae3,0.088362
2,cad31d61,0.082673
3,cb2a22f1,0.055237
4,cba3c89f,0.054217
5,cbc48dd7,0.083371
6,cc01c188,0.032085
7,cc1dca35,0.04572
8,cd3b5043,0.05776
9,cd53e658,0.041612
