In [1]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
import os
os.environ["TOKENIZERS_PARALLELISM"] = "False"

In [2]:
import sklearn
import lightgbm

print(sklearn.__version__)
print(lightgbm.__version__)

1.2.2
3.3.2


In [3]:
test = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/test_essays.csv')
sub = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/sample_submission.csv')
org_train = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/train_essays.csv')

In [4]:
train = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/daigt/train_v2_drcat_02.csv', sep=',')
display(train.shape);

(44868, 5)

#### Read and append the generated data

In [5]:
train_gen_data = pd.read_parquet('/home/peng_sun2/s3shared/kaggle/llm-2023/external_data/gen_data_21122023.parquet')
display(train_gen_data.shape)

(18894, 5)

In [6]:
train = train.drop_duplicates(subset = ['text'])
train.reset_index(drop = True, inplace = True)

### append the generated data
# train = pd.concat([train, train_gen_data], axis = 0).reset_index(drop = True)
# display(train.shape)

In [7]:
LOWERCASE = False
VOCAB_SIZE = 30522

In [8]:
## Create byte-pair encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token='[UNK]'))

In [9]:
## adding normalization and pre_tokenizer
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()]+ [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

In [10]:
# Adding special tokens and creating trainer instance
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size = VOCAB_SIZE, special_tokens = special_tokens)

In [11]:
# creating huggingface dataset object
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter():
    for i in range(0, len(dataset), 1000):
        yield dataset[i:i+1000]['text']
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer = trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []
for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))






  if _pandas_api.is_sparse(col):


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [12]:
tokenized_texts_test[1]

['ĠBbb', 'Ġccc', 'Ġddd', '.']

In [13]:
def dummy(text):
    return text

In [14]:
vectorizer = TfidfVectorizer(ngram_range = (3, 5), lowercase=False, sublinear_tf = True, analyzer = 'word',
                            tokenizer = dummy,
                            preprocessor = dummy,
                            token_pattern = None,
                            strip_accents = 'unicode')
vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

print(vocab)

vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, 
                             #vocabulary=vocab,
                            analyzer = 'word',
                            tokenizer = dummy,
                            preprocessor = dummy,
                            token_pattern = None, strip_accents='unicode'
                            )

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)
## get the vocabulary of the training dataset
train_vocab = vectorizer.vocabulary_

del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


21

In [15]:
type(train_vocab)

dict

In [16]:
### save train_vocab
import joblib
model_path = '/home/peng_sun2/s3shared/kaggle/llm-2023/baseline1/'
joblib.dump(train_vocab, f'{model_path}vocab_train.pkl')

['/home/peng_sun2/s3shared/kaggle/llm-2023/baseline1/vocab_train.pkl']

In [17]:
y_train = train['label'].values

In [18]:


# # Check if it's in scoring stage
# if len(test.text.values) <= 5:
#     # if not, just sample submission
#     sub.to_csv('submission.csv', index=False)
# else:
# otherwise, run fitting process
clf = MultinomialNB(alpha=0.02)
clf2 = MultinomialNB(alpha=0.01)
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 
p6={
    'n_iter': 1500,
    'verbose': -1,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05073909898961407, 
    'colsample_bytree': 0.726023996436955, 
    'colsample_bynode': 0.5803681307354022, 
    'lambda_l1': 8.562963348932286, 
    'lambda_l2': 4.893256185259296, 
    'min_data_in_leaf': 115, 
    'max_depth': 123, 
    'max_bin': 900,
    #'device': 'gpu'
}
lgb=LGBMClassifier(**p6)

from catboost import CatBoostClassifier

cat=CatBoostClassifier(
    iterations=1000,
    verbose=0,
    l2_leaf_reg=6.6591278779517808,
    learning_rate=0.005689066836106983,
    allow_const_label=True
)
#weights = [0.1, 0.45, 0.45, 0.45]
weights = [0.1]
weights = [w/sum(weights) for w in weights]

ensemble = VotingClassifier(
    estimators=[
#       ('mnb',clf),
        ('sgd', sgd_model),
#          ('lgb',lgb), 
#         ('cat', cat)
    ],
    weights=weights, 
    voting='soft', n_jobs=-1)
    
    
    

In [19]:
import joblib 
ensemble.fit(tf_train, y_train )
## output the models
model_path = '/home/peng_sun2/s3shared/kaggle/llm-2023/baseline1/'
# joblib.dump(clf, f'{model_path}clf.pkl');
# joblib.dump(sgd_model, f'{model_path}sgd_model.pkl')
# joblib.dump(lgb, f'{model_path}lgb.pkl')
# joblib.dump(cat, f'{model_path}cat.pkl')
joblib.dump(ensemble, f'{model_path}ensemble.pkl');

[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


LightGBMError: GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1

In [None]:
ensemble

In [None]:
# ensemble.predict(tf_train[0:10, :])

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
## cv split
skf = StratifiedKFold(n_splits = 5)
skf.get_n_splits(tf_train, y_train)

aus_list = []
for i, (train_idx, test_idx) in enumerate(skf.split(tf_train, y_train)):
    train_X = tf_train[train_idx]
    eval_X = tf_train[test_idx];
    train_y = y_train[train_idx]
    eval_y = y_train[test_idx];
    
    print(train_X.shape)
    ensemble.fit(train_X, train_y)
    joblib.dump(clf, f'{model_path}clf_cv_{i}.pkl');
    joblib.dump(sgd_model, f'{model_path}sgd_model_cv_{i}.pkl')
    joblib.dump(lgb, f'{model_path}lgb_cv_{i}.pkl')
    joblib.dump(cat, f'{model_path}cat_cv_{i}.pkl')
    joblib.dump(ensemble, f'{model_path}ensemble_cv_{i}.pkl');
    eval_preds = ensemble.predict_proba(eval_X)[:,1]
    #eval_preds = ensemble.predict(eval_X)
    print('eval preds:');
    print(eval_preds)
    ## compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(eval_y, eval_preds, pos_label = 1)
    print(fpr, tpr)
    print(metrics.auc(fpr, tpr))
    aus_list.append(metrics.auc(fpr, tpr))
    #break;
    

(35894, 20784)
eval preds:
[0.         0.         0.06913218 ... 1.         1.         1.        ]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.82648402e-04
 1.82648402e-04 3.65296804e-04 3.65296804e-04 5.47945205e-04
 5.47945205e-04 7.30593607e-04 7.30593607e-04 9.13242009e-04
 9.13242009e-04 1.09589041e-03 1.09589041e-03 1.27853881e-03
 1.27853881e-03 1.46118721e-03 1.46118721e-03 1.64383562e-03
 1.64383562e-03 1.82648402e-03 1.82648402e-03 2.00913242e-03
 2.00913242e-03 2.19178082e-03 2.19178082e-03 2.37442922e-03
 2.37442922e-03 2.55707763e-03 2.55707763e-03 2.73972603e-03
 2.73972603e-03 2.92237443e-03 2.92237443e-03 3.10502283e-03
 3.10502283e-03 3.28767123e-03 3.28767123e-03 3.47031963e-03
 3.47031963e-03 3.65296804e-03 3.65296804e-03 3.83561644e-03
 3.83561644e-03 4.20091324e-03 4.20091324e-03 4.38356164e-03
 4.38356164e-03 4.56621005e-03 4.56621005e-03 4.74885845e-03
 4.74885845e-03 4.93150685e-03 4.93150685e-03 5.11415525e-03
 5.11415525e-03 5.47945205e-03 5.47945205e-03 5

(35894, 20784)
eval preds:
[0.20180468 0.         0.22394656 ... 1.         1.         1.        ]
[0.00000000e+00 1.82681768e-04 1.82681768e-04 3.65363537e-04
 3.65363537e-04 5.48045305e-04 5.48045305e-04 7.30727073e-04
 7.30727073e-04 9.13408842e-04 9.13408842e-04 1.09609061e-03
 1.09609061e-03 1.27877238e-03 1.27877238e-03 1.46145415e-03
 1.46145415e-03 1.82681768e-03 1.82681768e-03 2.00949945e-03
 2.00949945e-03 2.19218122e-03 2.19218122e-03 2.37486299e-03
 2.37486299e-03 2.55754476e-03 2.55754476e-03 2.74022653e-03
 2.74022653e-03 3.10559006e-03 3.10559006e-03 3.28827183e-03
 3.28827183e-03 3.47095360e-03 3.47095360e-03 3.65363537e-03
 3.65363537e-03 4.01899890e-03 4.01899890e-03 4.20168067e-03
 4.20168067e-03 4.56704421e-03 4.56704421e-03 4.74972598e-03
 4.74972598e-03 4.93240775e-03 4.93240775e-03 5.11508951e-03
 5.11508951e-03 5.29777128e-03 5.29777128e-03 5.48045305e-03
 5.48045305e-03 5.66313482e-03 5.66313482e-03 5.84581659e-03
 5.84581659e-03 6.02849836e-03 6.02849836e-03 6

(35895, 20784)
eval preds:
[0. 0. 0. ... 1. 1. 1.]
[0.00000000e+00 1.82681768e-04 1.82681768e-04 3.65363537e-04
 3.65363537e-04 5.48045305e-04 5.48045305e-04 7.30727073e-04
 7.30727073e-04 9.13408842e-04 9.13408842e-04 1.09609061e-03
 1.09609061e-03 1.27877238e-03 1.27877238e-03 1.46145415e-03
 1.46145415e-03 1.64413592e-03 1.64413592e-03 1.82681768e-03
 1.82681768e-03 2.00949945e-03 2.00949945e-03 2.19218122e-03
 2.19218122e-03 2.37486299e-03 2.37486299e-03 2.55754476e-03
 2.55754476e-03 2.74022653e-03 2.74022653e-03 2.92290829e-03
 2.92290829e-03 3.28827183e-03 3.28827183e-03 3.47095360e-03
 3.47095360e-03 3.65363537e-03 3.65363537e-03 3.83631714e-03
 3.83631714e-03 4.01899890e-03 4.01899890e-03 4.20168067e-03
 4.20168067e-03 4.56704421e-03 4.56704421e-03 5.11508951e-03
 5.11508951e-03 5.29777128e-03 5.29777128e-03 5.84581659e-03
 5.84581659e-03 6.02849836e-03 6.02849836e-03 6.57654366e-03
 6.57654366e-03 7.12458897e-03 7.12458897e-03 7.48995250e-03
 7.48995250e-03 7.67263427e-03 7.6

In [22]:
import sklearn 
import lightgbm
display(sklearn.__version__)
display(lightgbm.__version__)

'1.2.2'

'3.3.2'

In [20]:
# experiment with only sgb classifier, original parameter setting
aus_list

[0.9966875336854236,
 0.9941866485724724,
 0.9953935487238375,
 0.997091005593435,
 0.9468253521570912]

In [20]:
# experiment with only catboost, original parameter setting
aus_list

[0.9951818197240959,
 0.988915444438645,
 0.9920609113210502,
 0.9955969986936087,
 0.9605266936228868]

In [21]:
# experiment with only lightgbm 4, with cpu max bin 900 and max_depth 123
aus_list

[0.9978818736207736,
 0.9945432433843102,
 0.9961483897906989,
 0.9974760260852231,
 0.9665304445771498]



In [20]:
# experiment with only lightgbm 3
aus_list

[0.9633644054857161,
 0.9755928284357223,
 0.9867114150007829,
 0.9750459523745131,
 0.7752189335791227]

In [21]:
#experiment with only lightgbm2, depth 23
aus_list

[0.9980322623163044,
 0.994754319118952,
 0.9964657341197348,
 0.9976234663006696,
 0.9670913856801093]

In [20]:
#experiment with only lightgbm, max_bin 250, max-depth = 123
aus_list

[0.9980014642148245,
 0.9947916383944883,
 0.9964219426901196,
 0.9976602219351152,
 0.9670607907912099]

In [22]:
#experiment wiht only lightgbm, max_bin 200
aus_list

[0.9979935819888527,
 0.9949521895714809,
 0.9962090923325854,
 0.997664973018545,
 0.966284693481503]

In [21]:
#experiment without catboost
aus_list

[0.9974415651699572,
 0.9950194686570281,
 0.996574873427632,
 0.9978453575597517,
 0.9644704583375404]



In [23]:
aus_list

[0.9976213425623237,
 0.9957595907928388,
 0.9970352314838978,
 0.9979617852086347,
 0.9608195900848752]

In [19]:
## with external data
aus_list

[0.9948483859465849,
 0.9940963507093242,
 0.9947070055108838,
 0.9951455995069962,
 0.9753977531773758]

In [19]:
## without external data
aus_list

[0.9974293503297094,
 0.9950473406754007,
 0.9966961219270317,
 0.9978349678278557,
 0.9645059087292857]