In [1]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
import os
os.environ["TOKENIZERS_PARALLELISM"] = "False"

In [2]:
import sklearn
import lightgbm

print(sklearn.__version__)
print(lightgbm.__version__)

1.2.2
3.3.2


In [3]:
test = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/test_essays.csv')
sub = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/sample_submission.csv')
org_train = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/train_essays.csv')

In [4]:
train = pd.read_csv('/home/peng_sun2/s3shared/kaggle/llm-2023/data/daigt/train_v2_drcat_02.csv', sep=',')
display(train.shape);

(44868, 5)

#### Read and append the generated data

In [5]:
train_gen_data = pd.read_parquet('/home/peng_sun2/s3shared/kaggle/llm-2023/external_data/gen_data_21122023.parquet')
display(train_gen_data.shape)

(18894, 5)

In [6]:
train = train.drop_duplicates(subset = ['text'])
train.reset_index(drop = True, inplace = True)

### append the generated data
# train = pd.concat([train, train_gen_data], axis = 0).reset_index(drop = True)
# display(train.shape)

In [7]:
train_0 = train.loc[train['label'] ==0, :].reset_index(drop = True)
train_1 = train.loc[train['label'] ==1, :].reset_index(drop = True)

In [8]:
train_0.shape

(27371, 5)

In [9]:
train_1.shape

(17497, 5)

In [10]:
# min_length = min(len(train_0), len(train_1))
# min_length

In [11]:
# sorted_train= pd.DataFrame()
# for i in tqdm(range(min_length), total = min_length):
#     sorted_train = pd.concat([sorted_train, pd.DataFrame(train_0.iloc[i,:]).T], axis = 0)
#     sorted_train = pd.concat([sorted_train, pd.DataFrame(train_1.iloc[i,:]).T], axis = 0)

In [12]:
# if len(train_0) > min_length:
#     sorted_train = pd.concat([sorted_train, train_0.iloc[min_length:, :]], axis = 0)
# elif len(train_1) > min_length:
#     sorted_train = pd.concat([sorted_train, train_1.iloc[min_length:, :]], axis = 0)

In [13]:
from tqdm import tqdm 
def sort_train(train):
    train_0 = train.loc[train['label'] ==0, :].reset_index(drop = True)
    train_1 = train.loc[train['label'] ==1, :].reset_index(drop = True)
    
    min_length = min(len(train_0), len(train_1))
    
    sorted_train= pd.DataFrame()
    for i in tqdm(range(min_length), total = min_length):
        sorted_train = pd.concat([sorted_train, pd.DataFrame(train_0.iloc[i,:]).T], axis = 0)
        sorted_train = pd.concat([sorted_train, pd.DataFrame(train_1.iloc[i,:]).T], axis = 0)
    
    if len(train_0) > min_length:
        sorted_train = pd.concat([sorted_train, train_0.iloc[min_length:, :]], axis = 0)
    elif len(train_1) > min_length:
        sorted_train = pd.concat([sorted_train, train_1.iloc[min_length:, :]], axis = 0)
        
    del train_0, train_1, train
    sorted_train['label'] = sorted_train['label'].astype(np.int32)
    return sorted_train.reset_index(drop = True)

In [14]:
display(train.shape)
train.head(10)

(44868, 5)

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False
5,Cell phone use should not be legal while drivi...,0,Phones and driving,persuade_corpus,False
6,Phones and Driving\n\nDriving is a good way to...,0,Phones and driving,persuade_corpus,False
7,PHONES AND DRIVING\n\nIn this world in which w...,0,Phones and driving,persuade_corpus,False
8,People are debating whether if drivers should ...,0,Phones and driving,persuade_corpus,False
9,Texting and driving\n\nOver half of drivers in...,0,Phones and driving,persuade_corpus,False


In [15]:
train = sort_train(train)

100%|██████████| 17497/17497 [00:45<00:00, 384.15it/s]


In [16]:
display(train.shape)
train.head(10)

(44868, 5)

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,"In recent years, technology has had a profoun...",1,Car-free cities,mistral7binstruct_v2,True
2,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
3,I strongly believe that meditation and mindful...,1,Distance learning,llama_70b_v1,False
4,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
5,One way school administrators can attempt to c...,1,Cell phones at school,chat_gpt_moth,False
6,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
7,While summer is meant as a break from the regu...,1,Summer projects,darragh_claude_v7,False
8,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False
9,The use of Facial Action Coding System (FACS) ...,1,Facial action coding system,darragh_claude_v6,True


In [17]:
LOWERCASE = False
VOCAB_SIZE = 30522

In [18]:
## Create byte-pair encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token='[UNK]'))

In [19]:
## adding normalization and pre_tokenizer
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()]+ [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

In [20]:
# Adding special tokens and creating trainer instance
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size = VOCAB_SIZE, special_tokens = special_tokens)

In [21]:
# creating huggingface dataset object
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter():
    for i in range(0, len(dataset), 1000):
        yield dataset[i:i+1000]['text']
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer = trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []
for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

  if _pandas_api.is_sparse(col):





100%|██████████| 3/3 [00:00<00:00, 2619.80it/s]






100%|██████████| 44868/44868 [02:06<00:00, 353.58it/s]


In [22]:
tokenized_texts_test[1]

['ĠBbb', 'Ġccc', 'Ġddd', '.']

In [23]:
def dummy(text):
    return text

In [24]:
vectorizer = TfidfVectorizer(ngram_range = (3, 5), lowercase=False, sublinear_tf = True, analyzer = 'word',
                            tokenizer = dummy,
                            preprocessor = dummy,
                            token_pattern = None,
                            strip_accents = 'unicode',
                            dtype = np.float32)
vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

print(vocab)

vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, 
                             #vocabulary=vocab,
                            analyzer = 'word',
                            tokenizer = dummy,
                            preprocessor = dummy,
                            token_pattern = None, strip_accents='unicode',
                            dtype = np.float32
                            )

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)
## get the vocabulary of the training dataset
train_vocab = vectorizer.vocabulary_

del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


0

### Convert the type from numpy.float64 to numpy.float16

In [25]:
# import numpy as np
# from scipy.sparse import csr_matrix

# ## convert data for tf_train
# data  = tf_train.data.astype(np.float16)
# indices = tf_train.indices
# indptr = tf_train.indptr
# tf_train = csr_matrix((data, indices, indptr), shape = tf_train.shape)


# ## convert data for tf_test
# data = tf_test.data.astype(np.float16)
# indices = tf_test.indices
# indptr = tf_test.indptr
# tf_test = csr_matrix((data, indices, indptr), shape = tf_test.shape)

# del data, indices, indptr
# gc.collect()

## Start training

In [26]:
### save train_vocab
import joblib
model_path = '/home/peng_sun2/s3shared/kaggle/llm-2023/baseline1/'
joblib.dump(train_vocab, f'{model_path}vocab_train.pkl')

['/home/peng_sun2/s3shared/kaggle/llm-2023/baseline1/vocab_train.pkl']

In [27]:
y_train = train['label'].values

In [28]:


# # Check if it's in scoring stage
# if len(test.text.values) <= 5:
#     # if not, just sample submission
#     sub.to_csv('submission.csv', index=False)
# else:
# otherwise, run fitting process
clf = MultinomialNB(alpha=0.02)
clf2 = MultinomialNB(alpha=0.01)
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 
p6={
    'n_iter': 1500,
    'verbose': -1,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05073909898961407, 
    'colsample_bytree': 0.726023996436955, 
    'colsample_bynode': 0.5803681307354022, 
    'lambda_l1': 8.562963348932286, 
    'lambda_l2': 4.893256185259296, 
    'min_data_in_leaf': 115, 
    'max_depth': 123, 
    'max_bin': 250,
    'device': 'gpu'
}
lgb=LGBMClassifier(**p6)

from catboost import CatBoostClassifier

cat=CatBoostClassifier(
    iterations=1000,
    verbose=0,
    l2_leaf_reg=6.6591278779517808,
    learning_rate=0.005689066836106983,
    allow_const_label=True,
    #used_ram_limit="2048MB"
)
weights = [0.45, 0.45, 0.3, 0.45]
#weights = [0.1]
weights = [w/sum(weights) for w in weights]

# ensemble = VotingClassifier(
#     estimators=[
#       ('mnb',clf),
# #         ('sgd', sgd_model),
# #          ('lgb',lgb), 
# #         ('cat', cat)
#     ],
#     weights=weights, 
#     voting='soft', n_jobs=1)
    
    
    

In [29]:
# import joblib 
# ensemble.fit(tf_train, y_train )
# ## output the models
# model_path = '/home/peng_sun2/s3shared/kaggle/llm-2023/baseline1/'
# # joblib.dump(clf, f'{model_path}clf.pkl');
# # joblib.dump(sgd_model, f'{model_path}sgd_model.pkl')
# # joblib.dump(lgb, f'{model_path}lgb.pkl')
# # joblib.dump(cat, f'{model_path}cat.pkl')
# joblib.dump(ensemble, f'{model_path}ensemble.pkl');

In [30]:
def split_range_into_parts(start, end, num_parts):
    splits = np.linspace(start, end, num_parts + 1)
    ranges = [(int(np.floor(splits[i])), int(np.ceil(splits[i + 1]))) for i in range(num_parts)]
    return np.array(ranges)

In [31]:
def increment_train_catboost(tf_train, y_train, cat, num_parts):
    split_ranges = split_range_into_parts(0, tf_train.shape[0], num_parts);
    #print(split_ranges)
    
    ## incrementally train catboost
    for i, (start, end) in enumerate(split_ranges):
        print('start, end ', start, ' ', end)
        train_X_split = tf_train[start:end, :]
        train_y_split = y_train[start:end]
        
        #print(train_X_split.shape, train_y_split.shape)
        if i ==0:
            cat.fit(train_X_split, train_y_split)
        else: 
            cat.fit(train_X_split, train_y_split, init_model = cat)
        print(f'part {i} is finished ...')
    return cat
    

In [32]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn import metrics
import lightgbm
## cv split
skf = StratifiedGroupKFold(n_splits = 5)
skf.get_n_splits(tf_train, y_train, groups =train['prompt_name'])

aus_list = []
for i, (train_idx, test_idx) in enumerate(skf.split(tf_train, y_train, groups = train['prompt_name'])):
    train_X = tf_train[train_idx]
    eval_X = tf_train[test_idx];
    train_y = y_train[train_idx]
    eval_y = y_train[test_idx];
    
    print(train_X.shape)
#     ensemble.fit(train_X, train_y)
#     joblib.dump(clf, f'{model_path}clf_cv_{i}.pkl');
#     joblib.dump(sgd_model, f'{model_path}sgd_model_cv_{i}.pkl')
#     joblib.dump(lgb, f'{model_path}lgb_cv_{i}.pkl')
#     joblib.dump(cat, f'{model_path}cat_cv_{i}.pkl')
#     joblib.dump(ensemble, f'{model_path}ensemble_cv_{i}.pkl');
#     eval_preds = ensemble.predict_proba(eval_X)[:,1]
    ## fit each model
    clf.fit(train_X, train_y);
    gc.collect()
    
    sgd_model.fit(train_X, train_y)
    gc.collect()
    
    lgb.fit(train_X, train_y)
    gc.collect()
    
    cat = increment_train_catboost(train_X, train_y, cat, num_parts = 2)
    gc.collect()
    

    eval_preds = clf.predict_proba(eval_X)[:, 1]* weights[0] + sgd_model.predict_proba(eval_X)[:, 1]*weights[1] \
        + lgb.predict_proba(eval_X)[:, 1]*weights[2] + cat.predict_proba(eval_X)[:, 1]*weights[3]
    
    #eval_preds = ensemble.predict(eval_X)
    ## compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(eval_y, eval_preds, pos_label = 1)
    print(fpr, tpr)
    print(metrics.auc(fpr, tpr))
    aus_list.append(metrics.auc(fpr, tpr))
    #break;
    

(36017, 20784)




start, end  0   18009
part 0 is finished ...
start, end  18008   36017
part 1 is finished ...
[0.00000000e+00 0.00000000e+00 0.00000000e+00 2.34631628e-04
 2.34631628e-04 4.69263257e-04 4.69263257e-04 7.03894885e-04
 7.03894885e-04 9.38526513e-04 9.38526513e-04 1.17315814e-03
 1.17315814e-03 1.40778977e-03 1.40778977e-03 1.64242140e-03
 1.64242140e-03 1.87705303e-03 1.87705303e-03 2.11168466e-03
 2.11168466e-03 2.34631628e-03 2.34631628e-03 2.58094791e-03
 2.58094791e-03 2.81557954e-03 2.81557954e-03 3.05021117e-03
 3.05021117e-03 3.28484280e-03 3.28484280e-03 3.51947443e-03
 3.51947443e-03 3.75410605e-03 3.75410605e-03 3.98873768e-03
 3.98873768e-03 4.22336931e-03 4.22336931e-03 4.45800094e-03
 4.45800094e-03 4.69263257e-03 4.69263257e-03 4.92726420e-03
 4.92726420e-03 5.16189582e-03 5.16189582e-03 5.39652745e-03
 5.39652745e-03 5.86579071e-03 5.86579071e-03 6.33505397e-03
 6.33505397e-03 6.56968559e-03 6.56968559e-03 7.03894885e-03
 7.03894885e-03 7.27358048e-03 7.27358048e-03 7.5082



start, end  0   16980
part 0 is finished ...
start, end  16980   33960
part 1 is finished ...
[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.32345156e-04
 1.32345156e-04 2.64690312e-04 2.64690312e-04 3.97035469e-04
 3.97035469e-04 5.29380625e-04 5.29380625e-04 6.61725781e-04
 6.61725781e-04 7.94070937e-04 7.94070937e-04 9.26416093e-04
 9.26416093e-04 1.05876125e-03 1.05876125e-03 1.19110641e-03
 1.19110641e-03 1.32345156e-03 1.32345156e-03 1.45579672e-03
 1.45579672e-03 1.58814187e-03 1.58814187e-03 1.72048703e-03
 1.72048703e-03 1.85283219e-03 1.85283219e-03 1.98517734e-03
 1.98517734e-03 2.11752250e-03 2.11752250e-03 2.24986765e-03
 2.24986765e-03 2.38221281e-03 2.38221281e-03 2.51455797e-03
 2.51455797e-03 2.64690312e-03 2.64690312e-03 2.77924828e-03
 2.77924828e-03 2.91159344e-03 2.91159344e-03 3.04393859e-03
 3.04393859e-03 3.17628375e-03 3.17628375e-03 3.30862890e-03
 3.30862890e-03 3.44097406e-03 3.44097406e-03 3.83800953e-03
 3.83800953e-03 3.97035469e-03 3.97035469e-03 4.1026



start, end  0   18711
part 0 is finished ...
start, end  18710   37421
part 1 is finished ...
[0.00000000e+00 0.00000000e+00 0.00000000e+00 2.67379679e-04
 2.67379679e-04 5.34759358e-04 5.34759358e-04 8.02139037e-04
 8.02139037e-04 1.06951872e-03 1.06951872e-03 1.33689840e-03
 1.33689840e-03 1.60427807e-03 1.60427807e-03 1.87165775e-03
 1.87165775e-03 2.13903743e-03 2.13903743e-03 2.40641711e-03
 2.40641711e-03 2.67379679e-03 2.67379679e-03 3.20855615e-03
 3.20855615e-03 3.47593583e-03 3.47593583e-03 3.74331551e-03
 3.74331551e-03 4.27807487e-03 4.27807487e-03 4.54545455e-03
 4.54545455e-03 4.81283422e-03 4.81283422e-03 5.08021390e-03
 5.08021390e-03 5.88235294e-03 5.88235294e-03 6.68449198e-03
 6.68449198e-03 6.95187166e-03 6.95187166e-03 7.21925134e-03
 7.21925134e-03 7.48663102e-03 7.48663102e-03 7.75401070e-03
 7.75401070e-03 8.02139037e-03 8.02139037e-03 8.28877005e-03
 8.28877005e-03 8.55614973e-03 8.55614973e-03 8.82352941e-03
 8.82352941e-03 9.09090909e-03 9.09090909e-03 9.3582



start, end  0   18675
part 0 is finished ...
start, end  18675   37350
part 1 is finished ...
[0.00000000e+00 0.00000000e+00 0.00000000e+00 2.04876050e-04
 2.04876050e-04 4.09752100e-04 4.09752100e-04 6.14628150e-04
 6.14628150e-04 8.19504200e-04 8.19504200e-04 1.02438025e-03
 1.02438025e-03 1.22925630e-03 1.22925630e-03 1.43413235e-03
 1.43413235e-03 1.63900840e-03 1.63900840e-03 1.84388445e-03
 1.84388445e-03 2.04876050e-03 2.04876050e-03 2.25363655e-03
 2.25363655e-03 2.45851260e-03 2.45851260e-03 2.66338865e-03
 2.66338865e-03 2.86826470e-03 2.86826470e-03 3.07314075e-03
 3.07314075e-03 3.27801680e-03 3.27801680e-03 3.48289285e-03
 3.48289285e-03 3.68776890e-03 3.68776890e-03 3.89264495e-03
 3.89264495e-03 4.09752100e-03 4.09752100e-03 4.30239705e-03
 4.30239705e-03 4.50727310e-03 4.50727310e-03 4.91702520e-03
 4.91702520e-03 5.12190125e-03 5.12190125e-03 5.32677730e-03
 5.32677730e-03 5.73652940e-03 5.73652940e-03 5.94140545e-03
 5.94140545e-03 6.14628150e-03 6.14628150e-03 6.3511



start, end  0   17362
part 0 is finished ...
start, end  17362   34724
part 1 is finished ...
[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.44258511e-04
 1.44258511e-04 2.88517023e-04 2.88517023e-04 4.32775534e-04
 4.32775534e-04 5.77034045e-04 5.77034045e-04 7.21292556e-04
 7.21292556e-04 8.65551068e-04 8.65551068e-04 1.00980958e-03
 1.00980958e-03 1.15406809e-03 1.15406809e-03 1.29832660e-03
 1.29832660e-03 1.44258511e-03 1.44258511e-03 1.58684362e-03
 1.58684362e-03 1.87536065e-03 1.87536065e-03 2.01961916e-03
 2.01961916e-03 2.16387767e-03 2.16387767e-03 2.30813618e-03
 2.30813618e-03 2.45239469e-03 2.45239469e-03 2.59665320e-03
 2.59665320e-03 2.74091171e-03 2.74091171e-03 2.88517023e-03
 2.88517023e-03 3.02942874e-03 3.02942874e-03 3.17368725e-03
 3.17368725e-03 3.31794576e-03 3.31794576e-03 3.46220427e-03
 3.46220427e-03 3.60646278e-03 3.60646278e-03 3.89497980e-03
 3.89497980e-03 4.03923832e-03 4.03923832e-03 4.32775534e-03
 4.32775534e-03 4.47201385e-03 4.47201385e-03 4.6162

In [33]:
import sklearn 
import lightgbm
display(sklearn.__version__)
display(lightgbm.__version__)

'1.2.2'

'3.3.2'

In [34]:
## with stratifiedgroupkfold cv, [0.45, 0.45, 0.3, 0.45], use split catboost
aus_list

[0.9956354120021977,
 0.996720232763228,
 0.9945077170088674,
 0.9800606734556234,
 0.9931625417954453]

In [29]:
## with stratifiedgroupkfold cv
aus_list

[0.9957785735971774,
 0.996703413241591,
 0.9947435766125369,
 0.9798721129044952,
 0.993499609082789]

In [22]:
# experiment with all models, without VotingEnsemble, self-coded soft voting, original parameter setting, with numpy.float32
aus_list

[0.9960866053053645,
 0.9947028028602746,
 0.9967226890756303,
 0.9973018544992708,
 0.9422863445613096]

In [21]:
# experiment with all models, without VotingEnsemble, self-coded soft voting, original parameter setting
aus_list

[0.9960500129847929,
 0.99474659428989,
 0.9968172660368496,
 0.9973122442311666,
 0.9431892070420871]

In [21]:
# experiment with all models, VotingEnsemble n_jobs = 1, original parameters setting, changed weights
aus_list

[0.9960656208362207,
 0.9947012370165457,
 0.9968239469700924,
 0.9973352687124032,
 0.9428879048171079]

In [20]:
# experiment with only multinomialNB, original parameter setting
aus_list

[0.9633644054857161,
 0.9755928284357223,
 0.9867114150007829,
 0.9750459523745131,
 0.7752189335791227]

In [20]:
# experiment with only catboost, original parameter setting
aus_list

[0.9951818197240959,
 0.988915444438645,
 0.9920609113210502,
 0.9955969986936087,
 0.9605266936228868]

In [21]:
# experiment with only lightgbm 4, with cpu max bin 900 and max_depth 123
aus_list

[0.9978818736207736,
 0.9945432433843102,
 0.9961483897906989,
 0.9974760260852231,
 0.9665304445771498]



In [20]:
# experiment with only lightgbm 3
aus_list

[0.9633644054857161,
 0.9755928284357223,
 0.9867114150007829,
 0.9750459523745131,
 0.7752189335791227]

In [21]:
#experiment with only lightgbm2, depth 23
aus_list

[0.9980322623163044,
 0.994754319118952,
 0.9964657341197348,
 0.9976234663006696,
 0.9670913856801093]

In [20]:
#experiment with only lightgbm, max_bin 250, max-depth = 123
aus_list

[0.9980014642148245,
 0.9947916383944883,
 0.9964219426901196,
 0.9976602219351152,
 0.9670607907912099]

In [22]:
#experiment wiht only lightgbm, max_bin 200
aus_list

[0.9979935819888527,
 0.9949521895714809,
 0.9962090923325854,
 0.997664973018545,
 0.966284693481503]

In [21]:
#experiment without catboost
aus_list

[0.9974415651699572,
 0.9950194686570281,
 0.996574873427632,
 0.9978453575597517,
 0.9644704583375404]



In [23]:
aus_list

[0.9976213425623237,
 0.9957595907928388,
 0.9970352314838978,
 0.9979617852086347,
 0.9608195900848752]

In [19]:
## with external data
aus_list

[0.9948483859465849,
 0.9940963507093242,
 0.9947070055108838,
 0.9951455995069962,
 0.9753977531773758]

In [19]:
## without external data
aus_list

[0.9974293503297094,
 0.9950473406754007,
 0.9966961219270317,
 0.9978349678278557,
 0.9645059087292857]