In [60]:
import pandas as pd
import spacy
import random
import joblib
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

nlp = spacy.load("en_core_web_sm")

# EDA

In [2]:
df = pd.read_csv('../data/dataset.csv', delimiter=';')
df.head()

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  17879 non-null  object
 1   fraudulent   17880 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 279.5+ KB


In [4]:
df['fraudulent'].value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [5]:
df['fraudulent'].value_counts(normalize=True)

fraudulent
0    0.951566
1    0.048434
Name: proportion, dtype: float64

## Thoughts and hypothesis

- Binary classification
- Very unbalanced! 95% are real vs 5% fake. Need to be careful when we assess quality of models + training the data (should I upsample the fake ones? Or downsample the good ones). 
- Also for testing, good to actually have a test set of 50-50.
- Accuracy is not a good measure.
- What could differientiate real vs fake ads? Special words (scam words), length of text, external links, more words in uppercase (so careful if preprocessing and putting all to lowercase)

## To Do

MVP
- Clean dataset: duplicates, null values
- Unbalanced dataset: think of ways to 'fix' this issue -> oversampling, undersampling
- Quick check of fraudulent ads to see if anything obvious (length of text, scam words, ...)
- Check if language is English for all, since we ll use english language to clean
- Create a baseline model before any preprocessing and feature engineering -> save model
- Create API
- Create docker image
- Test API

V2
- Remove common english words, punctuation, ...
- Possible feature engineering
- Train different models (simple to more complex). Use pretrained embeddings? BERT? Or 'simple' ones?
- Test not only using accuracy as metrics -> F1, recall, precision, ... Think of what's important in real life and use correct metric
- Redeploy best model to API

# Data cleaning

In [6]:
df[df['description'].isnull()]

Unnamed: 0,description,fraudulent
17513,,1


In [7]:
df.dropna(subset=['description'], inplace=True)

In [8]:
df[df.duplicated(subset=['description'], keep=False)].sort_values('description')

Unnamed: 0,description,fraudulent
14118,"""Pride is a personal commitment. It is an att...",0
4193,"""Pride is a personal commitment. It is an att...",0
13528,"""We take great care of our CarePartners so the...",0
12007,"""We take great care of our CarePartners so the...",0
11806,"""We take great care of our CarePartners so the...",0
...,...,...
4520,•Prepares source data for computer entry by co...,0
9992,• Answering incoming calls and securing ord...,0
9196,• Answering incoming calls and securing ord...,0
15441,"￼￼Create, maintain and adjust portfolio of ass...",0


In [9]:
df.drop_duplicates(subset=['description'], inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14801 entries, 0 to 17878
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  14801 non-null  object
 1   fraudulent   14801 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 346.9+ KB


In [11]:
df['fraudulent'].value_counts()

fraudulent
0    14170
1      631
Name: count, dtype: int64

In [12]:
df['fraudulent'].value_counts(normalize=True)

fraudulent
0    0.957368
1    0.042632
Name: proportion, dtype: float64

# Check of fraudulent ads vs real ads

In [13]:
fraud_df = df[df['fraudulent'] == 1]
random_row = random.randint(0, len(fraud_df) - 1)
fraud_df.iloc[random_row]['description']

'Position : Sr IT Solution Owner—SAPJob Type : Full TimeJob Location : United States-Washington–BellevueUs work status required : H1B ,EAD ,Green Card,US Citizens \xa0Qualifications:\xa05+ years of IT solution leadership experienceExperience working with SAP, Ariba and OpenText solutionsExperience with Source-to-Pay business operations and processesAbility to work independently and support multiple projects while meeting tight deadlinesProven ability to solve problems creatively and resourcefully.Excellent organizational and time-management skills.Proficient with MS Office Programs and aptitude for learning new software quicklyProven record of demonstrating proper judgment in managing difficult situations.Expert communicator who can lead and manage senior level relationships across organizational boundaries.Understanding of the way in which IT teams need to interact with business teams in order to make both successful.\xa0 Experience in developing senior-level relationships within the 

In [17]:
vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')  
X = vectorizer.fit_transform(fraud_df['description'].dropna())
ngram_counts = X.sum(axis=0)  # sum along rows (documents)
ngram_freq = [(ngram, ngram_counts[0, idx]) for ngram, idx in vectorizer.vocabulary_.items()]
ngram_freq = sorted(ngram_freq, key=lambda x: x[1], reverse=True)
ngram_freq_df = pd.DataFrame(ngram_freq, columns=["ngram", "count"])
ngram_freq_df.head(20)

Unnamed: 0,ngram,count
0,customer service,127
1,oil gas,115
2,gas industry,94
3,oil gas industry,88
4,aker solutions,87
5,data entry,79
6,experience required,56
7,communication skills,55
8,team members,46
9,years experience,46


In [15]:
X = vectorizer.fit_transform(df[df['fraudulent']==0]['description'].dropna())
ngram_counts = X.sum(axis=0)
ngram_freq = [(ngram, ngram_counts[0, idx]) for ngram, idx in vectorizer.vocabulary_.items()]
ngram_freq = sorted(ngram_freq, key=lambda x: x[1], reverse=True)
ngram_freq_df = pd.DataFrame(ngram_freq, columns=["ngram", "count"])
ngram_freq_df.head(20)

Unnamed: 0,ngram,count
0,customer service,2070
1,social media,1385
2,join team,975
3,team members,945
4,fast paced,891
5,ideal candidate,884
6,communication skills,808
7,responsibilities include,775
8,day day,772
9,fast growing,750


### Quick notes
- Not all fraudulent look obvious, but some do
- Words used: free time, cash, today, no experience needed, from home, day/daily, extra, urgent, anytime, easy, easily
- Look at bigrams/trigrams, can see difference. In fake, lot of focus on specific sector/industry (oil, gas, ...). Real more focused on team, work together, ...

### Examples

- 'Cash In Hand Job (Urgent Staff Required)No Experience Required And Never Any Fees.Work Anytime 1 To 2 Hrs Daily In Free Time.Earn Easily $400 To $500 Extra Per Day.Totally Free To Join &amp; Suitable For All.Take Action &amp; Get Started Today.Please contact us.'
- We are looking for inbound call representativesWe provide complete training...We pay Daily!$20.00 to $200.00 plus a dayGive us a call TODAY
- 'customer service reps needed asap\xa0'
- 'We have several openings available in this area earning $1000.00-$2500.00 per week.\xa0We are seeking only honest, self-motivated people with a desire to work in the home typing and data entry field, from the comfort of their own homes.The preferred applicants should be at least 18 years old with Internet access. No experience is needed. However the following skills are desirable: \xa0'
- 'Student Positions Part-Time and Full-Time.You can do it all from home, in your free time, at your own place.Spend 30 minutes or 1 hours a day &amp; Get biggest cash.You can work in the morning, afternoon, or at night.Perfect for everyone then start immediately.Can earn $350 to $450 extra per day.No any experience required.Zero start-up fee, Visit here:-#URL_7ebe37f71633be1b80547d6f213cb0075a63b6ced35281bfa5c067b5c685f04c#-rg.info'

In [100]:
real_df = df[df['fraudulent'] == 0]
random_row = random.randint(0, len(real_df) - 1)
real_df.iloc[random_row]['description']

"InVision\xa0is the world’s leading design collaboration platform.We enable companies of all sizes to discover the power of design-driven product development. \xa0That’s why designers, product managers, marketers and other stakeholders at so many of the\xa0world's most loved designers, agencies, &amp; corporations\xa0use InVision every day,\xa0including Zappos, Evernote, AirBnB, and Yammer.Built to foster collaboration and iteration, InVision helps our clients design, review and user-test a product before writing a single line of code, with tools for design prototyping, feedback, task management and version control.We’re well-funded and venture-backed by prominent investors including FirstMark Capital and Tiger Global Management.We're looking for an Online Marketing Manager with experience in a freemium SaaS company to manage all of our online user acquisition efforts."

In [101]:
real_df['description'].str.split().str.len().mean()

178.94015525758644

In [102]:
real_df['description'].str.split().explode().value_counts().head(10)

description
and     146656
to       80037
the      75876
of       56884
a        52357
in       42808
for      37356
with     33846
is       24234
our      23509
Name: count, dtype: int64

In [103]:
# Check for average number of words
fraud_df['description'].str.split().str.len().mean()

NameError: name 'fraud_df' is not defined

In [None]:
# Check for most common words
fraud_df['description'].str.split().explode().value_counts().head(10)

description
and     6348
to      3017
the     3012
of      2153
a       1706
in      1657
for     1488
with    1295
is       824
are      702
Name: count, dtype: int64

# MVP

In [18]:
# No preprocessing at all
X = df['description']
y = df['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_vectorized, y_train)

y_pred = model.predict(X_test_vectorized)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.89      0.93      4252
           1       0.21      0.70      0.33       189

    accuracy                           0.88      4441
   macro avg       0.60      0.79      0.63      4441
weighted avg       0.95      0.88      0.91      4441



In [None]:
# Save the baseline model
# joblib.dump(vectorizer, '../models/baseline_vectorizer.pkl')
# joblib.dump(model, '../models/baseline_model.pkl')

# V2

In [19]:
def preprocess_text(text: str) -> str:
    text = str(text).lower()
    text = text.replace('.', '. ')
    text = text.replace('no experience', 'zero experience')
    doc = nlp(text)
    cleaned_tokens = [
        token.lemma_.lower() 
        for token in doc 
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    cleaned_text = ' '.join(cleaned_tokens)
    
    return cleaned_text

In [22]:
def calculate_uppercase_ratio(text: str) -> float:
    text = str(text)
    words = text.split()
    if words:
        uppercase_words = sum(1 for word in words if word.isupper() and re.search('[A-Z]', word))
        ratio = uppercase_words / len(words)
    else:
        ratio = 0
    return ratio

def count_uppercase_words(text: str) -> int:
    text = str(text)
    words = text.split()
    if words:
        uppercase_words = sum(1 for word in words if word.isupper() and re.search('[A-Z]', word))
    else:
        uppercase_words = 0
    return uppercase_words

def calculate_digit_ratio(text: str) -> float:
    text = str(text)
    if text:
        num_digits = sum(c.isdigit() for c in text)
        digit_ratio = num_digits / len(text)
    else:
        digit_ratio = 0
    return digit_ratio

def calculate_special_char_ratio(text) -> float:
    text = str(text)
    if text:
        special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
        special_char_ratio = special_chars / len(text)
    else:
        special_char_ratio = 0
    return special_char_ratio

def count_scam_words(text: str) -> int:
    text = str(text)
    scam_words = [
        'free time', 'cash', 'today', 'no experience', 'zero experience', 
        'from home', 'day', 'daily', 'extra', 'urgent', 'anytime', 'easy', 'easily']
    n_scam_words = sum(text.lower().count(word) for word in scam_words)
    return n_scam_words

In [23]:
# Feature engineering before any preprocessing
df['word_count'] = df['description'].str.split().str.len()
df['char_count'] = df['description'].str.len()
df['uppercase_ratio'] = df['description'].apply(calculate_uppercase_ratio)
df['n_uppercase_words'] = df['description'].apply(count_uppercase_words)
df['digit_ratio'] = df['description'].apply(calculate_digit_ratio)
df['special_char_ratio'] = df['description'].apply(calculate_special_char_ratio)
df['scam_words'] = df['description'].apply(count_scam_words)

In [24]:
df.head()

Unnamed: 0,description,fraudulent,word_count,char_count,uppercase_ratio,n_uppercase_words,digit_ratio,special_char_ratio,scam_words
0,"Food52, a fast-growing, James Beard Award-winn...",0,124,905,0.008065,1,0.00663,0.026519,2
1,Organised - Focused - Vibrant - Awesome!Do you...,0,315,2077,0.003175,1,0.031295,0.026481,1
2,"Our client, located in Houston, is actively se...",0,50,355,0.02,1,0.0,0.019718,0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0,346,2600,0.020231,7,0.001154,0.015385,2
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0,168,1520,0.059524,10,0.0,0.025,1


In [25]:
df[df['fraudulent'] == 1].describe()

Unnamed: 0,fraudulent,word_count,char_count,uppercase_ratio,n_uppercase_words,digit_ratio,special_char_ratio,scam_words
count,631.0,631.0,631.0,631.0,631.0,631.0,631.0,631.0
mean,1.0,164.637084,1218.326466,0.021908,3.671949,0.026666,0.028006,0.998415
std,0.0,142.7905,1057.00981,0.043302,8.467219,0.045871,0.013188,1.895273
min,1.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,65.0,447.0,0.0,0.0,0.0,0.020104,0.0
50%,1.0,122.0,888.0,0.008929,1.0,0.003568,0.025522,0.0
75%,1.0,227.5,1660.5,0.026228,4.0,0.033791,0.032633,1.0
max,1.0,1183.0,8578.0,0.441176,127.0,0.270492,0.090909,11.0


In [26]:
df[df['fraudulent'] == 0].describe()

Unnamed: 0,fraudulent,word_count,char_count,uppercase_ratio,n_uppercase_words,digit_ratio,special_char_ratio,scam_words
count,14170.0,14170.0,14170.0,14170.0,14170.0,14170.0,14170.0,14170.0
mean,0.0,178.940155,1272.565632,0.016362,2.537756,0.012251,0.024554,0.665914
std,0.0,123.517973,905.621575,0.03142,3.866302,0.028011,0.012036,1.288407
min,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,96.0,671.0,0.0,0.0,0.0,0.018101,0.0
50%,0.0,153.0,1086.0,0.008817,1.0,0.001177,0.022901,0.0
75%,0.0,233.0,1645.0,0.020747,3.0,0.008475,0.02895,1.0
max,0.0,2115.0,14907.0,1.0,113.0,0.404959,0.333333,19.0


In [27]:
df['description_cleaned'] = df['description'].apply(preprocess_text)

In [28]:
df_preprocessed = df.copy()
df_preprocessed.head()

Unnamed: 0,description,fraudulent,word_count,char_count,uppercase_ratio,n_uppercase_words,digit_ratio,special_char_ratio,scam_words,description_cleaned
0,"Food52, a fast-growing, James Beard Award-winn...",0,124,905,0.008065,1,0.00663,0.026519,2,fast grow james beard award win online food co...
1,Organised - Focused - Vibrant - Awesome!Do you...,0,315,2077,0.003175,1,0.031295,0.026481,1,organise focus vibrant passion customer servic...
2,"Our client, located in Houston, is actively se...",0,50,355,0.02,1,0.0,0.019718,0,client locate houston actively seek experience...
3,THE COMPANY: ESRI – Environmental Systems Rese...,0,346,2600,0.020231,7,0.001154,0.015385,2,company esri environmental system research ins...
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0,168,1520,0.059524,10,0.0,0.025,1,job title itemization review managerlocation f...


## Models training

### Logistic Regression

In [30]:
# Only description_cleaned and fraudulent columns and ngram (1,2)
X = df_preprocessed['description_cleaned']
y = df_preprocessed['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1,2)
    )
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.89      0.93      4252
           1       0.22      0.72      0.34       189

    accuracy                           0.88      4441
   macro avg       0.60      0.80      0.64      4441
weighted avg       0.95      0.88      0.91      4441



In [31]:
# Only description_cleaned and fraudulent columns and ngram (2,3)
X = df_preprocessed['description_cleaned']
y = df_preprocessed['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(2,3)
    )
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92      4252
           1       0.18      0.64      0.28       189

    accuracy                           0.86      4441
   macro avg       0.58      0.76      0.60      4441
weighted avg       0.95      0.86      0.90      4441



Using ngram(2,3) doesn't seem to bring better results

In [50]:
# All features
text_features = ['description_cleaned'] 
numerical_features = ['word_count', 'char_count', 'uppercase_ratio', 'digit_ratio', 'special_char_ratio', 'scam_words', 'n_uppercase_words']

X = df_preprocessed[text_features + numerical_features]
y = df_preprocessed['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1,2)
)
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('text', vectorizer, 'description_cleaned'),
        ('num', numerical_transformer, numerical_features)
    ],
    remainder='passthrough'
)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', random_state=42)) 
])
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.99      0.89      0.94      4252
           1       0.24      0.74      0.36       189

    accuracy                           0.89      4441
   macro avg       0.61      0.82      0.65      4441
weighted avg       0.96      0.89      0.91      4441



### Random Forest

In [52]:
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4252
           1       1.00      0.21      0.35       189

    accuracy                           0.97      4441
   macro avg       0.98      0.61      0.67      4441
weighted avg       0.97      0.97      0.96      4441



### XGB

In [53]:
# Simple xgboost model
xgb_model = XGBClassifier(
    scale_pos_weight=19,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4252
           1       0.67      0.48      0.56       189

    accuracy                           0.97      4441
   macro avg       0.82      0.73      0.77      4441
weighted avg       0.96      0.97      0.96      4441



In [56]:
# With SMOTE
xgb_model = XGBClassifier(
    scale_pos_weight=19,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb_model)
])
model_pipeline.fit(X_train, y_train)

y_proba = model_pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.3).astype(int)
print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.98      0.93      0.96      4252
           1       0.30      0.62      0.40       189

    accuracy                           0.92      4441
   macro avg       0.64      0.78      0.68      4441
weighted avg       0.95      0.92      0.93      4441



### LightGBM

In [54]:
# Without SMOTE
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    #('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(
        is_unbalance=True,
        random_state=42
    ))
])
model_pipeline.fit(X_train, y_train)

y_proba = model_pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.3).astype(int)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 442, number of negative: 9918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149716
[LightGBM] [Info] Number of data points in the train set: 10360, number of used features: 1007
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042664 -> initscore=-3.110797
[LightGBM] [Info] Start training from score -3.110797
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      4252
           1       0.39      0.58      0.46       189

    accuracy                           0.94      4441
   macro avg       0.68      0.77      0.72      4441
weighted avg       0.96      0.94      0.95      4441





In [55]:
# With SMOTE
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(
        is_unbalance=True,
        random_state=42
    ))
])
model_pipeline.fit(X_train, y_train)

y_proba = model_pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.3).astype(int)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 9918, number of negative: 9918
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062358 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 220761
[LightGBM] [Info] Number of data points in the train set: 19836, number of used features: 1007
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4252
           1       0.60      0.51      0.55       189

    accuracy                           0.96      4441
   macro avg       0.79      0.75      0.77      4441
weighted avg       0.96      0.96      0.96      4441





In [57]:
y_proba = model_pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.4).astype(int)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97      4252
           1       0.34      0.58      0.43       189

    accuracy                           0.93      4441
   macro avg       0.66      0.76      0.70      4441
weighted avg       0.95      0.93      0.94      4441



In [59]:
# SMOTE + RandomUnderSampler
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('undersample', RandomUnderSampler(sampling_strategy=0.8, random_state=42)), 
    ('smote', SMOTE(random_state=42)), 
    ('classifier', LGBMClassifier(is_unbalance=True, random_state=42, n_jobs=-1))
])
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 552, number of negative: 552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23088
[LightGBM] [Info] Number of data points in the train set: 1104, number of used features: 867
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.99      0.82      0.90      4252
           1       0.16      0.76      0.26       189

    accuracy                           0.82      4441
   macro avg       0.57      0.79      0.58      4441
weighted avg       0.95      0.82      0.87      4441





Not better with undersampler. Only keep SMOTE

- Best model so far -> LightGBM + SMOTE
- Will do random search to find best param

In [61]:
# All features
text_features = 'description_cleaned'
numerical_features = ['char_count', 'uppercase_ratio', 'digit_ratio', 'special_char_ratio', 'scam_words', 'n_uppercase_words']
target = 'fraudulent'

X = df_preprocessed[[text_features] + numerical_features]
y = df_preprocessed[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

text_transformer = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1,2), min_df=5)
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)), 
    ('classifier', LGBMClassifier(is_unbalance=True, random_state=42, n_jobs=-1))
])

grid = {
    'classifier__num_leaves': [15, 50, 100],
    'classifier__max_depth': [-1, 10],
    'classifier__learning_rate': [0.01, 0.2, 0.5],
    'classifier__n_estimators': [100, 500, 1000],
    'classifier__min_child_samples': [10, 30, 50],
    'classifier__subsample': [0.6, 1.0, 1.5],
    'classifier__colsample_bytree': [0.3, 0.6, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=grid,
    n_iter=30,
    scoring='f1',
    n_jobs=-1,
    cv=cv,
    verbose=2,
    random_state=42
)
search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=1.5; total time=   2.8s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.223385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 283334
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 2006
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.244317 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=1.5; total time=   3.5s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.332353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281910
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 2006
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.284791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285616
[LightGBM] [Info] Number of data points in the train set: 15870

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=1.5; total time=   7.3s
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=1.5; total time=   6.9s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   6.8s



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   8.6s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   8.6s



[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .





[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   9.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .




















[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=  10.6s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[LightGBM] [Info] Number of positive: 7934, number of negative: 7934

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.402518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 283250
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.569068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285017
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1995
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000



































[LightGBM] [Info]









[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=0.6; total time=  59.4s
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=0.6; total time=  59.4s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=0.6; total time=  59.8s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=100, classifier__subsample=0.6; total time=  59.9s
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.609547 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282708
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1993
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-th



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time= 1.1min




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time= 1.1min




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time= 1.1min
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.440004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 282483
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1978
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.468722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.5; total time=  13.2s




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  55.1s




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  53.0s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  53.1s
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  53.1s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  53.9s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.5; total time=   5.2s
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.5; total time=   4.5s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.3s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.3s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.5s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.3s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=1.5; total time=   3.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=1.5; total time=   3.3s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=1.5; total time=   3.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=1.5; total time=   3.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=1.5; total time=   3.2s
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.387921 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 283035
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1981
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.335691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tot



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  54.8s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  55.5s





[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 284831
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1982
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.533188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281618
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1982
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000








































[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.1min

[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.0; total time= 1.2min




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.0; total time= 1.2min







[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.0; total time= 1.2min








[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.0; total time= 1.2min
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=15, classifier__subsample=1.0; total time= 1.3min




[LightGBM] [Info] Number of positive: 7935, number of negative: 7935


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.284549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 282483
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1978
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.337093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285321
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1981
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.3min

[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.2min






[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   8.6s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.418534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285616
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 2006
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=  10.4s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .








[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  55.3s
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  55.3s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   8.5s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.0min




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  56.4s








[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.0min


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   5.0s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   4.4s




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  57.4s
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.5s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.3s
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.3s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.3s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.5; total time=   3.2s
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.282849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 283334
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 2006
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Number of positive: 7935, number 



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  14.7s




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  15.1s
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  15.0s




[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  15.1s
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  14.9s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  15.9s
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.458281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281618
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1982
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  17.3s
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.632400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282483
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1978
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.515426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=0.6; total time=  51.2s
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.451799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281828
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1997
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  13.3s




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  12.7s
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=0.6; total time=  12.7s
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.469881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 282708
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1993
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [In







[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time=  41.9s
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time=  42.8s




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=0.6; total time=  33.5s




[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=0.6; total time=  34.8s




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.414281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 282818
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 2005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=0.6; total time=  35.1s
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.487733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time=  44.6s
[LightGBM] [Info] Total Bins 285616
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 2006
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.495116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 283250
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.5, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time=  43.8s




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.594836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285017
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1995
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.526186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281828
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1997
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negat

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=  12.1s




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=0.6; total time=  29.5s




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=0.6; total time=  32.7s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   8.0s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   5.9s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   5.9s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   6.9s
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.572067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 283035
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1981
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.512355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bi



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.3min




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.4min
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.4min




[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.530204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282483
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1978
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.555458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285321
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1981




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 2.6min
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 2.6min




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 2.6min




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 2.6min
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.441546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 284831
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1982
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.524307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total B



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 2.8min




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.2min
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.569858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 283250
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=50, classifier__subsample=1.0; total time= 1.2min
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.457900 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285017
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1995
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.593071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time=  54.4s

[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.587028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282708
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1993
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time=  54.9s








[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time=  54.5s
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.466059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285508
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1994
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.527365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 283334
[LightGBM] [Info] Number of data points in the train s



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 2.3min


[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.625854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285106
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 2005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time=  52.2s

[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time=  51.8s




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.611036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281910
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 2006
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 2.3min




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 2.4min
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.457922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 282818
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 2005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 2.4min




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.591144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285616
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 2006
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.01, classifier__max_depth=-1, classifier__min_child_samples=50, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 2.4min




[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   7.2s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   6.7s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .





[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   5.8s
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   5.8s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.5; total time=   5.9s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.433480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285017
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.416885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1995
[LightGBM] [Info] Total Bins 283250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=0.6, classifier_



[LightGBM] [Info] Number of positive: 7934, number of negative: 7934

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.658867 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281828
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1997
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
You can set `force_col_wise=true` to remove the overhead.

[LightGBM] [Info] Total Bins 282708
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1993
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimato







[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=0.6; total time=  23.2s






[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=0.6; total time=  23.5s








[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.525390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285508
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1994
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.519340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] T



[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.626124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285017
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1995
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=0.6; total time=  28.0s






[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.640121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281828
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1997
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=50, classifier__subsample=0.6; total time=  24.5s




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.645530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 282708
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1993
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 1.4min











[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 1.3min
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.573874 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285508
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1994
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.649094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug



[CV] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=10, classifier__n_estimators=1000, classifier__num_leaves=50, classifier__subsample=0.6; total time= 1.3min
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.555667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285106
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 2005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.531738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 281910
[LightGBM] [Info] Number of data points in the train s



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.0; total time=  33.5s




[LightGBM] [Info] Number of positive: 7935, number of negative: 7935




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.0; total time=  32.6s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.510531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282818
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 2005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.581034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285



[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.0; total time=  33.5s
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.0; total time=  24.7s




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 1.6min








[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 1.6min
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=10, classifier__n_estimators=500, classifier__num_leaves=15, classifier__subsample=1.0; total time=  31.0s
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.519429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285017
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1995
[LightGBM] [Info] [binary:Boo



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.551484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285508
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1994
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.515559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 283250
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, t




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.0; total time=  19.3s








[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.0; total time=  18.6s
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=500, classifier__num_leaves=100, classifier__subsample=0.6; total time= 1.7min
[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.602210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285017
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1995
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifie



[LightGBM] [Info] Number of positive: 7934, number of negative: 7934
[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.560901 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281828
[LightGBM] [Info] Number of data points in the train set: 15868, number of used features: 1997
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   3.4s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[LightGBM] [Info] Number of positive: 7935, number of negative: 7935
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.0; total time=  18.3s




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.582718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282708
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1993
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.542171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285508
[LightGBM] [Info] Number of data points in the train set: 15870, number of used features: 1994
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.2, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.0; total time=  18.8s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   3.7s
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.0; total time=  13.9s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   3.3s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .



[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   3.3s




[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.0; total time=   9.9s


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

50 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/work/miniconda3/envs/datawizard/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/work/miniconda3/envs/datawizard/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/work/miniconda3/envs/datawizard/lib/python3.10/site-pack

[CV] END classifier__colsample_bytree=1.0, classifier__learning_rate=0.2, classifier__max_depth=-1, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=100, classifier__subsample=1.5; total time=   2.6s
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.0; total time=   8.6s
[CV] END classifier__colsample_bytree=0.3, classifier__learning_rate=0.5, classifier__max_depth=10, classifier__min_child_samples=30, classifier__n_estimators=100, classifier__num_leaves=15, classifier__subsample=1.0; total time=   8.4s
[LightGBM] [Info] Number of positive: 9918, number of negative: 9918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 316001
[LightGBM] [Info] Num

In [62]:
y_proba = search.best_estimator_.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.4).astype(int)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      4252
           1       0.89      0.41      0.56       189

    accuracy                           0.97      4441
   macro avg       0.93      0.71      0.77      4441
weighted avg       0.97      0.97      0.97      4441





In [63]:
search.best_params_

{'classifier__subsample': 0.6,
 'classifier__num_leaves': 50,
 'classifier__n_estimators': 1000,
 'classifier__min_child_samples': 10,
 'classifier__max_depth': -1,
 'classifier__learning_rate': 0.2,
 'classifier__colsample_bytree': 0.6}

In [67]:
best_model = search.best_estimator_
joblib.dump(best_model, '../models/lgbm_model.pkl')

['../models/lgbm_model.pkl']

In [68]:
# Test lower threshold
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.3).astype(int)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      4252
           1       0.87      0.42      0.57       189

    accuracy                           0.97      4441
   macro avg       0.92      0.71      0.78      4441
weighted avg       0.97      0.97      0.97      4441





- Tried glove but didn't reach better results.
- Also tried undersampling (of most frequent class) but no better results.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Features
text_features = 'description_cleaned'
numerical_features = ['char_count', 'uppercase_ratio', 'digit_ratio', 'special_char_ratio', 'scam_words', 'n_uppercase_words']
target = 'fraudulent'

# Prepare data
X = df_preprocessed[[text_features] + numerical_features]
y = df_preprocessed[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)