In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold , cross_val_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score,recall_score
%matplotlib inline
import numpy as np
import pickle
import re
import json
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
import xgboost as xgb

In [None]:
df=pd.read_parquet('raw_train.parquet')
print(df.head())

In [3]:
import pandas as pd


categorical_features = ['feature3', 'feature5','feature6','feature8','feature9','feature10','feature14','feature15','feature16']

# Convert real number values to strings in categorical features
df[categorical_features] = df[categorical_features].astype(str)
print(df.dtypes)

feature1     object
feature2     object
feature3     object
feature4      int64
feature5     object
feature6     object
feature7      int64
feature8     object
feature9     object
feature10    object
feature11     int64
feature12     int64
feature13     int64
label         int64
feature14    object
feature15    object
feature16    object
dtype: object


In [None]:
label_counts = df['label'].value_counts()
print(label_counts)
# Plot biểu đồ cột
label_counts.plot(kind='bar')
plt.xlabel('Label')
plt.ylabel('Count')
plt.title('Label Count')
plt.show()

In [None]:
with open('features_config.json', 'r') as f:
    features_config = json.load(f)
features_config

In [None]:
X=df.drop('label',axis=1)
# for col in X.columns:
#     col_type = X[col].dtype
#     if col_type == 'object' or col_type.name == 'category':
#         X[col] = X[col].astype('category')
# X.columns = [re.sub(r'\W+', '', col) for col in X.columns]
Y=df['label']
print(X.dtypes)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

In [132]:
label_counts = Y_train.value_counts()
print(label_counts)

0    113145
1      7635
Name: label, dtype: int64


In [133]:
label_counts_test = Y_test.value_counts()
print(label_counts_test)

0    12559
1      862
Name: label, dtype: int64


In [None]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=0,scale_pos_weight=3)

xgb_model.fit(X_train, Y_train)


Y_pred_xgb = xgb_model.predict(X_test)
print("auc_score",roc_auc_score(Y_test,Y_pred_xgb))
print("f1_score",f1_score(Y_pred_xgb,Y_test))
print("recall_score",recall_score(Y_pred_xgb,Y_test))

auc_score 0.9787655629696972
f1_score 0.9583333333333333
recall_score 0.9561200923787528


In [None]:
filename = 'prob_1.ckpt'
pickle.dump(xgb_model, open(filename, 'wb'))

In [6]:
cat_features = list(range(0, X.shape[1]))
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [7]:
from catboost import CatBoostClassifier

clf = CatBoostClassifier(
    # random_seed=42,
    # learning_rate=0.1,
    # custom_loss=['AUC'],
    # scale_pos_weight=5
)

clf.fit(
    X_train, Y_train,
    cat_features=cat_features,
    eval_set=(X_test, Y_test),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f7be6766a70>

In [9]:
Y_pred_cb = clf.predict(X_test)
print("auc_score",roc_auc_score(Y_test,Y_pred_cb))
print("f1 socre", f1_score(Y_pred_cb,Y_test))
print("recall",recall_score(Y_pred_cb,Y_test))

auc_score 0.9576621548148886
f1 socre 0.9405469678953626
recall 0.9646341463414634


In [None]:
from lightgbm import LGBMClassifier
lgbmc_model=LGBMClassifier(random_state=0,scale_pos_weight=3)
lgbmc_model.fit(X_train, Y_train,categorical_feature = 'auto',eval_set=(X_val, Y_val),feature_name='auto', verbose=0)



Y_pred_lgbmc = lgbmc_model.predict(X_test)
print("auc_score",roc_auc_score(Y_test,Y_pred_lgbmc))
print('f1_score',f1_score(Y_test,Y_pred_lgbmc))
print("recall",recall_score(Y_test,Y_pred_lgbmc))

In [None]:
filename = 'prob_1.ckpt'
pickle.dump(lgbmc_model, open(filename, 'wb'))

In [138]:
from sklearn.base import BaseEstimator, TransformerMixin
class DropColumnTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns = []):
        self.columns = columns
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Perform arbitary transformation
        X = X.drop(columns=self.columns)
        return X

In [139]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), 
           ("scaler", StandardScaler()),
           ('pca', PCA())
           ]
)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        # ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, features_config['numeric_columns']),
        ("cat", categorical_transformer, features_config['category_columns']),
    ]
)

In [140]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", LGBMClassifier(random_state=0,scale_pos_weight=10))]
)

In [149]:

from sklearn.model_selection import GridSearchCV
param_grid = {
    'model__max_depth': [2, 3, 5, 7],
    'model__n_estimators': [10, 25, 50],       
}
grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='roc_auc')
grid_search.fit(X_train, Y_train)
best_model = grid_search.best_estimator_
Y_pred_lgbmc = best_model.predict(X_test)
print("auc_score",roc_auc_score(Y_test,Y_pred_lgbmc))
print('f1_score',f1_score(Y_test,Y_pred_lgbmc))
print("recall",recall_score(Y_test,Y_pred_lgbmc))

auc_score 0.9690005170952732
f1_score 0.8623757195185766
recall 0.9559164733178654


In [151]:
clf.fit(X_train, Y_train)
print(clf.get_params())
Y_pred_lgbmc = clf.predict(X_test)
print("auc_score",roc_auc_score(Y_test,Y_pred_lgbmc))
print('f1_score',f1_score(Y_test,Y_pred_lgbmc))
print("recall",recall_score(Y_test,Y_pred_lgbmc))

{'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler()),
                                                 ('pca', PCA())]),
                                 ['feature3', 'feature4', 'feature5',
                                  'feature6', 'feature7', 'feature8',
                                  'feature9', 'feature10', 'feature11',
                                  'feature12', 'feature13', 'feature14',
                                  'feature15', 'feature16']),
                                ('cat',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['feature2', 'feature1'])])), ('model', LGBMClassi

In [126]:
filename = 'prob_1.ckpt'
pickle.dump(clf, open(filename, 'wb'))