In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, CategoricalNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as M
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
import pickle

In [2]:
pred_metrics = {
    'precision': M.precision_score,
    'auc': M.roc_auc_score,
    'accuracy': M.accuracy_score,
    'recall': M.recall_score,
    'f1_score': M.f1_score,
    'logloss': M.log_loss,
    'mse': M.mean_squared_error,
    'mae': M.mean_absolute_error
    }

In [3]:
task = 'click'
data_pth = f'/root/data/xingmei/Sharechat-RecSys-Challenge-23/data/{task}_trn_val.csv'
df = pd.read_csv(data_pth, sep='\t')

In [4]:
df.sample(5)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked
2697944,55,20095,563,31686,25604,590,27941,19203,23218,19343,...,0.0,2.284486,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0
2021894,63,25976,22294,7696,21545,17507,27941,21218,869,19343,...,0.0,0.571121,0.571121,0.115692,1.156922,0.269948,0.0,0.0,0.0,0
59880,48,20095,563,31686,21545,590,27941,21218,23218,22970,...,0.0,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0
2325899,58,26325,22294,4896,19072,26293,27941,21218,31372,19343,...,0.0,0.571121,0.571121,0.077128,1.156922,0.269948,0.0,0.0,0.0,0
2581078,56,11077,7152,18575,21545,20137,27941,21621,869,19343,...,0.0,0.0,0.0,0.115692,1.156922,0.269948,0.0,0.0,0.0,0


In [5]:
for i in ['f_30', 'f_31']:
    df[i] = df[i].fillna(df[i].mode().iloc[0])
for i in ['f_43', 'f_51', 'f_58', 'f_59', 'f_64', 'f_65', 'f_66', 'f_67', 'f_68', 'f_69', 'f_70']:
    df[i] = df[i].fillna(df[i].mean())

In [6]:
X_train = df.iloc[:1016364, :-1].to_numpy()
y_train = df.iloc[:1016364, -1].to_numpy()
X_test = df.iloc[1016364:, :-1].to_numpy()
y_test = df.iloc[1016364:, -1].to_numpy()

In [8]:
y_test.mean()

0.2216078798520179

In [6]:
MODELS = [
    # LogisticRegression(random_state=2023, multi_class='ovr'),
    # MLPClassifier(max_iter=100, early_stopping=True, random_state=2022, hidden_layer_sizes=(20,20,20)),
    # KNeighborsClassifier(n_neighbors=3),
    # CategoricalNB(),
    GaussianNB(),                               # All as continuous features
    DecisionTreeClassifier(random_state=2023),  # All as continuous features
    XGBClassifier(n_estimators=100, max_depth=5, learning=0.1, silent=False, objective='binary:logitraw')
]

## Make use of all features

In [7]:
for model in MODELS:
    model.fit(X_train, y_train)
    y_hat_prob = model.predict_proba(np.array(X_test))[:, 1]
    y_pred = model.predict(np.array(X_test))
    results = {}
    for k, f in pred_metrics.items():
        if k != 'auc':
            results[k] = round(f(y_test, y_pred), 2)
        else:
            results[k] = round(f(y_test, y_hat_prob), 2)
    print(str(type(model)) + '\n' + str(results).strip('{}'))

<class 'sklearn.naive_bayes.GaussianNB'>
'precision': 0.26, 'auc': 0.6, 'accuracy': 0.78, 'recall': 0.01, 'f1_score': 0.01, 'logloss': 8.08, 'mse': 0.22, 'mae': 0.22
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
'precision': 0.51, 'auc': 0.7, 'accuracy': 0.78, 'recall': 0.55, 'f1_score': 0.53, 'logloss': 7.83, 'mse': 0.22, 'mae': 0.22
Parameters: { "learning", "silent" } are not used.

<class 'xgboost.sklearn.XGBClassifier'>
'precision': 0.94, 'auc': 0.85, 'accuracy': 0.87, 'recall': 0.44, 'f1_score': 0.6, 'logloss': 4.73, 'mse': 0.13, 'mae': 0.13


## Make use of categorical features

In [12]:
cat_X_train = X_train[:, :41]   # Date(f_1), Categorical features(f_2 to f_32), Binary features(f_33 to f_41)
cat_X_test = X_test[:, :41]

print('Only use categorical features')
for model in MODELS:
    model.fit(cat_X_train, y_train)
    y_hat_prob = model.predict_proba(np.array(cat_X_test))[:, 1]
    y_pred = model.predict(np.array(cat_X_test))
    results = {}
    for k, f in pred_metrics.items():
        if k != 'auc':
            results[k] = round(f(y_test, y_pred), 2)
        else:
            results[k] = round(f(y_test, y_hat_prob), 2)
    print(str(type(model)) + '\n' + str(results).strip('{}'))

Only use categorical features
<class 'sklearn.naive_bayes.GaussianNB'>
'precision': 1.0, 'auc': 0.64, 'accuracy': 0.78, 'recall': 0.0, 'f1_score': 0.0, 'logloss': 7.98, 'mse': 0.22, 'mae': 0.22
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
'precision': 0.51, 'auc': 0.7, 'accuracy': 0.78, 'recall': 0.56, 'f1_score': 0.53, 'logloss': 7.86, 'mse': 0.22, 'mae': 0.22
Parameters: { "learning", "silent" } are not used.

<class 'xgboost.sklearn.XGBClassifier'>
'precision': 0.94, 'auc': 0.85, 'accuracy': 0.87, 'recall': 0.44, 'f1_score': 0.6, 'logloss': 4.72, 'mse': 0.13, 'mae': 0.13


## Make use of numerical features

In [14]:
con_X_train = X_train[:, 41:]   # Numerical features(f_42 to f_79)
con_X_test = X_test[:, 41:]

print('Only use numerical features')
for model in MODELS:
    model.fit(con_X_train, y_train)
    y_hat_prob = model.predict_proba(np.array(con_X_test))[:, 1]
    y_pred = model.predict(np.array(con_X_test))
    results = {}
    for k, f in pred_metrics.items():
        if k != 'auc':
            results[k] = round(f(y_test, y_pred), 2)
        else:
            results[k] = round(f(y_test, y_hat_prob), 2)
    print(str(type(model)) + '\n' + str(results).strip('{}'))

Only use numerical features
<class 'sklearn.naive_bayes.GaussianNB'>
'precision': 0.26, 'auc': 0.58, 'accuracy': 0.78, 'recall': 0.01, 'f1_score': 0.01, 'logloss': 8.08, 'mse': 0.22, 'mae': 0.22
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
'precision': 0.44, 'auc': 0.64, 'accuracy': 0.75, 'recall': 0.45, 'f1_score': 0.45, 'logloss': 8.98, 'mse': 0.25, 'mae': 0.25
Parameters: { "learning", "silent" } are not used.

<class 'xgboost.sklearn.XGBClassifier'>
'precision': 0.91, 'auc': 0.8, 'accuracy': 0.83, 'recall': 0.28, 'f1_score': 0.43, 'logloss': 5.97, 'mse': 0.17, 'mae': 0.17


## Bayes Methods for Categorical features

In [9]:
BAYES_MODELS = [
    BernoulliNB(),
    CategoricalNB()
]
cat_X_train = X_train[:, :41]   # Date(f_1), Categorical features(f_2 to f_32), Binary features(f_33 to f_41)
cat_X_test = X_test[:, :41]
for i, col in zip(range(cat_X_test.shape[1]), df.columns):
    tst_ooc = not set(np.unique(cat_X_test[:, i])).issubset(set(np.unique(cat_X_train[:, i])))
    if tst_ooc:
        cat_X_train[:, i] = 0
        cat_X_test[:, i] = 0

print('Only use categorical features')
for model in BAYES_MODELS:
    model.fit(cat_X_train, y_train)
    # y_hat_prob = model.predict_proba(np.array(cat_X_test))[:, 1]
    y_pred = model.predict(np.array(cat_X_test))
    results = {}
    for k, f in pred_metrics.items():
        if k != 'auc':
            results[k] = round(f(y_test, y_pred), 2)
        else:
            pass
            # results[k] = round(f(y_test, y_hat_prob), 2)
    print(str(type(model)) + '\n' + str(results).strip('{}'))

Only use categorical features
<class 'sklearn.naive_bayes.BernoulliNB'>
'precision': 0.45, 'accuracy': 0.78, 'recall': 0.02, 'f1_score': 0.04, 'logloss': 8.03, 'mse': 0.22, 'mae': 0.22
<class 'sklearn.naive_bayes.CategoricalNB'>
'precision': 0.74, 'accuracy': 0.81, 'recall': 0.2, 'f1_score': 0.32, 'logloss': 6.95, 'mse': 0.19, 'mae': 0.19


: 