In [None]:
import numpy as np
import pandas as pd
import pickle
import os
import lightgbm as lgbm
from sklearn.metrics import roc_auc_score
from tools import make_submission

In [None]:
# --- Inputs -----------------------------------------------------------------
# Pickled bundle that the loading cell opens and indexes by split name.
DATA_PATH = "../data/data_smart.pkl"
# Raw unlabeled CSV; handed to make_submission together with the predictions.
TEST_PATH = "../data/UnlabeledWiDS2021.csv"

# --- Outputs ----------------------------------------------------------------
# Folder and file name that are joined into the submission path.
SUBMISSION_PATH = "../submissions"
SUB_FILE = "lgbm_rf_smart.csv"

## Loading data

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point DATA_PATH at a file produced by a trusted pipeline.
with open(DATA_PATH, "rb") as pickle_file:
    data = pickle.load(pickle_file)

# Entries used for the final fit (x/y_train_val) and for prediction (x_test).
x_train_val, y_train_val, x_test = (
    data[key] for key in ("x_train_val", "y_train_val", "x_test")
)

# Entries used for the early-stopped fit and for its evaluation set.
x_train, y_train, x_val, y_val = (
    data[key] for key in ("x_train", "y_train", "x_val", "y_val")
)

In [None]:
# Single (features, labels) pair passed as `eval_set` to the first model.fit call.
validation_pair = (x_val, y_val)
eval_set = [validation_pair]

In [None]:
# Native LightGBM Dataset wrappers for each split.
# NOTE(review): the cells visible below train through the scikit-learn wrapper
# and never read these objects -- confirm whether they are still needed.
lgbm_train = lgbm.Dataset(x_train, label=y_train)
lgbm_val = lgbm.Dataset(x_val, label=y_val)

lgbm_train_val = lgbm.Dataset(x_train_val, label=y_train_val)

## Training on the train/validation split

In [None]:
# Random-forest-mode LightGBM classifier fitted on the training split only.
# `subsample` together with `bagging_freq` switches on the row bagging that
# LightGBM's "rf" boosting type requires.
model = lgbm.LGBMClassifier(
    boosting_type="rf",
    num_leaves=50,
    n_estimators=1000,  # upper bound; early stopping below may end sooner
    max_depth=20,
    subsample=0.1,
    reg_alpha=0.54,
    reg_lambda=0.4,
    min_split_gain=0.7,
    min_child_weight=40,
    objective="binary",
    is_unbalance=True,
    bagging_freq=1,
    random_state=1337)

# `early_stopping_rounds=` and `verbose=` were deprecated as fit() keywords in
# LightGBM 3.3 and removed in 4.0 (the call then raises TypeError). The
# callbacks below express the same settings. Releases older than 3.3 expose
# the logging callback under the name `print_evaluation`, hence the fallback.
_log_evaluation = getattr(lgbm, "log_evaluation", None) or lgbm.print_evaluation

model.fit(
    x_train,
    y_train,
    eval_set=eval_set,
    eval_metric="auc",
    callbacks=[
        # Stop once the validation metric has not improved for 100 rounds.
        lgbm.early_stopping(stopping_rounds=100),
        # Print the validation metric every 100 rounds.
        _log_evaluation(period=100),
    ],
)

In [None]:
# Pair every training column with the fitted model's importance score and
# rank the table from highest to lowest score.
feature_importance = (
    pd.DataFrame(
        {
            "feature": x_train.columns,
            "score": model.feature_importances_,
        }
    )
    .sort_values(by="score", ascending=False)
    .reset_index(drop=True)
)
print(feature_importance.shape)
# Ten highest-scoring features.
feature_importance.head(10)

In [None]:
# Last ten rows of the ranked table, i.e. the lowest-scoring features.
feature_importance.iloc[-10:]

## Training with all data

In [None]:
# Hyper-parameters for the final model. They match the validation run except
# that n_estimators is fixed at 250 and no early stopping is used.
final_params = {
    "boosting_type": "rf",
    "num_leaves": 50,
    "n_estimators": 250,
    "max_depth": 20,
    "subsample": 0.1,
    "reg_alpha": 0.54,
    "reg_lambda": 0.4,
    "min_split_gain": 0.7,
    "min_child_weight": 40,
    "objective": "binary",
    "is_unbalance": True,
    "bagging_freq": 1,
    "random_state": 1337,
}

# Refit on the combined train+validation data.
# NOTE(review): no eval_set is passed here, so eval_metric presumably has
# nothing to score -- confirm whether it can be dropped.
model = lgbm.LGBMClassifier(**final_params)
model.fit(x_train_val, y_train_val, eval_metric="auc")

# Keep column 1 of the probability matrix (the second class in model.classes_).
test_proba = model.predict_proba(x_test)
test_pred = test_proba[:, 1]

In [None]:
# Make sure the output folder exists first: a fresh checkout may not have it
# yet, and whether make_submission creates it is not visible from this notebook.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

# Full path of the submission file, then delegate writing to the project helper.
sub_name = os.path.join(SUBMISSION_PATH, SUB_FILE)
make_submission(test_pred, TEST_PATH, sub_name)