In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings('ignore')
import gc
from scipy import sparse
import datatable as dt
from catboost import CatBoostClassifier, Pool
import optuna
import sklearn.datasets
import sklearn.metrics

In [2]:
def reduce_mem_usage(df):
    """
    Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Handling per column dtype:

    * signed / unsigned integers: narrowed to the smallest integer type of
      the same signedness whose range contains the column's min and max
      (bounds inclusive);
    * floats: float16 if the column's range fits, else float32, else
      float64. float16 has a 10-bit mantissa, so values are rounded; this
      lossy step is kept for backward compatibility;
    * object: converted to ``category``;
    * anything else (bool, datetime, category, pandas extension dtypes):
      left untouched, so calling the function again on its own output is safe.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}'
                     'MB').format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy dtypes carry a reliable ``kind`` code; extension
        # dtypes (category, nullable ints, strings, ...) are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            # bool, datetime, timedelta, complex, extension dtypes: min/max
            # based downcasting does not apply, leave the column as it is.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            # An all-NaN (or empty) column yields NaN bounds; every comparison
            # below is then False and the column ends up as float64.
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        else:
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when the bounds are NaN (empty
            # column); the column then keeps its dtype.

    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}'
                              'MB').format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem)
                                             / start_mem))

    return df

In [3]:
%%time
print('读取数据...')
train = pd.read_feather('../input/janestreettrainfeather/train.feather')
print('读取结束')
print('Data preprocessing...')

train = train.query('date > 85').reset_index(drop = True)   # 只保留第86天及以后的data
train = train[train['weight'] > 0]
train['action'] =  (train['resp'] > 0 ).astype('int')
features = []
for item in train.columns:
#    if 'feature' in item or 'weight' in item:
    if 'feature' in item:
        features.append(item)
print(features)
features_mean = train.loc[:, features].mean()
train.fillna(train.mean(),inplace=True) 
reduce_mem_usage(train)
print('Done!')
VALID_DAYS = 50  # using for valid
#resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
df_train = train[train['date'] <= 499-VALID_DAYS]
df_test = train[train['date'] > 499-VALID_DAYS]

读取数据...
读取结束
Data preprocessing...
['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature

In [4]:
def objective(trial):
    """
    Optuna objective: fit LightGBM with trial-suggested hyperparameters.

    A binary gbdt model is trained on ``df_train[features]`` against the
    'action' label using ``lgb.train`` with its default number of boosting
    rounds, then scored on ``df_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the regularisation, tree-shape and
        sub-sampling hyperparameters.

    Returns
    -------
    float
        Accuracy of the rounded predictions on ``df_test``.
    """
    train_X, valid_X = df_train[features], df_test[features]
    train_y, valid_y = df_train['action'], df_test['action']
    train_set = lgb.Dataset(train_X, label=train_y)

    # Fixed settings first, then the searched ones. The suggest_* calls are
    # issued in a fixed order because the sampler draws values sequentially.
    param = dict(
        objective="binary",
        metric="binary_logloss",
        verbosity=-1,
        boosting_type="gbdt",
    )
    param["lambda_l1"] = trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True)
    param["lambda_l2"] = trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True)
    param["num_leaves"] = trial.suggest_int("num_leaves", 2, 256)
    param["feature_fraction"] = trial.suggest_float("feature_fraction", 0.4, 1.0)
    param["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.4, 1.0)
    param["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
    param["min_child_samples"] = trial.suggest_int("min_child_samples", 5, 100)

    booster = lgb.train(param, train_set)

    # Round the predicted scores to hard 0/1 labels before scoring.
    hard_labels = np.rint(booster.predict(valid_X))
    return sklearn.metrics.accuracy_score(valid_y, hard_labels)

In [5]:
if __name__ == "__main__":
    # Seed the sampler so that re-running the notebook explores the same
    # sequence of hyperparameters. TPESampler is Optuna's default sampler,
    # so only the seeding differs from an unconfigured study.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=10)

    print(f"Number of finished trials: {len(study.trials)}")

    # Report the trial with the highest objective value.
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-02-04 01:45:55,379][0m A new study created in memory with name: no-name-740db11a-3237-43e1-b304-7f320fbdddea[0m
[32m[I 2021-02-04 01:46:49,199][0m Trial 0 finished with value: 0.5205459721422036 and parameters: {'lambda_l1': 2.0486262267193102e-07, 'lambda_l2': 1.0428604606646677e-05, 'num_leaves': 228, 'feature_fraction': 0.8439041586800065, 'bagging_fraction': 0.6906701287512585, 'bagging_freq': 6, 'min_child_samples': 83}. Best is trial 0 with value: 0.5205459721422036.[0m
[32m[I 2021-02-04 01:47:27,821][0m Trial 1 finished with value: 0.523583386992917 and parameters: {'lambda_l1': 6.054859218175769, 'lambda_l2': 0.004316957685744817, 'num_leaves': 127, 'feature_fraction': 0.40972157720093133, 'bagging_fraction': 0.7644731999899101, 'bagging_freq': 7, 'min_child_samples': 55}. Best is trial 1 with value: 0.523583386992917.[0m
[32m[I 2021-02-04 01:48:09,155][0m Trial 2 finished with value: 0.5212788490866574 and parameters: {'lambda_l1': 0.07527833495421682, '

Number of finished trials: 10
Best trial:
  Value: 0.523583386992917
  Params: 
    lambda_l1: 6.054859218175769
    lambda_l2: 0.004316957685744817
    num_leaves: 127
    feature_fraction: 0.40972157720093133
    bagging_fraction: 0.7644731999899101
    bagging_freq: 7
    min_child_samples: 55


In [6]:
%%time
# 划分数据集
print('划分数据集...')
x_train = df_train[features]
x_val = df_test[features]
y_train = df_train['action']
y_val = df_test['action']
print('开始训练gbdt..')
gbm = lgb.LGBMRegressor(objective='binary',
                        lambda_l1=0.0037465811711039068,
                        lambda_l2=2.527392199457406,
                        num_leaves=61,
                        feature_fraction=0.5970610375228046,
                        bagging_fraction=0.5916401743618064,
                        bagging_freq=3,
                        min_child_samples=29,
                        learning_rate=0.01,
                        n_estimators=1000,
                        random_state=42,
                         #device='gpu'
                        )

model=gbm.fit(x_train, y_train,
            eval_set = [(x_train, y_train), (x_val, y_val)],
            eval_names = ['train', 'val'],
            eval_metric = 'auc',
            early_stopping_rounds = 100,
            verbose=50,
            )


划分数据集...
开始训练gbdt..
Training until validation scores don't improve for 100 rounds
[50]	train's auc: 0.550543	train's binary_logloss: 0.690948	val's auc: 0.526328	val's binary_logloss: 0.692347
[100]	train's auc: 0.556969	train's binary_logloss: 0.689332	val's auc: 0.529251	val's binary_logloss: 0.691841
[150]	train's auc: 0.563228	train's binary_logloss: 0.687963	val's auc: 0.531146	val's binary_logloss: 0.691516
[200]	train's auc: 0.56987	train's binary_logloss: 0.686698	val's auc: 0.533697	val's binary_logloss: 0.69118
[250]	train's auc: 0.575722	train's binary_logloss: 0.685668	val's auc: 0.534513	val's binary_logloss: 0.691008
[300]	train's auc: 0.580532	train's binary_logloss: 0.684594	val's auc: 0.534969	val's binary_logloss: 0.690876
[350]	train's auc: 0.585198	train's binary_logloss: 0.683667	val's auc: 0.535321	val's binary_logloss: 0.690788
[400]	train's auc: 0.590052	train's binary_logloss: 0.682698	val's auc: 0.536002	val's binary_logloss: 0.690694
[450]	train's auc: 0.5942

In [7]:
%%time
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set
for (test_df, sample_prediction_df) in iter_test:
    wt = test_df.iloc[0].weight
    if(wt == 0):
        sample_prediction_df.action = 0 
    else:
        X_test = test_df.loc[:, features].values
        action = model.predict(X_test)
        if (action > 0.5):
            sample_prediction_df.action = 1
        else:
            sample_prediction_df.action = 0  
    env.predict(sample_prediction_df)

CPU times: user 12min 35s, sys: 961 ms, total: 12min 36s
Wall time: 3min 39s
