In [1]:
import os
import pandas as pd
import gc
import lightgbm as lgb
import datetime

# Filesystem layout, resolved relative to the directory the notebook is run
# from: the project root is the parent of the current working directory, and
# the data / model folders sit directly beneath it.
PROJECT_ROOT = os.path.join(os.getcwd(), os.pardir)
DATA_DIR, MODEL_PATH = (
    os.path.join(PROJECT_ROOT, subdir) for subdir in ('data', 'model')
)

In [2]:
def train_and_predict(merge, X_pred):
    """Train a LightGBM binary classifier on ``merge`` and score ``X_pred``.

    Parameters
    ----------
    merge : pandas.DataFrame
        Training frame. Must contain the label column ``is_attributed``, the
        columns ``click_time`` and ``attributed_time`` (both dropped before
        training), and the feature columns, including every name listed in
        ``categorical_features`` below.
    X_pred : array-like
        Feature matrix to score. It is passed to ``booster.predict`` as-is, so
        its columns must be in the same order as the feature columns of
        ``merge``; that is not checked here.

    Returns
    -------
    The output of ``booster.predict(X_pred)``.
    """
    non_feature_columns = ['click_time', 'attributed_time', 'is_attributed']
    features = merge.drop(non_feature_columns, axis=1)

    # Feature names MUST follow the column order of X. Building them from a
    # set difference yields an arbitrary order, which mislabels the columns
    # and makes `categorical_feature` (resolved by name) point at the wrong
    # ones.
    predictors = list(features.columns)
    X = features.values
    y = merge['is_attributed'].values
    categorical_features = ['ip', 'app', 'os', 'channel', 'device', 'day']

    lgbtrain = lgb.Dataset(X, label=y,
                           feature_name=predictors,
                           categorical_feature=categorical_features
                           )

    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'subsample_for_bin': 200000,  # Number of samples for constructing bins
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split control regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
        'metric': 'auc',

        'learning_rate': 0.15,
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data points needed in a child (min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bins for feature values
        'subsample': 0.7,  # Subsample ratio of the training instances
        'subsample_freq': 1,  # Frequency of subsampling; <= 0 disables it
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree
        'min_child_weight': 0,  # Minimum sum of instance weight (hessian) needed in a child (leaf)
        # 'scale_pos_weight':99
    }

    evals_results = {}
    num_boost_round = 250
    early_stopping_rounds = 30

    # NOTE(review): the only validation set is the training data itself, so
    # early stopping has no held-out signal to act on -- confirm this is
    # intended.
    # NOTE(review): newer LightGBM releases expose evals_result /
    # early_stopping_rounds / verbose_eval through callbacks instead of
    # keyword arguments -- verify against the installed version.
    booster = lgb.train(
        lgb_params,
        lgbtrain,
        valid_sets=[lgbtrain],
        valid_names=['train'],
        evals_result=evals_results,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=1
    )

    # Score the prediction matrix with the trained booster.
    y_prob = booster.predict(X_pred)
    return y_prob

In [3]:
def preprocess(df):
    """Add ``day``, ``hour``, ``minute`` and ``second`` columns from ``click_time``.

    The columns are written onto ``df`` itself (the frame is modified in
    place) and the same frame object is returned for convenience.
    ``click_time`` must be a datetime column.
    """
    click_parts = df.click_time.dt
    for component in ('day', 'hour', 'minute', 'second'):
        df[component] = getattr(click_parts, component)
    return df

In [4]:
def get_now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [5]:
def predict(booster):
    """Score ``test.csv`` in chunks with ``booster`` and write a submission CSV.

    The test file is read from ``DATA_DIR`` 100000 rows at a time. Each chunk
    is run through ``preprocess``, stripped of ``click_id`` and ``click_time``,
    and scored with ``booster.predict``. The ``click_id`` / ``is_attributed``
    pairs are appended to ``submission_lgb.csv`` in ``DATA_DIR``; a
    pre-existing file is removed before the first chunk is written, and only
    the first chunk carries the header row. Progress is printed with
    timestamps. Returns nothing.
    """
    chunks = pd.read_csv(
        os.path.join(DATA_DIR, 'test.csv'),
        parse_dates=['click_time'],
        chunksize=100000,
    )
    submission_path = os.path.join(DATA_DIR, 'submission_lgb.csv')

    for chunk_no, chunk in enumerate(chunks):
        print('[{}]Start:Preprocessing Data:Size:{}'.format(get_now(), len(chunk)))
        chunk = preprocess(chunk)
        print('[{}]Finish:Preprocessing Data:Size:{}'.format(get_now(), len(chunk)))

        print('[{}]Start:Predicting Data'.format(get_now()))
        features = chunk.drop(['click_id', 'click_time'], axis=1)
        probabilities = booster.predict(features.values)
        print('[{}]Finish:Predicting Data'.format(get_now()))

        print('[{}]Start:output Data'.format(get_now()))
        submission = pd.DataFrame({
            'click_id': chunk['click_id'],
            'is_attributed': probabilities,
        })

        # Start from a clean file on the first chunk; later chunks append
        # without repeating the header.
        is_first_chunk = chunk_no == 0
        if is_first_chunk and os.path.isfile(submission_path):
            os.remove(submission_path)
        submission.to_csv(submission_path, index=False, header=is_first_chunk, mode='a')
        print('[{}]Finish:output Data'.format(get_now()))

    print('[{}]Finish:All Process'.format(get_now()))

In [None]:
# Ensemble configuration: each model is trained on all positives plus a fresh
# random draw of negatives, then scores the full test set.
N_MODELS = 200                  # number of boosters to train
NEGATIVE_SAMPLE_SIZE = 500000   # negatives drawn per booster
SEED = 42                       # base seed; model i samples with SEED + i
NON_FEATURE_COLUMNS = ['click_time', 'attributed_time', 'is_attributed']

print('[{}]Start:read positive'.format(get_now()))
positive = pd.read_csv(os.path.join(DATA_DIR, 'train_positive.csv'), parse_dates=['click_time'])
print('[{}]Start:read negative'.format(get_now()))
negative = pd.read_csv(os.path.join(DATA_DIR, 'train_negative.csv'), parse_dates=['click_time'])
print('[{}]Start:read test'.format(get_now()))
test = pd.read_csv(os.path.join(DATA_DIR,'test.csv'), parse_dates=['click_time'])
test = preprocess(test)
X = test.drop(['click_id','click_time'], axis=1)
click_id = test['click_id']
# The raw test frame is no longer needed once features and ids are extracted.
del test
gc.collect()
X_pred = X.values
# X_pred is a bare array, so remember the column order it was built from.
test_feature_columns = list(X.columns)

print('[{}]Finished:All data preparing'.format(get_now()))

# Never ask for more negatives than exist (DataFrame.sample would raise).
n_negative = min(NEGATIVE_SAMPLE_SIZE, len(negative))

# NOTE(review): y_probs keeps one full test-set prediction array per model;
# if memory becomes a problem, accumulate a running mean instead.
y_probs = []
for i in range(N_MODELS):
    print('[{}]Start: process:{}'.format(get_now(), i))
    # Seed each draw so the whole run is reproducible while every model still
    # sees a different subset of negatives.
    negative_sampled = negative.sample(n_negative, random_state=SEED + i)
    merge = pd.concat([positive, negative_sampled])
    merge = preprocess(merge)

    # Train and test features are matched purely by position, so a differing
    # column order would silently produce meaningless predictions.
    train_feature_columns = [c for c in merge.columns if c not in NON_FEATURE_COLUMNS]
    if train_feature_columns != test_feature_columns:
        raise ValueError(
            'Train/test feature columns differ: {} vs {}'.format(
                train_feature_columns, test_feature_columns))

    print('[{}]Start: process:{}:training'.format(get_now(), i))
    y_probs.append(train_and_predict(merge, X_pred))
    print('[{}]Finished: process:{}:training'.format(get_now(), i))

del negative
gc.collect()

[2018-04-24 01:31:48]Start:read positive
[2018-04-24 01:31:49]Start:read negative
