In [1]:
import pandas as pd
import lightgbm as lgb
import os
import gc
import pickle


from sklearn import preprocessing, model_selection

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.preprocessing import LabelEncoder,StandardScaler

# Number GPUs by PCI bus ID so that the device index used below refers to
# that ordering.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# Expose only the GPU with index 4 to CUDA-based libraries in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
#"""
def get_type_feature_all_(sample, train_df, key, on, type_c, mark):
    """Left-join a group-wise aggregate of ``train_df[on]`` onto ``sample``.

    The aggregate is computed over ``train_df`` grouped by the columns in
    ``key`` and stored as float32 in a new column named
    ``"<mark>_<key joined by '_'>_<type_c>_<on>"``.

    Parameters
    ----------
    sample : pandas.DataFrame
        Frame that receives the new feature column (must contain ``key``).
    train_df : pandas.DataFrame
        Frame the statistic is computed from (must contain ``key`` and ``on``).
    key : list of str
        Grouping columns; also the merge keys.
    on : str
        Column to aggregate.
    type_c : str
        One of "count", "mean", "nunique", "max", "min", "sum", "std",
        "median".
    mark : str
        Prefix used in the generated column name.

    Returns
    -------
    (pandas.DataFrame, str)
        The merged frame (a new object produced by ``merge``) and the name of
        the added column.

    Raises
    ------
    ValueError
        If ``type_c`` is not one of the supported aggregations.
    """
    supported = ("count", "mean", "nunique", "max", "min", "sum", "std", "median")
    if type_c not in supported:
        # Previously an unknown aggregation left `tmp` unbound and failed
        # later with an unrelated-looking UnboundLocalError.
        raise ValueError(
            "unsupported type_c %r, expected one of %s" % (type_c, ", ".join(supported))
        )

    feature_name = mark + "_" + "_".join(key) + '_%s_' % type_c + on

    # Every supported aggregation is a zero-argument method on the grouped
    # series, so it can be dispatched by name.
    grouped = train_df[key + [on]].groupby(key)[on]
    tmp = pd.DataFrame(getattr(grouped, type_c)()).reset_index()
    tmp.columns = key + [feature_name]
    tmp[feature_name] = tmp[feature_name].astype('float32')

    sample = sample.merge(tmp, on=key, how='left')
    del tmp
    gc.collect()
    return sample, feature_name
#"""
def get_type_feature_all(sample, train_df, key, on, type_c, mark, pickle_path="../pickle/"):
    """Add a group-wise aggregate feature to ``sample``, cached on disk.

    The feature is the ``type_c`` aggregate of ``train_df[on]`` grouped by
    ``key``, stored as float32 in a column named
    ``"<mark>_<key joined by '_'>_<type_c>_<on>"``. The resulting single-column
    frame is pickled under ``pickle_path``; later calls with the same
    ``mark``/``type_c``/``key``/``on`` and the same ``len(sample)`` load the
    column from that file instead of recomputing it.

    Parameters
    ----------
    sample : pandas.DataFrame
        Frame that receives the feature column (must contain ``key``).
    train_df : pandas.DataFrame
        Frame the statistic is computed from (must contain ``key`` and ``on``).
    key : list of str
        Grouping columns; also the merge keys.
    on : str
        Column to aggregate.
    type_c : str
        One of "count", "mean", "nunique", "max", "min", "sum", "std",
        "median".
    mark : str
        Prefix used in the column name and the cache file name.
    pickle_path : str, optional
        Directory holding the cache files. It is created if missing.

    Returns
    -------
    (pandas.DataFrame, str)
        The frame with the feature and the feature's column name. On a cache
        hit the column is written into ``sample`` itself and that same object
        is returned; on a cache miss a new frame produced by ``merge`` is
        returned.

    Raises
    ------
    ValueError
        If ``type_c`` is not one of the supported aggregations.

    Notes
    -----
    The cache key contains only the names and ``len(sample)``, not the data,
    so stale files must be deleted by hand when the inputs change.
    """
    supported = ("count", "mean", "nunique", "max", "min", "sum", "std", "median")
    if type_c not in supported:
        # Previously an unknown aggregation left `tmp` unbound and failed
        # with an UnboundLocalError inside the except handler.
        raise ValueError(
            "unsupported type_c %r, expected one of %s" % (type_c, ", ".join(supported))
        )

    feature_name = mark + "_" + "_".join(key) + '_%s_' % type_c + on
    filename = "_".join([mark + "_%s_features" % type_c, "_".join(key), on, str(len(sample))]) + ".pkl"
    cache_file = os.path.join(pickle_path, filename)

    # Try the cache first. A missing file is the normal "not cached yet" case;
    # any other load failure (truncated or incompatible pickle) is reported
    # and falls back to recomputing. KeyboardInterrupt and SystemExit are no
    # longer swallowed.
    col = None
    try:
        with open(cache_file, "rb") as fp:
            # SECURITY: pickle.load can execute arbitrary code. Only point
            # pickle_path at a directory whose contents you trust.
            col = pickle.load(fp)
    except FileNotFoundError:
        col = None
    except Exception as exc:
        print("could not load cache file {}: {!r}; recomputing".format(cache_file, exc))
        col = None

    if col is not None and len(col) == len(sample):
        print("load {} {} feature from pickle file: key: {}, on: {}...".format(mark,type_c,"_".join(key), on))
        for c in col.columns:
            # The cached rows are in the row order of `sample` at the time of
            # caching, so assign by position rather than by index label.
            sample[c] = col[c].values
    else:
        print('get {} {} feature, key: {}, on: {}'.format(mark,type_c,"_".join(key), on))
        grouped = train_df[key + [on]].groupby(key)[on]
        tmp = pd.DataFrame(getattr(grouped, type_c)()).reset_index()
        tmp.columns = key + [feature_name]
        tmp[feature_name] = tmp[feature_name].astype('float32')
        sample = sample.merge(tmp, on=key, how='left')
        del tmp

        col = sample[[feature_name]].copy()
        # Create the cache directory so the computed work is not lost to a
        # FileNotFoundError when writing.
        cache_dir = os.path.dirname(cache_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_file, "wb") as fp:
            pickle.dump(col, fp)

    del col
    gc.collect()
    return sample, feature_name


In [3]:
# Read the train and test tables from the data directory.
path = "../data/"
train_data, test_data = (
    pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [4]:
# Display the column names of the training table.
train_data.columns

Index(['article_id', 'date', 'baike_id_1h', 'price', 'price_diff', 'author',
       'level1', 'level2', 'level3', 'level4', 'brand', 'mall', 'url',
       'comments_1h', 'zhi_1h', 'buzhi_1h', 'favorite_1h', 'orders_1h',
       'baike_id_2h', 'comments_2h', 'zhi_2h', 'buzhi_2h', 'favorite_2h',
       'orders_2h', 'orders_3h_15h'],
      dtype='object')

In [5]:
def create_features(data):
    """Build the engineered feature table from the combined train/test frame.

    Steps, in order:
      1. Add ``order_1h_2h`` = ``orders_1h`` + ``orders_2h`` (written into the
         frame that was passed in).
      2. Replace ``article_id`` by its position between the previous date's
         maximum ``article_id`` (0 when there is none) and the current date's
         maximum, then derive ``hour`` = int(position * 24) and
         ``dayofweek`` = ``date`` % 7.
      3. For every categorical key combination, add the gap to the next and
         previous row's ``article_id`` within the group, plus the group means
         of those gaps.
      4. For every key combination, add group statistics of ``article_id``,
         ``price`` and the numeric activity columns via
         ``get_type_feature_all``.

    Returns the resulting DataFrame.
    """
    data["order_1h_2h"] = data["orders_1h"] + data["orders_2h"]

    # Largest article_id seen on each date ...
    per_day_max = (
        data.groupby(["date"]).article_id.max()
        .reset_index()
        .rename(columns={"article_id": "date_max"})
    )
    # ... and the same values re-keyed to the following date, so each row can
    # look up the previous date's maximum as its lower bound.
    prev_day_max = per_day_max.rename(columns={"date_max": "date_min"})
    prev_day_max["date"] = prev_day_max["date"] + 1

    data = data.merge(per_day_max, on="date", how="left")
    data = data.merge(prev_day_max, on="date", how="left")
    data["date_min"] = data["date_min"].fillna(0)

    span = data["date_max"] - data["date_min"]
    data["article_id"] = (data["article_id"] - data["date_min"]) / span
    data['hour'] = data["article_id"].apply(lambda x: int(x * 24))
    data["dayofweek"] = data["date"] % 7

    data = data.drop(columns=["date_min", "date_max"])

    levels = ['level1', 'level2', 'level3', 'level4']
    single_keys = ['baike_id_1h', 'author'] + levels + ['brand', 'mall', 'url', 'dayofweek']
    cat_features = (
        [[name] for name in single_keys]
        + [['author', other] for other in levels + ['brand', 'mall']]
        + [['mall', 'brand']]
        + [['mall', level] for level in levels]
        + [['brand', level] for level in levels]
        + [['baike_id_1h', 'mall']]
        + [["mall", "brand", level] for level in levels]
    )
    num_features = [
        'comments_1h', 'comments_2h',
        'zhi_1h', 'zhi_2h',
        'buzhi_1h', 'buzhi_2h',
        'favorite_1h', 'favorite_2h',
        'orders_1h', 'orders_2h', 'order_1h_2h', 'article_id',
    ]

    # Gap to the neighbouring rows of the same group, and the group mean of
    # each gap.
    for cat in cat_features:
        joined = "_".join(cat)
        next_gap = "%s_%s_-1" % (joined, "article_id")
        prev_gap = "%s_%s_1" % (joined, "article_id")

        data[next_gap] = data.groupby(cat)["article_id"].shift(-1) - data["article_id"]
        data[prev_gap] = data.groupby(cat)["article_id"].shift(1) - data["article_id"]

        for gap_col in (next_gap, prev_gap):
            data, _ = get_type_feature_all(data, data, cat, gap_col, "mean", "fe")

    # Group statistics per key combination.
    for cat in cat_features:
        data, _ = get_type_feature_all(data, data, cat + ["hour"], "article_id", "mean", "fe")
        data, _ = get_type_feature_all(data, data, cat, "article_id", "count", "fe")

        for stat in ("mean", "max", "min"):
            data, _ = get_type_feature_all(data, data, cat, "price", stat, "fe")

        for num in num_features:
            for stat in ("mean", "sum"):
                data, _ = get_type_feature_all(data, data, cat, num, stat, "fe")

    return data

In [6]:
# Stack train and test under one fresh 0..n-1 index, then build the features
# on the combined frame.
data = create_features(
    pd.concat([train_data, test_data], ignore_index=True)
)

load fe mean feature from pickle file: key: baike_id_1h, on: baike_id_1h_article_id_-1...
load fe mean feature from pickle file: key: baike_id_1h, on: baike_id_1h_article_id_1...
load fe mean feature from pickle file: key: author, on: author_article_id_-1...
load fe mean feature from pickle file: key: author, on: author_article_id_1...
load fe mean feature from pickle file: key: level1, on: level1_article_id_-1...
load fe mean feature from pickle file: key: level1, on: level1_article_id_1...
load fe mean feature from pickle file: key: level2, on: level2_article_id_-1...
load fe mean feature from pickle file: key: level2, on: level2_article_id_1...
load fe mean feature from pickle file: key: level3, on: level3_article_id_-1...
load fe mean feature from pickle file: key: level3, on: level3_article_id_1...
load fe mean feature from pickle file: key: level4, on: level4_article_id_-1...
load fe mean feature from pickle file: key: level4, on: level4_article_id_1...
load fe mean feature from 

load fe sum feature from pickle file: key: author, on: zhi_2h...
load fe mean feature from pickle file: key: author, on: buzhi_1h...
load fe sum feature from pickle file: key: author, on: buzhi_1h...
load fe mean feature from pickle file: key: author, on: buzhi_2h...
load fe sum feature from pickle file: key: author, on: buzhi_2h...
load fe mean feature from pickle file: key: author, on: favorite_1h...
load fe sum feature from pickle file: key: author, on: favorite_1h...
load fe mean feature from pickle file: key: author, on: favorite_2h...
load fe sum feature from pickle file: key: author, on: favorite_2h...
load fe mean feature from pickle file: key: author, on: orders_1h...
load fe sum feature from pickle file: key: author, on: orders_1h...
load fe mean feature from pickle file: key: author, on: orders_2h...
load fe sum feature from pickle file: key: author, on: orders_2h...
load fe mean feature from pickle file: key: author, on: order_1h_2h...
load fe sum feature from pickle file: 

load fe sum feature from pickle file: key: level4, on: buzhi_2h...
load fe mean feature from pickle file: key: level4, on: favorite_1h...
load fe sum feature from pickle file: key: level4, on: favorite_1h...
load fe mean feature from pickle file: key: level4, on: favorite_2h...
load fe sum feature from pickle file: key: level4, on: favorite_2h...
load fe mean feature from pickle file: key: level4, on: orders_1h...
load fe sum feature from pickle file: key: level4, on: orders_1h...
load fe mean feature from pickle file: key: level4, on: orders_2h...
load fe sum feature from pickle file: key: level4, on: orders_2h...
load fe mean feature from pickle file: key: level4, on: order_1h_2h...
load fe sum feature from pickle file: key: level4, on: order_1h_2h...
load fe mean feature from pickle file: key: level4, on: article_id...
load fe sum feature from pickle file: key: level4, on: article_id...
load fe mean feature from pickle file: key: brand_hour, on: article_id...
load fe count feature f

load fe sum feature from pickle file: key: dayofweek, on: orders_1h...
load fe mean feature from pickle file: key: dayofweek, on: orders_2h...
load fe sum feature from pickle file: key: dayofweek, on: orders_2h...
load fe mean feature from pickle file: key: dayofweek, on: order_1h_2h...
load fe sum feature from pickle file: key: dayofweek, on: order_1h_2h...
load fe mean feature from pickle file: key: dayofweek, on: article_id...
load fe sum feature from pickle file: key: dayofweek, on: article_id...
load fe mean feature from pickle file: key: author_level1_hour, on: article_id...
load fe count feature from pickle file: key: author_level1, on: article_id...
load fe mean feature from pickle file: key: author_level1, on: price...
load fe max feature from pickle file: key: author_level1, on: price...
load fe min feature from pickle file: key: author_level1, on: price...
load fe mean feature from pickle file: key: author_level1, on: comments_1h...
load fe sum feature from pickle file: key:

load fe sum feature from pickle file: key: author_level4, on: buzhi_2h...
load fe mean feature from pickle file: key: author_level4, on: favorite_1h...
load fe sum feature from pickle file: key: author_level4, on: favorite_1h...
load fe mean feature from pickle file: key: author_level4, on: favorite_2h...
load fe sum feature from pickle file: key: author_level4, on: favorite_2h...
load fe mean feature from pickle file: key: author_level4, on: orders_1h...
load fe sum feature from pickle file: key: author_level4, on: orders_1h...
load fe mean feature from pickle file: key: author_level4, on: orders_2h...
load fe sum feature from pickle file: key: author_level4, on: orders_2h...
load fe mean feature from pickle file: key: author_level4, on: order_1h_2h...
load fe sum feature from pickle file: key: author_level4, on: order_1h_2h...
load fe mean feature from pickle file: key: author_level4, on: article_id...
load fe sum feature from pickle file: key: author_level4, on: article_id...
load f

load fe sum feature from pickle file: key: mall_level1, on: zhi_2h...
load fe mean feature from pickle file: key: mall_level1, on: buzhi_1h...
load fe sum feature from pickle file: key: mall_level1, on: buzhi_1h...
load fe mean feature from pickle file: key: mall_level1, on: buzhi_2h...
load fe sum feature from pickle file: key: mall_level1, on: buzhi_2h...
load fe mean feature from pickle file: key: mall_level1, on: favorite_1h...
load fe sum feature from pickle file: key: mall_level1, on: favorite_1h...
load fe mean feature from pickle file: key: mall_level1, on: favorite_2h...
load fe sum feature from pickle file: key: mall_level1, on: favorite_2h...
load fe mean feature from pickle file: key: mall_level1, on: orders_1h...
load fe sum feature from pickle file: key: mall_level1, on: orders_1h...
load fe mean feature from pickle file: key: mall_level1, on: orders_2h...
load fe sum feature from pickle file: key: mall_level1, on: orders_2h...
load fe mean feature from pickle file: key: 

load fe sum feature from pickle file: key: brand_level1, on: comments_2h...
load fe mean feature from pickle file: key: brand_level1, on: zhi_1h...
load fe sum feature from pickle file: key: brand_level1, on: zhi_1h...
load fe mean feature from pickle file: key: brand_level1, on: zhi_2h...
load fe sum feature from pickle file: key: brand_level1, on: zhi_2h...
load fe mean feature from pickle file: key: brand_level1, on: buzhi_1h...
load fe sum feature from pickle file: key: brand_level1, on: buzhi_1h...
load fe mean feature from pickle file: key: brand_level1, on: buzhi_2h...
load fe sum feature from pickle file: key: brand_level1, on: buzhi_2h...
load fe mean feature from pickle file: key: brand_level1, on: favorite_1h...
load fe sum feature from pickle file: key: brand_level1, on: favorite_1h...
load fe mean feature from pickle file: key: brand_level1, on: favorite_2h...
load fe sum feature from pickle file: key: brand_level1, on: favorite_2h...
load fe mean feature from pickle file:

load fe mean feature from pickle file: key: baike_id_1h_mall, on: price...
load fe max feature from pickle file: key: baike_id_1h_mall, on: price...
load fe min feature from pickle file: key: baike_id_1h_mall, on: price...
load fe mean feature from pickle file: key: baike_id_1h_mall, on: comments_1h...
load fe sum feature from pickle file: key: baike_id_1h_mall, on: comments_1h...
load fe mean feature from pickle file: key: baike_id_1h_mall, on: comments_2h...
load fe sum feature from pickle file: key: baike_id_1h_mall, on: comments_2h...
load fe mean feature from pickle file: key: baike_id_1h_mall, on: zhi_1h...
load fe sum feature from pickle file: key: baike_id_1h_mall, on: zhi_1h...
load fe mean feature from pickle file: key: baike_id_1h_mall, on: zhi_2h...
load fe sum feature from pickle file: key: baike_id_1h_mall, on: zhi_2h...
load fe mean feature from pickle file: key: baike_id_1h_mall, on: buzhi_1h...
load fe sum feature from pickle file: key: baike_id_1h_mall, on: buzhi_1h..

load fe mean feature from pickle file: key: mall_brand_level3, on: favorite_2h...
load fe sum feature from pickle file: key: mall_brand_level3, on: favorite_2h...
load fe mean feature from pickle file: key: mall_brand_level3, on: orders_1h...
load fe sum feature from pickle file: key: mall_brand_level3, on: orders_1h...
load fe mean feature from pickle file: key: mall_brand_level3, on: orders_2h...
load fe sum feature from pickle file: key: mall_brand_level3, on: orders_2h...
load fe mean feature from pickle file: key: mall_brand_level3, on: order_1h_2h...
load fe sum feature from pickle file: key: mall_brand_level3, on: order_1h_2h...
load fe mean feature from pickle file: key: mall_brand_level3, on: article_id...
load fe sum feature from pickle file: key: mall_brand_level3, on: article_id...
load fe mean feature from pickle file: key: mall_brand_level4_hour, on: article_id...
load fe count feature from pickle file: key: mall_brand_level4, on: article_id...
load fe mean feature from p

In [11]:
# Show (rows, columns) of the engineered feature table.
print(data.shape)

(1957246, 1018)


In [8]:
# Model inputs: every column except the date and the target.
features = data.columns.drop(['date', 'orders_3h_15h']).tolist()

In [9]:
# Date-based split: date < 110 is train, 110 <= date < 117 is validation,
# date >= 117 is test.
train_ind = data.index[(data.date < 110).values]
valid_ind = data.index[((data.date >= 110) & (data.date < 117)).values]
test_ind = data.index[(data.date >= 117).values]

# Missing values become 0, then every feature is standardised.
# NOTE(review): the scaler is fit on all rows (train, validation and test
# together) - confirm this is intended.
nor = StandardScaler()
data_x = nor.fit_transform(data[features].fillna(0).values)

train_x, valid_x, test_x = (
    data_x[ind] for ind in (train_ind, valid_ind, test_ind)
)

# Regression target for the rows that have it.
train_y, valid_y = (
    data["orders_3h_15h"].values[ind].astype('float64')
    for ind in (train_ind, valid_ind)
)

In [10]:
def get_model():
    """Build the (uncompiled) fully connected regression network.

    The input has one unit per entry of the module-level ``features`` list.
    Four ReLU Dense layers of width 2048, 1024, 512 and 256 follow, each
    followed by BatchNormalization, and a single linear output unit.
    """
    input_num = Input(shape=(len(features),))

    hidden = input_num
    for width in (2048, 1024, 512, 256):
        hidden = Dense(width, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)

    preds = Dense(1, activation='linear')(hidden)
    return Model(inputs=input_num, outputs=preds)

In [1]:
# Train five independently initialised networks and store each one's
# predictions for the validation window (110 <= date < 117) and the test set,
# for later stacking.
#
# This cell needs train_data, test_data, train_x/valid_x/test_x,
# train_y/valid_y and get_model from the cells above; running it on a fresh
# kernel fails with NameError.
sub_train = train_data[(train_data.date >= 110) & (train_data.date < 117)][["article_id"]].copy()
sub_test = test_data[["article_id"]].copy()

# One path shared by the checkpoint writer and the weight loader. Previously
# the checkpoint was written to 'best_model_reg.hdf5' but weights were read
# from 'automl/best_model_reg.hdf5', so the best epoch of the current run was
# never the one restored.
checkpoint_path = 'best_model_reg.hdf5'

for i in range(5):
    model = get_model()
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = Adam(learning_rate=0.001, clipnorm=2., amsgrad=False)
    model.compile(loss="mean_squared_error", optimizer=opt, metrics=[])

    early_stop = EarlyStopping(patience=2)
    # A fresh ModelCheckpoint per run, so "best" is tracked per model and the
    # file is overwritten by each new run.
    check_point = ModelCheckpoint(checkpoint_path, monitor="val_loss", mode="min",
                                  save_best_only=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                  patience=1, min_lr=0.0001)
    model.fit(train_x, train_y, validation_data=(valid_x, valid_y),
              batch_size=1024, epochs=10, verbose=1,  # shuffle=False,
              callbacks=[early_stop, check_point, reduce_lr]
              )
    # Restore the best-validation-loss weights before predicting.
    model.load_weights(checkpoint_path)

    sub_train["pred_nn_%s" % i] = model.predict(valid_x, batch_size=1024)[:, 0]
    sub_test["pred_nn_%s" % i] = model.predict(test_x, batch_size=1024)[:, 0]

sub_train.to_csv("../user_data/train_stacking_nn_reg_2.csv", index=None)
sub_test.to_csv("../user_data/test_stacking_nn_reg_2.csv", index=None)

NameError: name 'train_data' is not defined