In [1]:
import tensorflow as tf

import pandas as pd
import numpy as np

import config

from sklearn.model_selection import StratifiedKFold
from DataLoader import FeatureDictionary, DataParser

from model import XdeepFM

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def load_data():
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    def preprocess(df):
        cols = [c for c in df.columns if c not in ["id", "target"]]
        df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
        df["ps_car_13_x_ps_reg_03"] = df["ps_car_13"] * df["ps_reg_03"]
        return df

    dfTrain = preprocess(dfTrain)
    dfTest = preprocess(dfTest)

    cols = [c for c in dfTrain.columns if c not in ["id", "target"]]
    cols = [c for c in cols if (not c in config.IGNORE_COLS)]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test,

In [3]:
dfTrain, dfTest, X_train, y_train, X_test, ids_test = load_data()
print('load_data_over')
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
                             random_state=config.RANDOM_SEED).split(X_train, y_train))
print('process_data_over')

XdeepFM_params = {

    "embedding_size": 8,
    "deep_layers": [32, 32],
    "dropout_deep": [0.5, 0.5, 0.5],
    "deep_layers_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer_type": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "random_seed": config.RANDOM_SEED,
    "cin_layer":[124,124]
}

load_data_over
process_data_over


In [4]:
folds

[(array([   0,    1,    4, ..., 9996, 9997, 9999]),
  array([   2,    3,    8, ..., 9991, 9994, 9998])),
 (array([   0,    1,    2, ..., 9996, 9997, 9998]),
  array([   4,    5,    7, ..., 9992, 9993, 9999])),
 (array([   2,    3,    4, ..., 9994, 9998, 9999]),
  array([   0,    1,    6, ..., 9995, 9996, 9997]))]

In [5]:
fd = FeatureDictionary(dfTrain,dfTest,numeric_cols=config.NUMERIC_COLS,
                    ignore_cols=config.IGNORE_COLS,
                        cate_cols = config.CATEGORICAL_COLS)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df = pd.concat([self.trainfile,self.testfile])


In [6]:
print(fd.feat_dim)
print(fd.feat_dict)

247
{'ps_car_01_cat': {10: 0, 11: 1, 7: 2, 6: 3, 9: 4, 5: 5, 4: 6, 8: 7, 3: 8, 0: 9, 2: 10, 1: 11, -1: 12}, 'ps_car_02_cat': {1: 13, 0: 14}, 'ps_car_03_cat': {-1: 15, 0: 16, 1: 17}, 'ps_car_04_cat': {0: 18, 1: 19, 8: 20, 9: 21, 2: 22, 6: 23, 3: 24, 7: 25, 4: 26, 5: 27}, 'ps_car_05_cat': {1: 28, -1: 29, 0: 30}, 'ps_car_06_cat': {4: 31, 11: 32, 14: 33, 13: 34, 6: 35, 15: 36, 3: 37, 0: 38, 1: 39, 10: 40, 12: 41, 9: 42, 17: 43, 7: 44, 8: 45, 5: 46, 2: 47, 16: 48}, 'ps_car_07_cat': {1: 49, -1: 50, 0: 51}, 'ps_car_08_cat': {0: 52, 1: 53}, 'ps_car_09_cat': {0: 54, 2: 55, 3: 56, 1: 57, -1: 58, 4: 59}, 'ps_car_10_cat': {1: 60, 0: 61, 2: 62}, 'ps_car_11': {2: 63, 3: 64, 1: 65, 0: 66}, 'ps_car_11_cat': {12: 67, 19: 68, 60: 69, 104: 70, 82: 71, 99: 72, 30: 73, 68: 74, 20: 75, 36: 76, 101: 77, 103: 78, 41: 79, 59: 80, 43: 81, 64: 82, 29: 83, 95: 84, 24: 85, 5: 86, 28: 87, 87: 88, 66: 89, 10: 90, 26: 91, 54: 92, 32: 93, 38: 94, 83: 95, 89: 96, 49: 97, 93: 98, 1: 99, 22: 100, 85: 101, 78: 102, 31: 10

In [7]:
# 对特征进行转换，
data_parser = DataParser(feat_dict=fd)
cate_Xi_train, cate_Xv_train, numeric_Xv_train,y_train = data_parser.parse(df=dfTrain, has_label=True)

In [8]:
len(cate_Xi_train[0]),len(cate_Xv_train[0]),len(numeric_Xv_train),len(y_train)

(30, 30, 10000, 10000)

In [9]:
    XdeepFM_params["cate_feature_size"] = fd.feat_dim
    XdeepFM_params["field_size"] = len(cate_Xi_train[0])
    XdeepFM_params['numeric_feature_size'] = len(config.NUMERIC_COLS)

In [10]:
_get = lambda x, l: [x[i] for i in l]

for i, (train_idx, valid_idx) in enumerate(folds):
    cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_,y_train_ = _get(cate_Xi_train, train_idx), _get(cate_Xv_train, train_idx),_get(numeric_Xv_train, train_idx), _get(y_train, train_idx)
    cate_Xi_valid_, cate_Xv_valid_, numeric_Xv_valid_,y_valid_ = _get(cate_Xi_train, valid_idx), _get(cate_Xv_train, valid_idx),_get(numeric_Xv_train, valid_idx), _get(y_train, valid_idx)
    print(len(cate_Xi_train_),len(cate_Xi_train_[0])) 
    print(len(cate_Xv_train_),len(cate_Xv_train_[0]))    
    print(len(numeric_Xv_train_),len(numeric_Xv_train_[0]))
    print(len(y_train))

        
    
    Xdeepfm = XdeepFM(**XdeepFM_params)
    break
    Xdeepfm.fit(cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_,y_train_, cate_Xi_valid_, cate_Xv_valid_, numeric_Xv_valid_,y_valid_)

6666 30
6666 30
6666 9
10000



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
xo (?, ?)
[<tf.Tensor 'transpose_3:0' shape=(?, 124, 8) dtype=float32>]
(?, 1)
(?, 156)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.





#params: 11436


In [11]:
Xdeepfm

XdeepFM(batch_norm=1, batch_norm_decay=0.995, batch_size=1024,
        cate_feature_size=247, cin_layer=[124, 124], deep_layers=[32, 32],
        deep_layers_activation=<function relu at 0x0000027AC451B1E0>,
        dropout_deep=None, embedding_size=8, epoch=30,
        eval_metric=<function roc_auc_score at 0x0000027ACB015268>,
        field_size=30, greater_is_better=True, l2_reg=0.01, learning_rate=0.001,
        loss_type='logloss', numeric_feature_size=9, optimizer_type='adam',
        random_seed=2017, verbose=True)