@Copyright IQIYI 2021
http://challenge.ai.iqiyi.com/

In [1]:
import json
import math
import os

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Directory that holds the challenge data files, the saved model and the
# submission file. "~" is expanded once here so that every consumer of the
# path receives a real directory instead of relying on each library to expand
# the home-directory shorthand on its own. The trailing slash is kept because
# file names are appended by plain string concatenation.
data_dir = os.path.expanduser("~/wsdm_model_data/")

In [3]:
class DataGenerator:
    """Endless mini-batch generator over the rows of a feature DataFrame.

    Every row must expose the attributes ``launch_seq``, ``playtime_seq``,
    ``duration_prefer``, ``interact_prefer`` (all Python lists) and ``label``.
    Scalar features are read positionally, see ``__iter__``.
    """

    def __init__(self, df, batch_size):
        """Remember the frame and the batch size.

        Args:
            df: pandas DataFrame with one sample per row. Any index is
                accepted; rows are consumed in their stored order.
            batch_size: number of rows per batch, must be positive.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        self.data = df
        self.num = df.shape[0]
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches in one full pass over the data."""
        return math.ceil(self.num / self.batch_size)

    def __iter__(self):
        """Yield ``((input_1, input_2), output)`` batches, looping forever.

        * ``input_1``: array of shape (batch, seq_len, 2) holding
          ``launch_seq`` and ``playtime_seq`` as the two channels.
        * ``input_2``: array of shape (batch, feature_num) made of
          ``duration_prefer`` + ``interact_prefer`` + 11 scalar columns.
        * ``output``: array of shape (batch,) holding ``label``.

        The last batch of every pass may be shorter than ``batch_size``.
        An empty frame yields nothing instead of looping forever.
        """
        if self.num == 0:
            return
        last_pos = self.num - 1
        while True:
            input_1, input_2, output = [], [], []
            # Track the row *position* rather than the index label, so the end
            # of a pass is detected correctly for slices and shuffled frames
            # whose index does not run from 0 to num - 1.
            for pos, row in enumerate(self.data.itertuples()):
                seq = [row.launch_seq, row.playtime_seq]
                # Positions 7..17 of the itertuples() row (position 0 is the
                # index) are taken as the scalar feature columns.
                # NOTE(review): this depends on the column order of the frame.
                fea = row.duration_prefer + row.interact_prefer + list(row[7:18])
                input_1.append(np.array(seq))
                input_2.append(np.array(fea))
                output.append(row.label)
                if len(input_1) == self.batch_size or pos == last_pos:
                    # (batch, 2, seq_len) -> (batch, seq_len, 2)
                    batch_seq = np.array(input_1).transpose([0, 2, 1])
                    batch_fea = np.array(input_2)
                    batch_out = np.array(output)
                    yield (batch_seq, batch_fea), batch_out
                    input_1, input_2, output = [], [], []

In [4]:
def build_model(seq_feature_num, seq_len, feature_num):
    """Build the two-branch Keras regression model.

    Args:
        seq_feature_num: number of channels per time step of the sequence
            input.
        seq_len: number of time steps of the sequence input.
        feature_num: width of the flat feature input.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[sequence, features]`` with
        shapes (batch, seq_len, seq_feature_num) and (batch, feature_num) and
        producing one ReLU-activated value per sample, shape (batch, 1).
    """
    # Sequence branch: a GRU reduces the sequence to a 64-d vector.
    input_1 = tf.keras.Input(shape=(seq_len, seq_feature_num))
    output_1 = tf.keras.layers.GRU(64)(input_1)

    # Flat-feature branch: three ELU dense layers narrowing to 64 units.
    input_2 = tf.keras.Input(shape=(feature_num, ))
    layer = tf.keras.layers.Dense(256, activation="elu")(input_2)
    layer = tf.keras.layers.Dense(128, activation="elu")(layer)
    output_2 = tf.keras.layers.Dense(64, activation="elu")(layer)

    # Merge with a Keras layer instead of a raw tf.concat: raw TF ops on
    # symbolic Keras tensors are not accepted by every Keras version, while
    # Concatenate is a regular, serializable part of the functional graph.
    output = tf.keras.layers.Concatenate(axis=-1)([output_1, output_2])
    output = tf.keras.layers.Dense(1, activation="relu")(output)

    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

    return model

# train

In [5]:
# train data
data = pd.read_csv(data_dir + "train_data.txt", sep="\t")
# These four columns hold JSON-encoded lists; decode each cell to a Python list.
for json_col in ("launch_seq", "playtime_seq", "duration_prefer", "interact_prefer"):
    data[json_col] = data[json_col].apply(json.loads)

In [6]:
# shuffle data
# A fixed seed makes the shuffle reproducible across kernel restarts, which
# also keeps any positional split taken from the shuffled frame stable.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
data

Unnamed: 0,user_id,end_date,label,launch_seq,playtime_seq,duration_prefer,father_id_score,cast_id_score,tag_score,device_type,device_ram,device_rom,sex,age,education,occupation_status,territory_score,interact_prefer
0,10525290,163,1,"[0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0.0635, 0, 0, 0.0574, 0,...","[0.0, 0.0, 0.0, 0.0, 0.1667, 0.5, 1.0, 0.1667,...",-3.274239,0.000000,-1.650953,0.194954,0.000000,-0.179492,-0.955892,-0.319111,-0.544818,0.746096,0.232332,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,10530792,139,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,0.194954,-0.747565,-0.823301,1.046141,0.828011,-0.544818,-1.340308,-0.327354,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,10205897,114,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,0.194954,1.013106,-0.145958,1.046141,-0.319111,-0.544818,0.746096,1.655552,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,10400412,194,0,"[0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...","[0, 0.9978, 0.0549, 0, 1.0, 0.9985, 0, 0, 0, 0...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, ...",-1.399713,-1.063328,0.087159,0.194954,-0.787841,-0.165363,-0.955892,-0.319111,-0.544818,0.746096,1.087805,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,10045542,168,6,"[0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...","[0, 0, 0.7428, 0.9994, 0.4642, 0, 0, 0.9999, 0...","[0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.2, 0.0, ...",1.011272,-0.818263,0.237653,-2.041925,-0.678998,1.514934,-0.955892,0.828011,2.055850,0.746096,0.607717,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599996,10070639,172,0,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0483, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[0.0, 0.0, 0.0, 0.3333, 0.0, 0.3333, 1.0, 0.33...",-1.302537,-0.358272,-0.010873,0.194954,-0.828598,-0.802015,-0.955892,-0.319111,-0.544818,0.746096,-0.538971,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
599997,10310113,153,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,0.194954,0.000000,0.000000,-0.955892,0.828011,-0.544818,0.746096,0.000000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
599998,10156594,193,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,-2.041925,-0.678039,1.514934,-0.955892,0.828011,2.055850,-1.340308,1.014973,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
599999,10275473,134,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.1667, 0.6667, 1.0,...",0.938242,0.000000,1.551262,-2.041925,-1.530085,0.038613,-0.955892,0.828011,2.055850,0.746096,0.567215,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [7]:
# testing DataGenerator
generator_test = DataGenerator(data[:20], batch_size=8)
# Iterating a DataGenerator loops over its rows again and again, so the loop
# is capped at one pass worth of batches.
for i, ((input_1, input_2), output) in zip(range(len(generator_test)), generator_test):
    print(i, input_1.shape, input_2.shape)
    print(i, output.shape, output)

0 (8, 32, 2) (8, 38)
0 (8,) [1 0 0 0 6 0 0 2]
1 (8, 32, 2) (8, 38)
1 (8,) [6 0 2 0 1 0 0 0]
2 (4, 32, 2) (4, 38)
2 (4,) [1 4 1 6]


In [8]:
# Input dimensions: sequences of 32 steps with 2 channels, plus 38 flat features.
model_dims = dict(seq_feature_num=2, seq_len=32, feature_num=38)
model = build_model(**model_dims)
model.summary()

2021-11-16 19:08:02.943256: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2021-11-16 19:08:02.951582: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2294110000 Hz
2021-11-16 19:08:02.953972: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3f68190 executing computations on platform Host. Devices:
2021-11-16 19:08:02.954000: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): Host, Default Version


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 38)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          9984        input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 32, 2)]      0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          32896       dense[0][0]                      
______________________________________________________________________________________________

In [9]:
# Hold out the first DEV_ROWS rows of the shuffled frame for validation and
# train on the rest (same rows as the label slices [:30000] / [30001:]).
# Each split is re-indexed from 0 so that its row labels equal its row
# positions; a split that kept labels starting at 30001 would break any
# consumer that expects a 0..n-1 index.
DEV_ROWS = 30001
train = DataGenerator(data.iloc[DEV_ROWS:].reset_index(drop=True), 128)
dev = DataGenerator(data.iloc[:DEV_ROWS].reset_index(drop=True), 64)

In [10]:
# Optimize mean absolute error with Adam (learning rate 0.001); MSE is tracked
# as an additional metric.
adam = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=adam, loss="mae", metrics=["mse"])

In [11]:
# Early stopping watches val_loss with a patience of 3 epochs and restores the
# best weights when training stops.
early_stopping_options = {
    "monitor": "val_loss",
    "patience": 3,
    "restore_best_weights": True,
}
early_stopping = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [12]:
# The generators do not stop on their own, so the number of steps per epoch
# and per validation run is passed explicitly.
fit_options = dict(
    steps_per_epoch=len(train),
    validation_data=iter(dev),
    validation_steps=len(dev),
    epochs=20,
    callbacks=[early_stopping],
)
model.fit(iter(train), **fit_options)

Train for 4454 steps, validate for 469 steps
Epoch 1/20


2021-11-16 19:08:25.292312: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_cudnn_gru_with_fallback_2506_2645' and '__inference___backward_standard_gru_2772_3329_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_3471' both implement 'gru_bf2618e5-42d6-4f13-ac41-ed1fb66ff20b' but their signatures do not match.




2021-11-16 19:12:02.672192: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_standard_gru_17047_specialized_for_model_gru_StatefulPartitionedCall_at___inference_distributed_function_17336' and '__inference_cudnn_gru_with_fallback_17136' both implement 'gru_1e176764-74ca-4488-be76-55f2faea33d5' but their signatures do not match.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


<tensorflow.python.keras.callbacks.History at 0x7fb67798ef10>

In [13]:
# Write the trained model to best_weights.h5 inside data_dir.
weights_path = data_dir + "best_weights.h5"
model.save(weights_path)

# predict

In [14]:
data = pd.read_csv(data_dir + "test_data.txt", sep="\t")
# Decode the JSON-encoded list columns, exactly as for the training file.
for json_col in ("launch_seq", "playtime_seq", "duration_prefer", "interact_prefer"):
    data[json_col] = data[json_col].apply(json.loads)

test = DataGenerator(data, 64)
# can also load model from saved weights
# model = build_model(seq_feature_num=2, seq_len=32, feature_num=38)
# model.load_weights(data_dir + "best_weights.h5")
test_batches = iter(test)
prediction = model.predict(test_batches, steps=len(test))
prediction

2021-11-17 09:42:17.016232: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_standard_gru_196469_specialized_for_model_gru_StatefulPartitionedCall_at___inference_distributed_function_196715' and '__inference_standard_gru_196469' both implement 'gru_22c19379-1a8e-4104-82b2-4c6257735474' but their signatures do not match.


array([[ 5.1163882e-04],
       [ 7.7824295e-04],
       [ 6.9034106e-01],
       ...,
       [ 5.7920069e-04],
       [ 5.5324608e-01],
       [-1.1333227e-03]], dtype=float32)

In [15]:
# Flatten the model output to one value per row and keep only the two
# columns needed for the submission.
data["prediction"] = np.reshape(prediction, (-1,))
data = data.loc[:, ["user_id", "prediction"]]
# can clip outputs to [0, 7] or use other tricks
data

Unnamed: 0,user_id,prediction
0,10007813,0.000512
1,10052988,0.000778
2,10279068,0.690341
3,10546696,0.192047
4,10406659,0.000398
...,...,...
14996,10355586,0.000019
14997,10589773,6.347603
14998,10181954,0.000579
14999,10544736,0.553246


In [16]:
# Write the frame without index or header; floats are formatted with two decimals.
submission_path = data_dir + "baseline_submission.csv"
data.to_csv(submission_path, index=False, header=False, float_format="%.2f")