# In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Load the raw tables. The CSV files are read with explicit column names
# (names=...), and every per-user table is sorted by uid right after loading.
train = pd.read_csv("../data2/data/age_train.csv", names=['uid', 'age_group']).sort_values(by=['uid'])
test = pd.read_csv("../data2/data/age_test.csv", names=['uid']).sort_values(by=['uid'])
info = pd.read_csv("../data2/data/app_info.csv", names=['appid', 'category'])
active = pd.read_csv("../data2/data/user_app_actived.csv", names=['uid', 'appid']).sort_values(by=['uid'])
# Pre-pickled usage log; the commented-out arguments record the columns the
# original CSV was read with.
usage = pd.read_pickle("../data2/user_app_usage.pickle")#,names=['uid','appid','duration','times','use_date'],parse_dates=['use_date'])
user_basic_info = pd.read_csv(
    "../data2/data/user_basic_info.csv",
    names=['uid', 'gender', 'city', 'prodname', 'ramcapacity', 'ramleftration', 'romcapacity',
           'romleftration', 'color', 'fontsize', 'ct', 'carrier', 'os'],
).sort_values(by=['uid'])
behavior_info = pd.read_csv(
    "../data2/data/user_behavior_info.csv",
    names=['uid', 'boottimes', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
).sort_values(by=['uid'])
# (train.shape,test.shape),(info.shape,active.shape,usage.shape,user_basic_info.shape,behavior_info.shape)

# Stack labelled and unlabelled users into one frame; test rows get NaN in
# 'age_group'. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; ignore_index=True gives the same fresh 0..n-1 index that
# append(...).reset_index(drop=True) produced.
all_data = pd.concat([train, test], ignore_index=True)
all_data.head()

# Turn the '#'-separated app string into a list and record how many entries it has.
active['appid'] = active['appid'].map(lambda x: x.split('#'))
active['app_len'] = active['appid'].map(len)

def get_category(x):
    """Look up the category of every app id in ``x`` via the module-level ``hash_dict``.

    :param x: iterable of app ids (keys of ``hash_dict``).
    :return: tuple ``(col, no_col)`` where ``col`` is the list of categories found,
        in input order, and ``no_col`` is the number of ids missing from ``hash_dict``.
    """
    col = []
    no_col = 0
    for i in x:
        try:
            col.append(hash_dict[i])
        except KeyError:
            # Only a missing id counts as "no category"; any other error
            # (e.g. hash_dict not defined yet) now surfaces instead of being
            # silently counted as a miss.
            no_col += 1
    return col, no_col

# appid -> category lookup built from the two columns of `info`.
hash_dict = dict(info.values)

# Per-user category statistics derived from the list of activated apps.
category_pairs = active['appid'].map(get_category)
active['category_nan'] = category_pairs.map(lambda pair: pair[1])
categories = category_pairs.map(lambda pair: pair[0])
active['category_len'] = categories.map(len)
active['category_nunique'] = categories.map(lambda cats: len(set(cats)))
active['category_ratio'] = active['category_nunique'] / active['category_len']

# Attach the side tables to every user (left joins keep all rows of all_data).
all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)
for side_table in (user_basic_info, behavior_info):
    all_data = all_data.merge(side_table, how='left', on=['uid'])

# Only users with at most 150 activated apps contribute the statistics above;
# the remaining users are remembered in `lj`.
all_data = all_data.merge(active[active['app_len'] <= 150], how='left', on=['uid'])
lj = active[active['app_len'] > 150]

# Re-read the raw activation table to get the unsplit '#'-joined strings back.
active = (
    pd.read_csv("../data2/data/user_app_actived.csv", names=['uid', 'appid'])
    .sort_values(by=['uid'])
    .reset_index(drop=True)
)

# Users in `lj` get a fixed placeholder string instead of their real app list.
heavy_uids = lj['uid'].unique()
active.loc[active['uid'].isin(heavy_uids), 'appid'] = '#a00101827'
all_data = all_data.merge(active.rename(columns={'appid': 'multi_appid'}), how='left', on='uid')

def split(x):
    """Encode a '#'-separated string as a list of integer ids.

    Unseen tokens are registered in the module-level ``key2index`` with the next
    free id. Ids start at 1 because 0 is reserved as the padding value for
    sequence data.

    :param x: string of tokens joined by '#'.
    :return: list with the integer id of every token, in order.
    """
    return [key2index.setdefault(token, len(key2index) + 1) for token in x.split('#')]

# Cast to str so that users without an activation row (NaN after the left
# merge) still yield a splittable string.
all_data['multi_appid'] = all_data['multi_appid'].astype('str')
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Multi-valued feature processing
# NOTE(review): `deepctr.datas` looks like a renamed `deepctr.inputs` -- confirm
# against the installed DeepCTR version.
from deepctr.models import *
from deepctr.datas import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names,DenseFeat

# Encode every user's activated-app string as a list of integer ids; `split`
# fills the module-level `key2index` vocabulary as a side effect.
key2index = {}
app_list = [split(joined) for joined in all_data['multi_appid'].values]
app_length = np.array([len(seq) for seq in app_list])
app_key = dict(key2index)

# Right-pad every sequence with 0 up to the longest one.
max_len_app = app_length.max()
app_list = pad_sequences(app_list, maxlen=max_len_app, padding='post')
print(max_len_app)

# Per-user app usage sequences; users with more than 300 entries are dropped
# (they end up with NaN -> 'nan' after the left merge below).
uid_seq = pd.read_pickle("usage_uid_appid_seq.pickle")
uid_seq['appid_len'] = uid_seq['appid'].map(len)
uid_seq = uid_seq[uid_seq['appid_len'] <= 300]
uid_seq['appid'] = uid_seq['appid'].map("#".join)
uid_seq = uid_seq.drop(columns=['appid_len'])

usage_by_uid = uid_seq.rename(columns={'appid': 'usage_seq'})
all_data = all_data.merge(usage_by_uid, how='left', on='uid')
all_data['usage_seq'] = all_data['usage_seq'].astype('str')

# Fresh vocabulary for the usage sequences (`split` writes into `key2index`).
key2index = {}
app1_list = [split(joined) for joined in all_data['usage_seq'].values]
app1_length = np.array([len(seq) for seq in app1_list])
app1_key = dict(key2index)

# Right-pad every sequence with 0 up to the longest one.
max_len_app1 = app1_length.max()
app1_list = pad_sequences(app1_list, maxlen=max_len_app1, padding='post')
print(max_len_app1)

from sklearn.preprocessing import MinMaxScaler,StandardScaler
from tqdm import tqdm

# Object-dtype columns are treated as categorical, every other column as
# numeric; the id and the label are never features.
# NOTE(review): this also picks up the raw 'appid', 'multi_appid' and
# 'usage_seq' columns as categorical features -- confirm that is intended.
non_feature_cols = ['uid', 'age_group']
sparse_features = [col for col in all_data.select_dtypes(object).columns if col not in non_feature_cols]
excluded = set(sparse_features).union(non_feature_cols)
dense_features = [col for col in all_data.columns if col not in excluded]
target = ['age_group']

# Integer-encode every categorical column (values are cast to str first).
for feat in tqdm(sparse_features):
    lbl = LabelEncoder()
    all_data[feat] = lbl.fit_transform(all_data[feat].astype('str'))

# Standardise the numeric columns after mapping +/-inf and NaN to 0.
mm = StandardScaler()
dense_block = all_data[dense_features].replace([np.inf, -np.inf], 0).fillna(0)
all_data[dense_features] = mm.fit_transform(dense_block)

# Rows with a label form the training set.
choose = all_data['age_group'].notnull()

fixlen_feature_columns = [SparseFeat(feat, all_data[feat].nunique()) for feat in sparse_features]
fixlen_feature_columns += [DenseFeat(feat, 1) for feat in dense_features]
# Each padded id sequence is declared twice: once with 'mean' and once with 'max'.
varlen_feature_columns = [
    VarLenSparseFeat('app', len(app_key) + 1, max_len_app, 'mean'),
    VarLenSparseFeat('app1', len(app_key) + 1, max_len_app, 'max'),
    VarLenSparseFeat('usage_app', len(app1_key) + 1, max_len_app1, 'mean'),
    VarLenSparseFeat('usage_app1', len(app1_key) + 1, max_len_app1, 'max'),
]
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
varlen_feature_names = get_varlen_feature_names(linear_feature_columns + dnn_feature_columns)
len(sparse_features), len(dense_features)

import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.initializers import Zeros, glorot_normal
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.regularizers import l2
from deepctr.datas import *
from deepctr.contrib import *
from deepctr.layers import *
from deepctr.models import *
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import concatenate
from keras.callbacks import *
from keras.constraints import *
from keras.layers import *
from keras.models import *
from keras.initializers import *
from keras.optimizers import *

# Shift the labels down by one so the smallest class becomes 0
# (unlabelled rows stay NaN).
label_name = 'age_group'
all_data[label_name] -= 1
all_data[label_name].value_counts()

# from tf.keras.activations.softplus

def xDeepFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, dnn_hidden_units=(512, 256),
            cin_layer_size=(256, 256,), cin_split_half=True, cin_activation='relu', l2_reg_linear=0.00001,
            l2_reg_embedding=0.00001, l2_reg_dnn=0, l2_reg_cin=0, init_std=0.0001, seed=2019, dnn_dropout=0,
            dnn_activation='relu', dnn_use_bn=True, task='binary', num_classes=6):
    """Build a multi-class variant of the xDeepFM architecture.

    A linear part, a CIN part and a DNN part each emit ``num_classes`` scores.
    The three are averaged, concatenated with one extra scalar from the CIN
    (when the CIN is enabled) and passed through a small dense head that ends
    in a ``num_classes``-way softmax.

    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param embedding_size: positive integer,sparse feature embedding_size
    :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of deep net
    :param cin_layer_size: list,list of positive integer or empty list, the feature maps  in each hidden layer of Compressed Interaction Network.
        When empty, the CIN branch is skipped and only the linear and DNN branches are averaged.
    :param cin_split_half: bool.if set to True, half of the feature maps in each hidden will connect to output unit
    :param cin_activation: activation function used on feature maps
    :param l2_reg_linear: float. L2 regularizer strength applied to linear part
    :param l2_reg_embedding: L2 regularizer strength applied to embedding vector
    :param l2_reg_dnn: L2 regularizer strength applied to deep net
    :param l2_reg_cin: L2 regularizer strength applied to CIN.
    :param init_std: float,to use as the initialize std of embedding vector
    :param seed: integer ,to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_activation: Activation function to use in DNN
    :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN
    :param task: str. Accepted for signature compatibility only; it is never read, the output is
        always a ``num_classes``-way softmax.
    :param num_classes: positive integer, number of output classes (width of every branch and of the final softmax).
    :return: A Keras model instance.
    :raises ValueError: if ``num_classes`` is smaller than 1 or the linear part has no feature to work with.
    """
    if num_classes < 1:
        raise ValueError("num_classes must be a positive integer")

    # NOTE(review): build_data_features / data_from_feature_columns /
    # combined_dnn_data look like renamed DeepCTR helpers
    # (build_input_features, ...) -- confirm they exist under these names.
    features = build_data_features(linear_feature_columns + dnn_feature_columns)
    model_inputs = list(features.values())

    sparse_embedding_list, dense_value_list = data_from_feature_columns(
        features, dnn_feature_columns, embedding_size, l2_reg_embedding, init_std, seed)

    # ---- linear part: one independent set of size-1 embeddings per class ----
    linear_emb_list = [
        data_from_feature_columns(features, linear_feature_columns, 1, l2_reg_linear, init_std, seed,
                                  prefix='linear' + str(i))[0]
        for i in range(num_classes)
    ]
    _, dense_data_list = data_from_feature_columns(
        features, linear_feature_columns, 1, l2_reg_linear, init_std, seed, prefix='linear')

    if len(linear_emb_list[0]) > 1:
        linear_term = concat_fun([tf.keras.layers.add(linear_emb) for linear_emb in linear_emb_list])
    elif len(linear_emb_list[0]) == 1:
        linear_term = concat_fun([linear_emb[0] for linear_emb in linear_emb_list])
    else:
        linear_term = None

    if len(dense_data_list) > 0:
        dense_data__ = dense_data_list[0] if len(
            dense_data_list) == 1 else tf.keras.layers.Concatenate()(dense_data_list)
        linear_dense_logit = tf.keras.layers.Dense(
            num_classes, activation='softplus', use_bias=True,
            kernel_regularizer=l2(l2_reg_linear))(dense_data__)

        if linear_term is not None:
            linear_term = tf.keras.layers.add([linear_dense_logit, linear_term])
        else:
            linear_term = linear_dense_logit

    if linear_term is None:
        # Without this check Flatten() would be called on None below.
        raise ValueError("linear_feature_columns yields neither sparse nor dense inputs")

    linear_logit = tf.keras.layers.Flatten()(linear_term)

    fm_data = concat_fun(sparse_embedding_list, axis=1)

    # ---- CIN part (optional) ----
    # Branches are collected in the order [CIN, linear, DNN].
    branch_logits = []
    exFM_logit_reg = None
    if len(cin_layer_size) > 0:
        exFM_out = CIN(cin_layer_size, cin_activation,
                       cin_split_half, l2_reg_cin, seed)(fm_data)
        exFM_logit = tf.keras.layers.Dense(num_classes, activation='softplus', )(exFM_out)
        exFM_logit_reg = tf.keras.layers.Dense(1, activation='relu')(exFM_out)
        branch_logits.append(exFM_logit)
    branch_logits.append(linear_logit)

    # ---- DNN part ----
    dnn_data_1 = combined_dnn_data(sparse_embedding_list, dense_value_list)
    deep_out_1 = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed)(dnn_data_1)
    deep_logit_1 = tf.keras.layers.Dense(
        num_classes, use_bias=False, activation='softmax')(deep_out_1)
    branch_logits.append(deep_logit_1)

    # ---- fusion head ----
    x = tf.keras.layers.average(branch_logits)
    if exFM_logit_reg is not None:
        x = tf.keras.layers.concatenate([x, exFM_logit_reg])
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(256)(x)
    x = tf.keras.layers.PReLU()(x)
    x = tf.keras.layers.BatchNormalization()(x)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    # The functional Model constructor takes `inputs=` / `outputs=`.
    model = tf.keras.models.Model(inputs=model_inputs, outputs=output)
    return model

import os
# Restrict this process to the CUDA device with index 3.
# NOTE(review): tensorflow has already been imported above; presumably this
# still takes effect because no GPU session has been created yet -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

from keras.utils import np_utils

def make_label(x):
    """One-hot encode integer class labels.

    :param x: array-like of integer class ids.
    :return: the result of ``np_utils.to_categorical(x)``; no ``num_classes`` is
        passed, so the width is whatever ``to_categorical`` infers from ``x``.
    """
    one_hot = np_utils.to_categorical(x)
    return one_hot

def make_data(JB, index):
    """Assemble the list of model inputs for the rows selected by ``index``.

    :param JB: DataFrame holding every column named in the module-level
        ``fixlen_feature_names``.
    :param index: positional row selector. It is applied both to ``JB`` (via
        ``iloc``) and to the module-level ``app_list`` / ``app1_list`` arrays, so
        ``JB`` must be row-aligned with those arrays.
    :return: list of arrays -- one per fixed-length feature, followed by the
        activated-app sequences twice and the usage sequences twice (one entry
        per variable-length feature column).
    """
    rows = JB.iloc[index]
    fixlen_data = [rows[name].values for name in fixlen_feature_names]
    # Slice each padded sequence matrix once and feed the same array to both of
    # its feature columns, instead of making two identical copies.
    app_rows = app_list[index]
    usage_rows = app1_list[index]
    return fixlen_data + [app_rows, app_rows, usage_rows, usage_rows]

# Labelled rows form the training set, unlabelled rows the test set; both get
# a fresh 0..n-1 index.
random_seed = 2019
tr_index = choose
X_train = all_data.loc[tr_index].reset_index(drop=True)
y = X_train['age_group'].astype(int)
X_test = all_data.loc[~tr_index].reset_index(drop=True)
print(X_train.shape, X_test.shape)

from sklearn.metrics import f1_score,accuracy_score

# 5-fold stratified cross-validation: train one model per fold, collect
# out-of-fold predictions for the labelled rows and average the fold models'
# predictions for the unlabelled rows.
n_splits = 5
num_classes = 6

# make_data() indexes the padded sequence arrays, which are row-aligned with
# all_data (not with X_train). Translate fold positions in X_train into
# all_data positions via the labelled-row mask rather than assuming the
# labelled rows come first and the test rows sit at a hard-coded range.
train_pos = np.flatnonzero(tr_index.values)
test_pos = np.flatnonzero(~tr_index.values)

cv_score = []
cv_model = []
cv_pred = np.zeros((X_train.shape[0], num_classes))
test_pred = np.zeros((X_test.shape[0], num_classes))

# The test inputs do not depend on the fold, so build them once.
test_data = make_data(all_data, test_pos)

skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    print(index)
    # `task` is not read by the xDeepFM defined in this file.
    model = xDeepFM(linear_feature_columns, dnn_feature_columns,embedding_size=8,task='multicalss') # xDeepFM DeepFM AFM NFM
    # NOTE(review): RAdam is not imported in the visible part of this file --
    # confirm where it comes from.
    model.compile(RAdam(lr=0.01),'categorical_crossentropy',
                 metrics = ['accuracy',],)

    train_x = make_data(all_data, train_pos[train_index])
    train_y = make_label(y.iloc[train_index])
    test_x = make_data(all_data, train_pos[test_index])
    test_y = make_label(y.iloc[test_index])

    history = model.fit(train_x,train_y,batch_size=512,epochs=1,verbose=1,validation_data=(test_x,test_y))
    cv_model.append(model)

    # Predict once per fold and store the results directly, instead of
    # re-running every model in a second pass.
    y_val = model.predict(test_x,batch_size=512)
    cv_score.append(accuracy_score(y.iloc[test_index],np.argmax(y_val,axis=1)))
    print(cv_score)
    print(y_val.shape)
    cv_pred[test_index] = y_val
    test_pred += model.predict(test_data,batch_size=512,verbose=1) / n_splits

# Persist the out-of-fold and test-set class probabilities together with the
# uid of each row. The uids come positionally from X_train / X_test, the frames
# the predictions were shaped from. Assigning train['uid'] / test['uid'] would
# align on index labels, and those frames were sorted without an index reset.
proba_columns = ['proba_{}'.format(i) for i in range(cv_pred.shape[1])]

oof_train = pd.DataFrame(cv_pred, columns=proba_columns)
oof_train['uid'] = X_train['uid'].values

oof_test = pd.DataFrame(test_pred, columns=proba_columns)
oof_test['uid'] = X_test['uid'].values

# Both frames go into the same HDF5 file under different keys.
oof_train.to_hdf("xDeepFM_cv_6449.hdf","train")
oof_test.to_hdf("xDeepFM_cv_6449.hdf","test")
# 5855 5993