# 导入工具包

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import gc
from collections import Counter
import copy
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# 数据读取

In [3]:
train_file = '../input/data_format1/train_format1.csv'
test_file = '../input/data_format1/test_format1.csv'
user_info_file = '../input/data_format1/user_info_format1.csv'
user_log_file = '../input/data_format1/user_log_format1.csv'

train_data = reduce_mem_usage(pd.read_csv(train_file))
test_data = reduce_mem_usage(pd.read_csv(test_file))
user_info = reduce_mem_usage(pd.read_csv(user_info_file))
user_log = reduce_mem_usage(pd.read_csv(user_log_file))

Memory usage of dataframe is 5.97 MB
Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage of dataframe is 5.98 MB
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage of dataframe is 9.71 MB
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage of dataframe is 2933.33 MB
Memory usage after optimization is: 890.48 MB
Decreased by 69.6%


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int32
 1   merchant_id  260864 non-null  int16
 2   label        260864 non-null  int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB


In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      261477 non-null  int32  
 1   merchant_id  261477 non-null  int16  
 2   prob         0 non-null       float64
dtypes: float64(1), int16(1), int32(1)
memory usage: 3.5 MB


In [6]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int32  
 1   age_range  421953 non-null  float16
 2   gender     417734 non-null  float16
dtypes: float16(2), int32(1)
memory usage: 3.2 MB


In [7]:
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int32  
 1   item_id      int32  
 2   cat_id       int16  
 3   seller_id    int16  
 4   brand_id     float16
 5   time_stamp   int16  
 6   action_type  int8   
dtypes: float16(1), int16(3), int32(2), int8(1)
memory usage: 890.5 MB


# 数据处理

In [8]:
all_data = train_data.append(test_data)
all_data = all_data.merge(user_info, on=['user_id'], how='left')

del train_data, test_data, user_info
gc.collect()

0

In [9]:
# 按时间排序
user_log = user_log.sort_values(['user_id', 'time_stamp'])
gc.collect()

20

In [10]:
# 对每个用户逐个合并所有字段
list_join_func = lambda x: ' '.join([str(i) for i in x])

agg_dict = {
    'item_id': list_join_func,
    'cat_id': list_join_func,
    'seller_id': list_join_func,
    'brand_id': list_join_func,
    'time_stamp': list_join_func,
    'action_type': list_join_func
}

rename_dict = {
    'item_id': 'item_path',
    'cat_id': 'cat_path',
    'seller_id': 'seller_path',
    'brand_id': 'brand_path',
    'time_stamp': 'time_stamp_path',
    'action_type': 'action_type_path'
}


def merge_list(df_ID, join_columns, df_data, agg_dict, rename_dict):
    df_data = df_data.groupby(join_columns).agg(agg_dict).reset_index().rename(columns=rename_dict)
    
    df_ID = df_ID.merge(df_data, on=join_columns, how='left')
    return df_ID

all_data = merge_list(all_data, 'user_id', user_log, agg_dict, rename_dict)

del user_log
gc.collect()

0

# 定义特征统计函数
## 定义统计函数

1. 定义统计数据总数的函数

In [11]:
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

2. 定义统计数据唯一值总数的函数

In [12]:
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1

3. 定义统计数据最大值的函数

In [13]:
def max_(x):
    try:
        return np.max([float(i) for i in x.split(' ')])
    except:
        return -1

4. 定义统计数据最小值的函数

In [14]:
def min_(x):
    try:
        return np.min([float(i) for i in x.split(' ')])
    except:
        return -1

5. 定义统计数据标准差的函数

In [15]:
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1

6. 定义统计数据中频次为$topN$数据的元素的函数

In [16]:
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1

7. 定义统计数据中频次为$topN$数据的元素的频次的函数

In [17]:
def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1

## 调用定义的统计函数

In [18]:
def user_cnt(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data


def user_nunique(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data


def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data


def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data


def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data


def user_most_n(df_data, single_col, name, n=1):
    df_data[name] = df_data[single_col].apply(lambda x: most_n(x, n))
    return df_data


def user_most_n_cnt(df_data, single_col, name, n=1):
    df_data[name] = df_data[single_col].apply(lambda x: most_n_cnt(x, n))
    return df_data

# 提取统计特征
## 特征统计

1. 店铺特征统计：统计与店铺特点相关的特征，如店铺、商品、品牌等

In [19]:
"""
基本统计特征
"""
all_data_test = all_data.head(2000)

# 统计用户点击、浏览、加购、购买行为
# 总次数
all_data_test = user_cnt(all_data_test, 'seller_path', 'user_cnt')

# 不同店铺个数
all_data_test = user_nunique(all_data_test, 'seller_path', 'seller_nunique')

# 不同品类个数
all_data_test = user_nunique(all_data_test, 'cat_path', 'cat_nunique')

# 不同品牌个数
all_data_test = user_nunique(all_data_test, 'brand_path', 'brand_nunique')

# 不同商品个数
all_data_test = user_nunique(all_data_test, 'item_path', 'item_nunique')

# 活跃天数
all_data_test = user_nunique(all_data_test, 'time_stamp_path', 'time_stamp_nunique')

# 不同用户行为种数
all_data_test = user_nunique(all_data_test, 'action_type_path', 'action_type_nunique')

# 最晚时间
all_data_test = user_max(all_data_test, 'time_stamp_path', 'time_stamp_max')

# 最早时间
all_data_test = user_min(all_data_test, 'time_stamp_path', 'time_stamp_min')

# 活跃天数方差
all_data_test = user_std(all_data_test, 'time_stamp_path', 'time_stamp_std')

# 最早与最晚相差天数
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']

# 用户最喜欢的店铺
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', n=1)

# 最喜欢的品类
all_data_test = user_most_n(all_data_test, 'cat_path', 'cat_most_1', n=1)

# 最喜欢的品牌
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', n=1)

# 最喜欢的商品
all_data_test = user_most_n(all_data_test, 'item_path', 'item_most_1', n=1)

# 最常见的行为动作
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_most_1', n=1)

# 用户最喜欢的店铺 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', n=1)

# 最喜欢的品类 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', n=1)

# 最喜欢的品牌 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', n=1)

# 最喜欢的商品 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'item_path', 'item_most_1_cnt', n=1)

# 最常见的行为动作 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_most_1_cnt', n=1)

2. 用户特征统计：对用户的点击、加购、购买、收藏等特征进行统计

In [20]:
# 对点击、加购、购买、收藏分开统计
def col_cnt_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']
        
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
        
        path_len = len(data_dict[col])  # 总行数
        
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)
        return len(data_out)
    except:
        return -1

    
def col_nunique_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']
        
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
        
        path_len = len(data_dict[col])  # 总行数
        
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)
        return len(set(data_out))
    except:
        return -1
    

def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data


def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nunique_(x, columns_list, action_type), axis=1)
    return df_data

3. 统计用户和店铺的关系：对店铺的用户点击次数、加购次数、购买次数、收藏次数等进行统计

In [21]:
# 点击次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '0', 'user_cnt_0')

# 加购次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '1', 'user_cnt_1')

# 购买次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '2', 'user_cnt_2')

# 收藏次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '3', 'user_cnt_3')

## 特征组合
1. 特征组合进行业务特征提取

In [22]:
# 对店铺+商品的点击次数
all_data_test = user_col_cnt(all_data_test, ['seller_path', 'item_path'], '0', 'user_item_cnt_0')

# 用户对店铺中多少种不同的商品做了点击
all_data_test = user_col_nunique(all_data_test, ['seller_path', 'item_path'], '0', 'seller_nunique_0')

In [23]:
all_data_test.columns

Index(['user_id', 'merchant_id', 'label', 'prob', 'age_range', 'gender',
       'item_path', 'cat_path', 'seller_path', 'brand_path', 'time_stamp_path',
       'action_type_path', 'user_cnt', 'seller_nunique', 'cat_nunique',
       'brand_nunique', 'item_nunique', 'time_stamp_nunique',
       'action_type_nunique', 'time_stamp_max', 'time_stamp_min',
       'time_stamp_std', 'time_stamp_range', 'seller_most_1', 'cat_most_1',
       'brand_most_1', 'item_most_1', 'action_type_most_1',
       'seller_most_1_cnt', 'cat_most_1_cnt', 'brand_most_1_cnt',
       'item_most_1_cnt', 'action_type_most_1_cnt', 'user_cnt_0', 'user_cnt_1',
       'user_cnt_2', 'user_cnt_3', 'user_item_cnt_0', 'seller_nunique_0'],
      dtype='object')

# 利用Countvector和TF-IDF提取特征

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from scipy import sparse

tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,
                           ngram_range=(1, 1),
                           max_features=100)
columns_list = ['seller_path']
for i, col in enumerate(columns_list):
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))

In [25]:
df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
df_tfidf.head()

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_90,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99
0,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,...,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
1,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,...,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
2,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,...,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
3,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,...,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
all_data_test = pd.concat([all_data_test, df_tfidf], axis=1)

# 嵌入特征

In [27]:
import gensim

def split_(x):
    try:
        return x.split(' ')
    except:
        return -1

model = gensim.models.Word2Vec(all_data_test['seller_path'].apply(lambda x: split_(x)),
                               size=100,
                               window=5,
                               min_count=5,
                               workers=4)

def mean_w2v_(x, model, size=100):
    try:
        i = 0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                if i == 0:
                    vec = np.zeros(size)
                vec += model.wv[word]
        return vec / i
    except:
        return np.zeros(size)
    

def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)

df_embedding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embedding.columns = ['embedding_' + str(i) for i in df_embedding.columns]

In [28]:
all_data_test = pd.concat([all_data_test, df_embedding], axis=1)

# Stacking分类工具
## Stacking特征工具包

In [29]:
from sklearn.model_selection import KFold
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from sklearn.naive_bayes import MultinomialNB, GaussianNB

## 定义Stacking分类特征相关函数

In [30]:
def stacking_clf(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))
    cv_scores = []
    
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        
        if clf_name in ['rf', 'ada', 'gb', 'et', 'lr', 'knn',' gnb']:
            clf.fit(tr_x, tr_y)
            pre = clf.predict_proba(te_x)
            train[test_index] = pre[:, 0].reshape(-1, 1)
            test_pre[i, :] = clf.predict_proba(test_x)[:, 0].reshape(-1, 1)
            
            cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
            
        elif clf_name in ['xgb']:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {
                'booster': 'gbtree',
                'objective': 'multi:softprob',
                'eval_metric': 'mlogloss',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'num_class': 2
            }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
            if test_matrix:
                model = clf.train(params,
                                  train_matrix,
                                  num_boost_round=num_round,
                                  evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds,
                                  verbose_eval=50)
                pre = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                z_pred = model.predict(z, ntree_limit=model.best_ntree_limit)
                print(z_pred[:10])
                print('z_pred.shape', z_pred.shape)
                test_pre[i, :] = z_pred.reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
                
        elif clf_name in ['lgb']:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                # 'boosting_type': 'dart',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'num_class': 2,
                'silent': True
            }
            num_round=10000
            early_stopping_rounds=100
            if test_matrix:
                model = clf.train(params,
                                  train_matrix,
                                  num_round,
                                  valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds,
                                  verbose_eval=50)
                pre = model.predict(te_x, num_iteration=model.best_iteration)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                z_pred = model.predict(test_x, ntree_limit=model.best_iteration)
                print(z_pred[:10])
                print('z_pred.shape', z_pred.shape)
                test_pre[i, :] = z_pred.reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        else:
            raise IOError('Please add clf.')
        print('%s now score is:' % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print('%s_score_list:' % clf_name, cv_scores)
    print('%s_score_mean:' % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)


def rf_clf(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestClassifier(n_estimators=1200,
                                          max_depth=20,
                                          n_jobs=-1,
                                          random_state=2017,
                                          max_features='auto',
                                          verbose=1)
    rf_train, rf_test = stacking_clf(randomforest, x_train, y_train, x_valid, 'rf', kf, label_split=label_split)
    return rf_train, rf_test, 'rf'


def ada_clf(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostClassifier(n_estimators=50,
                                  random_state=2017,
                                  learning_rate=0.01)
    ada_train, ada_test = stacking_clf(adaboost, x_train, y_train, x_valid, 'ada', kf, label_split=label_split)
    return rf_train, rf_test, 'ada'


def gb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingClassifier(n_estimators=100,
                                          learning_rate=0.04,
                                          subsample=0.8,
                                          max_depth=5,
                                          verbose=1,
                                          random_state=2017)
    gbdt_train, gbdt_test = stacking_clf(gbdt, x_train, y_train, x_valid, 'gb', kf, label_split=label_split)
    return gbdt_train, rf_test, 'gb'


def et_clf(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesRegressor(n_estimators=1200,
                                    max_depth=35,
                                    max_features='auto',
                                    n_jobs=-1,
                                    verbose=1,
                                    random_state=2017)
    et_train, et_test = stacking_clf(gbdt, x_train, y_train, x_valid, 'et', kf, label_split=label_split)
    return et_train, et_test, 'et'


def xgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(xgboost, x_train, y_train, x_valid, 'xgb', kf, label_split=label_split)
    return xgb_train, xgb_test, 'xgb'


def lgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_clf(lightgbm, x_train, y_train, x_valid, 'lgb', kf, label_split=label_split)
    return lgb_train, lgb_test, 'lgb'


def gnb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gnb = GaussianNB()
    gnb_train, gnb_test = stacking_clf(gnb, x_train, y_train, x_valid, 'gnb', kf, label_split=label_split)
    return gnb_train, gnb_test, 'gnb'


def lr_clf(x_train, y_train, x_valid, kf, label_split=None):
    lr = LogisticRegression(C=0.1,
                            max_iter=200,
                            n_jobs=-1,
                            random_state=2017)
    lr_train, lr_test = stacking_clf(lr, x_train, y_train, x_valid, 'lr', kf, label_split=label_split)
    return lr_train, lr_test, 'lr'


def knn_clf(x_train, y_train, x_valid, kf, label_split=None):
    knn = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking_clf(knn, x_train, y_train, x_valid, 'knn', kf, label_split=label_split)
    return knn_train, knn_test, 'knn'

## 读取训练数据和验证数据

In [31]:
features_columns = [c for c in all_data_test.columns if c not in ['label', 'prob', 'seller_path', 'cat_path',
                                                                  'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']]
x_train = all_data_test[all_data_test['label'].notnull()][features_columns].values
y_train = all_data_test[all_data_test['label'].notnull()]['label'].values
x_valid = all_data_test[all_data_test['label'].isna()][features_columns].values

In [32]:
all_data_test['label'].unique()

array([0., 1.])

In [33]:
x_train[:10]

array([[34176, 3906, 6.0, ..., 0.0, 0.0, 0.0],
       [34176, 121, 6.0, ..., 0.0, 0.0, 0.0],
       [34176, 4356, 6.0, ..., 0.0, 0.0, 0.0],
       ...,
       [231552, 3828, 5.0, ..., 0.0, 0.0, 0.0],
       [231552, 2124, 5.0, ..., 0.0, 0.0, 0.0],
       [232320, 1168, 4.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [34]:
x_valid[:10]

array([], shape=(0, 231), dtype=object)

In [35]:
def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isinf(data)
    data[where_are_nan] = 0
    data[where_are_inf] = 0
    return data


x_train = np.float_(get_matrix(np.float_(x_train)))
y_train = np.int_(y_train)
x_valid = np.float_(get_matrix(np.float_(x_valid)))

## 使用lgb和xgb构造Stacking特征

In [36]:
folds=5
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [37]:
clf_list = [lgb_clf, xgb_clf]
clf_list_col = ['lgb_clf', 'xgb_clf']

In [38]:
column_list = []
train_data_list = []
test_data_list = []
for clf in clf_list:
    train_data, test_data, clf_name = clf(x_train, y_train, x_valid, kf, label_split=None)
    train_data_list.append(train_data)
    test_data_list.append(test_data)

train_stacking = np.concatenate(train_data_list, axis=1)
test_stacking = np.concatenate(test_data_list, axis=1)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.241901
[100]	valid_0's multi_logloss: 0.247853
Early stopping, best iteration is:
[31]	valid_0's multi_logloss: 0.239734
[]
z_pred.shape (0,)
lgb now score is: [2.6559997282941747]
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.297223
[100]	valid_0's multi_logloss: 0.317045
Early stopping, best iteration is:
[1]	valid_0's multi_logloss: 0.281976
[]
z_pred.shape (0,)
lgb now score is: [2.6559997282941747, 2.586516769469715]
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.265978
[100]	valid_0's multi_logloss: 0.277071
Early stopping, best iteration is:
[1]	valid_0's multi_logloss: 0.25415
[]
z_pred.shape (0,)
lgb now score is: [2.6559997282941747, 2.586516769469715, 2.5786889164949485]
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.204049
[100]	valid_0's mul

## 原始特征和Stacking特征合并

In [39]:
train = np.concatenate([x_train, train_stacking], axis=1)
test = np.concatenate([x_valid, test_stacking], axis=1)

df_train_all = pd.DataFrame(train)
df_test_all = pd.DataFrame(test)
df_train_all.columns = features_columns + clf_list_col
df_test_all.columns = features_columns + clf_list_col

In [40]:
df_train_all['user_id'] = all_data_test[all_data_test['label'].notnull()]['user_id']
df_test_all['user_id'] = all_data_test[all_data_test['label'].isnull()]['user_id']
df_train_all['label'] = all_data_test[all_data_test['label'].notnull()]['label']

## 保存特征

In [41]:
df_train_all.to_csv('../input/train_all.csv', index=False)
df_test_all.to_csv('../input/test_all.csv', index=False)