In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import lightgbm as lgb
import gc
import re
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, f1_score
from time import ctime
import os, sys
import json
import jieba

# Data paths
import path_file

In [None]:
def loads(item):
    """Parse a JSON-encoded string into a dict, falling back to an empty dict.

    Args:
        item: The raw value to decode, normally a JSON object string.

    Returns:
        The decoded dict. An empty dict is returned when ``item`` is not valid
        JSON, is of a type ``json.loads`` rejects, or decodes to something
        other than a JSON object (number, list, string, null, ...), so callers
        can always treat the result as a mapping.
    """
    try:
        parsed = json.loads(item)
    except (json.JSONDecodeError, TypeError):
        return {}
    # Valid JSON that is not an object would break callers that use .keys().
    return parsed if isinstance(parsed, dict) else {}

def min_edit(str1, str2):
    """Return the case-insensitive minimum edit (Levenshtein) distance.

    Insertions, deletions and substitutions each cost 1. Both strings are
    lower-cased before comparison.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        int: The minimum number of single-character edits turning ``str1``
        into ``str2``.
    """
    str1 = str1.lower()
    str2 = str2.lower()
    # Only the previous DP row is needed, so keep two rows instead of the
    # full (len(str1)+1) x (len(str2)+1) matrix.
    previous = list(range(len(str2) + 1))
    for i, ch1 in enumerate(str1, start=1):
        current = [i]
        for j, ch2 in enumerate(str2, start=1):
            substitution = previous[j - 1] + (0 if ch1 == ch2 else 1)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]

def get_prefix_loc_in_title(prefix,title):
    """Classify where the query prefix occurs inside the title (case-insensitive).

    Returns:
        0 if the title starts with the prefix, 1 if it ends with it,
        2 if the prefix occurs somewhere else in the title, -1 if it does
        not occur at all.
    """
    needle = prefix.lower()
    haystack = title.lower()
    if haystack.startswith(needle):
        return 0
    if haystack.endswith(needle):
        return 1
    return 2 if needle in haystack else -1

def get_rp_prefix_in_title(prefix, title, mode='char'):
    """Token-overlap scores between prefix and title at char or word level.

    Both strings are lower-cased, then split into characters (``mode='char'``)
    or into jieba tokens (any other ``mode``).

    Returns:
        ``[recall, precision, acc]`` where, with ``common`` the number of
        distinct tokens shared by both sides:
          * recall    = common / (number of prefix tokens + 0.01)
          * precision = common / (number of title tokens + 0.01)
          * acc       = common / (title tokens + prefix tokens - common),
            or 0.0 when both sides have no tokens.
    """
    prefix = prefix.lower()
    title = title.lower()
    if mode == 'char':
        prefix_tokens = list(prefix)
        title_tokens = list(title)
    else:
        prefix_tokens = list(jieba.cut(prefix))
        title_tokens = list(jieba.cut(title))
    len_title = len(title_tokens)
    len_prefix = len(prefix_tokens)
    len_comm_xx = len(set(prefix_tokens) & set(title_tokens))

    # The +0.01 keeps these two ratios finite when a side has no tokens.
    recall = len_comm_xx / (len_prefix + 0.01)
    precision = len_comm_xx / (len_title + 0.01)
    # This denominator is only zero when both sides are empty; guard it
    # instead of raising ZeroDivisionError.
    union = len_title + len_prefix - len_comm_xx
    acc = len_comm_xx / union if union else 0.0
    return [recall, precision, acc]

def get_ngram_rp_prefix_in_title(prefix, title, mode='char'):
    """Token-overlap scores between prefix and title over 1-grams plus 2-grams.

    Both strings are lower-cased, then split into characters (``mode='char'``)
    or into jieba tokens (any other ``mode``). Each token list is extended with
    the concatenation of every pair of adjacent tokens before scoring.

    Returns:
        ``[recall, precision, acc]`` where, with ``common`` the number of
        distinct n-grams shared by both sides:
          * recall    = common / (number of prefix n-grams + 0.01)
          * precision = common / (number of title n-grams + 0.01)
          * acc       = common / (title n-grams + prefix n-grams - common),
            or 0.0 when both sides have no n-grams.
    """
    prefix = prefix.lower()
    title = title.lower()
    if mode == 'char':
        prefix_tokens = list(prefix)
        title_tokens = list(title)
    else:
        prefix_tokens = list(jieba.cut(prefix))
        title_tokens = list(jieba.cut(title))

    # Append the 2-grams (adjacent token pairs) after the 1-grams.
    prefix_grams = prefix_tokens + [a + b for a, b in zip(prefix_tokens, prefix_tokens[1:])]
    title_grams = title_tokens + [a + b for a, b in zip(title_tokens, title_tokens[1:])]

    len_title = len(title_grams)
    len_prefix = len(prefix_grams)
    len_comm_xx = len(set(prefix_grams) & set(title_grams))

    # The +0.01 keeps these two ratios finite when a side has no n-grams.
    recall = len_comm_xx / (len_prefix + 0.01)
    precision = len_comm_xx / (len_title + 0.01)
    # This denominator is only zero when both sides are empty; guard it
    # instead of raising ZeroDivisionError.
    union = len_title + len_prefix - len_comm_xx
    acc = len_comm_xx / union if union else 0.0
    return [recall, precision, acc]

In [None]:
# Dataset, feature-file and model locations are read from the project-local
# path_file module.
train_txt = path_file.train_txt
test_txt = path_file.test_txt
print(test_txt)
val_txt = path_file.val_txt
query_prediction_feature_txt = path_file.query_prediction_feature_txt
title_prediction_jaccard_distance_txt = path_file.title_prediction_jaccard_distance_new_txt
title_prediction_distance_txt = path_file.title_prediction_distance_new_txt
title_tag_word2vec_distance_txt = path_file.title_tag_word2vec_distance_new_txt
model_path = path_file.model_path
nlp_feat_txt = path_file.nlp_feat_txt
prefix_title_tfidf_txt = path_file.prefix_title_tfidf_txt

# Feature files for the test set are hard-coded relative paths instead of
# path_file attributes.
test_query_prediction_feature_txt = '../DataSets/feat_data/test_B/test_query_prediction_feature4.txt'
test_title_prediction_jaccard_distance_txt = '../DataSets/feat_data/test_B/test_title_prediction_jaccard_distance_new.txt'
test_title_prediction_distance_txt = '../DataSets/feat_data/test_B/test_word_title_prediction_distance_new.txt'
test_title_tag_word2vec_distance_txt = '../DataSets/feat_data/test_B/test_word_title_tag_word2vec_distance_new_new.txt'
test_nlp_feat_txt = '../DataSets/feat_data/test_B/nlp_feat_test_B.txt'
test_prefix_title_tfidf_txt = '../DataSets/feat_data/test_B/prefix_title_tfidf_feat_all_new.txt'

In [None]:
# The raw files are tab-separated without a header row. quoting=3
# (csv.QUOTE_NONE) keeps quote characters as literal text, and every column is
# converted to str (so missing cells become the string 'nan').
train_data = pd.read_table(train_txt,
                           names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, encoding='utf-8',
                           quoting=3).astype(str)
val_data = pd.read_table(val_txt,
                         names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, encoding='utf-8',
                         quoting=3).astype(str)
test_data = pd.read_table(test_txt,
                          names=['prefix', 'query_prediction', 'title', 'tag'], header=None, encoding='utf-8',
                          quoting=3).astype(str)
# Denoise: keep only training rows labelled '0' or '1' (the original note says
# a single row is affected).
train_data = train_data[train_data['label'].isin(['0', '1'])]

# The test set has no labels; give every test row the placeholder -1.
test_data['label'] = -1

len_train = train_data.shape[0]
len_val = val_data.shape[0]
len_test = test_data.shape[0]
print("len_train", len_train)
print("len_val", len_val)
print("len_test", len_test)

# Stack train, val and test (in that order) so features are built in one pass.
data = pd.concat([train_data, val_data, test_data], ignore_index=True)
data['label'] = data['label'].apply(lambda x: int(x))
# Rows without a query_prediction hold the string 'nan'; replace it with an
# empty JSON object. Assign the result back: calling replace(inplace=True) on
# a column selection does not update the frame under pandas copy-on-write.
data['query_prediction'] = data['query_prediction'].replace('nan', '{}')
data['prefix'] = data['prefix'].apply(lambda x: x.lower())
data['title'] = data['title'].apply(lambda x: x.lower())
print('data shape : ', data.shape)

# The training rows are the first len_train rows. Slicing from the front
# (instead of [:-(len_val + len_test)]) stays correct when the holdout is empty.
train_data = data[:len_train]
print(train_data.shape)
# Distinct prefixes / titles seen in training, in sorted order. unique()
# replaces the dict-renaming groupby().agg({'cnt': 'count'}) form, which
# raises SpecificationError on pandas >= 1.0.
prefixs = sorted(train_data['prefix'].unique())
print('prefixs : ', len(prefixs))
titles = sorted(train_data['title'].unique())
print('titles : ', len(titles))

In [None]:
# Sanity check: show the dtype of the label column.
print(data['label'].dtype)

In [None]:
# Six specific prefix values are rewritten to another string. The number of
# rows holding each original value is printed before and after the rewrite so
# the effect is visible (the second round of counts should show zero rows).
_prefix_rewrites = [
    ('抖', '抖音'),
    ('淘', '淘宝'),
    ('酷狗', '酷狗音乐'),
    ('西瓜', '西瓜小'),
    ('万能', 'wifi万能钥匙'),
    ('瓜子二手车', '瓜子'),
]

for _old, _new in _prefix_rewrites:
    print(data[data['prefix'] == _old].shape)

# One pass per rule, applied in the listed order.
for _old, _new in _prefix_rewrites:
    data['prefix'] = data['prefix'].apply(lambda x, old=_old, new=_new: new if x == old else x)

for _old, _new in _prefix_rewrites:
    print(data[data['prefix'] == _old].shape)

In [None]:
# Sanity check: the label dtype should be unaffected by the prefix rewrite.
print(data['label'].dtype)

In [None]:
# data is ordered train | val | test, so the two holdout sets are sliced from
# the tail by row count.
# NOTE(review): negative-stop slices misbehave if len_test is 0 — confirm both
# holdout sets are always non-empty.
val_data = data[-(len_val + len_test):-len_test]
test_data = data[-len_test:]
print(val_data.shape)
print(test_data.shape)

# How many val / test rows have a prefix that also appears in `prefixs`
# (the prefix list collected from the training rows).
print(val_data[val_data['prefix'].isin(prefixs)].shape)
print(test_data[test_data['prefix'].isin(prefixs)].shape)

In [None]:
#####################################################
# basic feat
#####################################################
# Column creation order matters: columns are later turned into a positional
# feature matrix, so new columns are appended in a fixed order.

###title feat##########
print ("gen_basic_feat")
# Character counts of the title: total, digits, ASCII letters, and characters
# in the \u4e00-\u9fff range.
data['title_len'] = data['title'].apply(lambda x: len(str(x)))
data['title_digit_len'] = data['title'].apply(lambda x: sum(c.isdigit() for c in x))
data['title_alpha_len'] = data['title'].apply(lambda x: len(re.findall('[a-zA-Z]', x)))
data['title_zh_len'] = data['title'].apply(lambda x: len(re.findall(u'([\u4e00-\u9fff])', x)))

# Share of digits / letters in the title, and the count of the remaining characters.
data['title_digit_rate'] = data['title_digit_len'] * 1.0 / data['title_len'] * 1.0
data['title_digit_sub'] = data['title_len'] - data['title_digit_len']
data['title_alpha_rate'] = data['title_alpha_len'] * 1.0 / data['title_len'] * 1.0
data['title_alpha_sub'] = data['title_len'] - data['title_alpha_len']

# query_prediction
# Flag empty predictions while the column is still a string, then decode the
# JSON with loads() and count the decoded keys.
data['prediction_is_null'] = data['query_prediction'].apply(lambda x: 1 if x == '{}' else 0)
data["query_prediction"] = data["query_prediction"].apply(loads)
data['prediction_num'] = data['query_prediction'].map(lambda x: len(x.keys()))

####prefix feat#########
# Same character counts and ratios as for the title, computed on the prefix.
data['prefix_len'] = data['prefix'].apply(lambda x: len(str(x)))
data['prefix_digit_len'] = data['prefix'].apply(lambda x: sum(c.isdigit() for c in x))
data['prefix_alpha_len'] = data['prefix'].apply(lambda x: len(re.findall('[a-zA-Z]', x)))
data['prefix_zh_len'] = data['prefix'].apply(lambda x: len(re.findall(u'([\u4e00-\u9fff])', x)))

data['prefix_digit_rate'] = data['prefix_digit_len'] * 1.0 / data['prefix_len'] * 1.0
data['prefix_digit_sub'] = data['prefix_len'] - data['prefix_digit_len']
data['prefix_alpha_rate'] = data['prefix_alpha_len'] * 1.0 / data['prefix_len'] * 1.0
data['prefix_alpha_sub'] = data['prefix_len'] - data['prefix_alpha_len']

######title_prefix feat##########
# Earlier variant (kept for reference): train/val rows from prefix_title_tfidf_txt
# stacked with the rows of the test file.
# prefix_title_tfidf_feat = pd.read_csv(prefix_title_tfidf_txt, sep='\t')[:-len_test]
# # prefix_title_tfidf_feat = prefix_title_tfidf_feat
# prefix_title_tfidf_feat_test = pd.read_csv(test_prefix_title_tfidf_txt, sep='\t')
# print('prefix_title_tfidf_feat_test:' , prefix_title_tfidf_feat_test.shape)
# prefix_title_tfidf_feat = pd.concat([prefix_title_tfidf_feat,prefix_title_tfidf_feat_test], ignore_index=True)
# print('prefix_title_tfidf_feat:' , prefix_title_tfidf_feat.shape)
# Current variant: a single tab-separated file is read and appended column-wise.
# NOTE(review): pd.concat(axis=1) aligns on the index, so this presumably
# expects one row per row of `data` (train + val + test) — verify the shapes printed below.
prefix_title_tfidf_feat = pd.read_csv(test_prefix_title_tfidf_txt, sep='\t')
print('prefix_title_tfidf_feat:' , prefix_title_tfidf_feat.shape)
data = pd.concat([data, prefix_title_tfidf_feat], axis=1)
# Drop the temporary frame and force a collection to release its memory.
del prefix_title_tfidf_feat
# del prefix_title_tfidf_feat_test
gc.collect()
print('data shape : ', data.shape)

# Pairwise prefix/title features. The commented-out lines are the
# case-sensitive variants of the statement above them.
data['prefix_is_title'] = data.apply(lambda x: 1 if x['prefix'].lower() == x['title'].lower() else 0, axis=1)
# data['prefix_is_title'] = data.apply(lambda x: 1 if x['prefix'] == x['title'] else 0, axis=1)
# Number of distinct characters shared by prefix and title.
data['common_words_cnt'] = data.apply(
    lambda x: len(set(list(str(x['prefix']).lower())).intersection(set(list(str(x['title']).lower())))), axis=1)
# data['common_words_cnt'] = data.apply(
#     lambda x: len(set(list(str(x['prefix']))).intersection(set(list(str(x['title']))))), axis=1)
data['common_words_cnt_rate'] = data['common_words_cnt'] / data['title_len']
data['prefix_len_rate'] = data['prefix_len'] / data['title_len']
data['prefix_len_sub'] = data['prefix_len'] - data['title_len']
data['prefix_is_titles_prefix'] = data.apply(lambda x: 1 if str(x['title']).lower().startswith(str(x['prefix']).lower()) else 0, axis=1)
# data['prefix_is_titles_prefix'] = data.apply(lambda x: 1 if str(x['title']).startswith(str(x['prefix'])) else 0, axis=1)
# Edit distance and position code (-1/0/1/2) of the prefix within the title.
data['min_edit'] = data.apply(lambda x: min_edit(x['prefix'],x['title']), axis=1)
data['prefix_loc'] = data.apply(lambda x: get_prefix_loc_in_title(x['prefix'],x['title']), axis=1)

# Each helper call returns [recall, precision, acc] per row; the per-row lists
# are stacked into an (n_rows, 3) array and split into three columns.
# Character-level overlap:
char_level_prefix = data.apply(lambda x: get_rp_prefix_in_title(x['prefix'],x['title'],mode='char'), axis=1)
char_level_prefix = [kk for kk in char_level_prefix]
char_level_prefix = np.array(char_level_prefix)
data['prefix_t_recall_char'] = char_level_prefix[:,0].tolist()
data['prefix_t_precision_char'] = char_level_prefix[:,1].tolist()
data['prefix_t_acc_char'] = char_level_prefix[:,2].tolist()

# Word-level overlap (jieba tokens):
word_level_prefix = data.apply(lambda x: get_rp_prefix_in_title(x['prefix'], x['title'], mode='word'), axis=1)
word_level_prefix = [kk for kk in word_level_prefix]
word_level_prefix = np.array(word_level_prefix)
data['prefix_t_recall_word'] = word_level_prefix[:, 0].tolist()
data['prefix_t_precision_word'] = word_level_prefix[:, 1].tolist()
data['prefix_t_acc_word'] = word_level_prefix[:, 2].tolist()

# Character-level overlap over 1-grams plus 2-grams:
char_ngram_level_prefix = data.apply(lambda x: get_ngram_rp_prefix_in_title(x['prefix'], x['title'], mode='char'), axis=1)
char_ngram_level_prefix = [kk for kk in char_ngram_level_prefix]
char_ngram_level_prefix = np.array(char_ngram_level_prefix)
data['prefix_t_recall_char_ngram'] = char_ngram_level_prefix[:, 0].tolist()
data['prefix_t_precision_char_ngram'] = char_ngram_level_prefix[:, 1].tolist()
data['prefix_t_acc_char_ngram'] = char_ngram_level_prefix[:, 2].tolist()

# Word-level overlap over 1-grams plus 2-grams:
word_ngram_level_prefix = data.apply(lambda x: get_ngram_rp_prefix_in_title(x['prefix'], x['title'], mode='word'), axis=1)
word_ngram_level_prefix = [kk for kk in word_ngram_level_prefix]
word_ngram_level_prefix = np.array(word_ngram_level_prefix)
data['prefix_t_recall_word_ngram'] = word_ngram_level_prefix[:, 0].tolist()
data['prefix_t_precision_word_ngram'] = word_ngram_level_prefix[:, 1].tolist()
data['prefix_t_acc_word_ngram'] = word_ngram_level_prefix[:, 2].tolist()

######query predict feat#######

def _append_feature_file(frame, name, train_path, test_path, n_test):
    """Append a precomputed feature table to ``frame`` as new columns.

    Reads the tab-separated file at ``train_path``, discards its last
    ``n_test`` rows, stacks the rows of ``test_path`` underneath, and
    concatenates the result onto ``frame`` column-wise (aligned on the index).
    The shape of every intermediate table is printed, labelled with ``name``.

    Args:
        frame: DataFrame to extend.
        name: Label used in the progress output.
        train_path: Tab-separated feature file whose trailing ``n_test`` rows are dropped.
        test_path: Tab-separated feature file supplying the replacement tail rows.
        n_test: Number of trailing rows to drop from the first file.

    Returns:
        A new DataFrame: ``frame`` with the feature columns appended.
    """
    feat = pd.read_csv(train_path, sep='\t')
    # An explicit stop index avoids the `[:-0]` pitfall, which would return an
    # empty frame when n_test == 0.
    feat = feat.iloc[:max(len(feat) - n_test, 0)]
    print(name + ':', feat.shape)
    feat_test = pd.read_csv(test_path, sep='\t')
    print(name + '_test:', feat_test.shape)
    feat = pd.concat([feat, feat_test], ignore_index=True)
    print(name + ':', feat.shape)

    frame = pd.concat([frame, feat], axis=1)
    # Release the temporary tables before the next (large) file is read.
    del feat
    del feat_test
    gc.collect()
    print('data shape : ', frame.shape)
    return frame


## Two other scripts (gen_query_prediction_feat.py and gen_query_prediction_feat2.py)
## have to be run first; they generate the query-prediction feature files.
data = _append_feature_file(data, 'query_prediction_feature',
                            query_prediction_feature_txt, test_query_prediction_feature_txt, len_test)

#######word2vec distance#######
#### NLP features; little work has gone into these so far, everything above is a statistical feature.
#### They are distances between the word embeddings of title, prefix and predict_query.
#### Tokenisation uses thulac; the word vectors are the Chinese embedding open-sourced by Tencent,
#### see https://ai.tencent.com/ailab/nlp/embedding.html
#### The gain over this baseline is small (under 1 point), so they can be ignored for now;
#### the feature files below must be generated before running this cell.
for _feat_name, _feat_train_path, _feat_test_path in [
    ('nlp_feat', nlp_feat_txt, test_nlp_feat_txt),
    ('title_prediction_jaccard_distance',
     title_prediction_jaccard_distance_txt, test_title_prediction_jaccard_distance_txt),
    ('title_prediction_distance',
     title_prediction_distance_txt, test_title_prediction_distance_txt),
    ('title_tag_word2vec_distance',
     title_tag_word2vec_distance_txt, test_title_tag_word2vec_distance_txt),
]:
    data = _append_feature_file(data, _feat_name, _feat_train_path, _feat_test_path, len_test)

# print ("title_prediction_distance bigram")
# title_prediction_distance = pd.read_csv(title_prediction_bigram_distance_txt, sep='\t')
# data = pd.concat([data, title_prediction_distance[:-len_test]], axis=1)
# del title_prediction_distance
# gc.collect()

# print ("title_tag_word2vec_distance bigram")
# title_tag_word2vec_distance = pd.read_csv(title_tag_word2vec_distance_bigram_txt, sep='\t')
# data = pd.concat([data, title_tag_word2vec_distance[:-len_test]], axis=1)
# del title_tag_word2vec_distance
# gc.collect()

#######CTR feat##########

##### Author's note: CTR features seem to overfit easily, especially with little data,
##### so they were not used in later runs.
# NOTE(review): the statistics are still computed further down this cell —
# confirm which *_click / *_count / *_ctr columns actually reach the model.

# Shuffle the training rows and split them into num_folds parts
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

# Drop the earlier train/val/test slices; they are re-sliced from the
# feature-enriched `data` just below.
del train_data
del val_data
del test_data
gc.collect()

# data is ordered train | val | test: the head is the training part, the tail
# (val + test together) receives statistics computed on the training part.
train_data = data[:-(len_val + len_test)]
val_data_ctr = data[-(len_val + len_test):]
del data
gc.collect()

train_data = shuffle(train_data, random_state=103)

num_folds = 5
# Rows per fold; the +1 rounds up so num_folds slices cover every row.
step = int(len_train / num_folds) + 1
data_slices = []

from sklearn.preprocessing import MinMaxScaler

# Key columns for the click statistics: `items` are the categorical keys,
# `len_items` the numeric keys.
items = ['prefix', 'title', 'tag']
len_items = ['prefix_len', 'prediction_num']
# def get_ctr_feat(feat_data,tmp_data):
#     for item in items:
#         temp = feat_data.groupby(item, as_index=False)['label'].agg({item + '_click': 'sum', item + '_count': 'count'})
#         temp[item + '_ctr'] = temp[item + '_click'] / (temp[item + '_count'])
#         tmp_data = pd.merge(tmp_data, temp, on=item, how='left')
#         tmp_data = tmp_data.fillna(0)
#         mm = MinMaxScaler()
#         tmp_data[[item + '_click',item + '_count']] = mm.fit_transform(tmp_data[[item + '_click',item + '_count']])
#
#     print('2 cross')
#     for i in range(len(items)):
#         for j in range(i + 1, len(items)):
#             item_g = [items[i], items[j]]
#             temp = feat_data.groupby(item_g, as_index=False)['label'].agg(
#                 {'_'.join(item_g) + '_click': 'sum', '_'.join(item_g) + '_count': 'count'})
#             temp['_'.join(item_g) + '_ctr'] = temp['_'.join(item_g) + '_click'] / (
#                 temp['_'.join(item_g) + '_count'] + 3)
#             tmp_data = pd.merge(tmp_data, temp, on=item, how='left')
#
#             tmp_data = tmp_data.fillna(0)
#             mm = MinMaxScaler()
#             tmp_data[['_'.join(item_g) + '_click', '_'.join(item_g) + '_count']] = mm.fit_transform(
#                 tmp_data[['_'.join(item_g) + '_click', '_'.join(item_g) + '_count']])
#     return tmp_data
#
# folds = KFold(len_train_val,n_folds = num_folds,shuffle = True,random_state=42)
# for curr_fold, (idx_train,idx_val) in enumerate(folds):
#     print ("curr fold : ",curr_fold)
#     feat_data = train_val_data.loc[idx_train].copy()
#     tmp_data = train_val_data.loc[idx_val].copy()
#     data_slices.append(get_ctr_feat(feat_data,tmp_data))

def _add_ctr_features(target, source, keys, smoothing=0, fold_scale=None):
    """Left-join click statistics of ``source`` grouped by ``keys`` onto ``target``.

    With ``name = '_'.join(keys)`` three columns are added, in this order:
      * ``name_click`` - sum of ``label`` per group
      * ``name_count`` - number of rows per group
      * ``name_ctr``   - click / (count + smoothing)
    Missing values in the merged frame are filled with 0 (in every column),
    then the click and count columns are min-max scaled within ``target``.

    Args:
        target: Frame receiving the new columns.
        source: Frame the statistics are computed from; needs ``keys`` and ``label``.
        keys: List of column names to group by and merge on.
        smoothing: Constant added to the count in the ctr denominator.
        fold_scale: When given, click and count are multiplied by
            (fold_scale - 1) / fold_scale after the ctr is computed.

    Returns:
        A new DataFrame: ``target`` plus the three statistic columns.
    """
    name = '_'.join(keys)
    click_col, count_col, ctr_col = name + '_click', name + '_count', name + '_ctr'
    # agg with a list + rename works on old and new pandas alike; the
    # dict-renaming form agg({'new_name': 'sum'}) raises SpecificationError on pandas >= 1.0.
    stats = (source.groupby(keys)['label'].agg(['sum', 'count'])
             .rename(columns={'sum': click_col, 'count': count_col})
             .reset_index())
    stats[ctr_col] = stats[click_col] / (stats[count_col] + smoothing)
    if fold_scale is not None:
        stats[click_col] = stats[click_col] / fold_scale * (fold_scale - 1)
        stats[count_col] = stats[count_col] / fold_scale * (fold_scale - 1)
    target = pd.merge(target, stats, on=keys, how='left')
    target = target.fillna(0)
    target[[click_col, count_col]] = MinMaxScaler().fit_transform(target[[click_col, count_col]])
    return target


def _add_all_ctr_features(target, source, numeric_keys, category_keys, fold_scale=None, verbose=False):
    """Add every click-statistic group to ``target`` in a fixed column order.

    Order: for each numeric key the key alone (no smoothing) then key x 'tag'
    (smoothing 3); each category key alone (no smoothing); every pair of
    category keys (smoothing 3); finally prefix x title x tag (smoothing 3).

    Args:
        target: Frame receiving the new columns.
        source: Frame the statistics are computed from.
        numeric_keys: Numeric key columns (also crossed with 'tag').
        category_keys: Categorical key columns (also crossed pairwise).
        fold_scale: Forwarded to ``_add_ctr_features``.
        verbose: When True, also print the key(s) being processed.

    Returns:
        A new DataFrame with all statistic columns appended.
    """
    for key in numeric_keys:
        if verbose:
            print(key)
        target = _add_ctr_features(target, source, [key], fold_scale=fold_scale)
        target = _add_ctr_features(target, source, [key, 'tag'], smoothing=3, fold_scale=fold_scale)

    for key in category_keys:
        if verbose:
            print(key)
        target = _add_ctr_features(target, source, [key], fold_scale=fold_scale)

    print('2 cross')
    for first in range(len(category_keys)):
        for second in range(first + 1, len(category_keys)):
            if verbose:
                print(category_keys[first], ' ', category_keys[second])
            target = _add_ctr_features(target, source, [category_keys[first], category_keys[second]],
                                       smoothing=3, fold_scale=fold_scale)

    print('3 cross')
    target = _add_ctr_features(target, source, ['prefix', 'title', 'tag'], smoothing=3, fold_scale=fold_scale)
    return target


# Training rows: out-of-fold statistics. Each slice of `step` rows gets
# statistics computed only from the remaining training rows.
for fold_start in range(0, len_train, step):
    ith_data = train_data[fold_start:fold_start + step]
    print (ith_data.shape[0])
    rest_data = pd.concat([train_data[:fold_start], train_data[fold_start + step:]])
    data_slices.append(_add_all_ctr_features(ith_data, rest_data, len_items, items))

train_data_ctr = pd.concat(data_slices, ignore_index=True)
del data_slices
gc.collect()
del ith_data
del rest_data
gc.collect()

# Val + test rows: statistics from all training rows, with click/count scaled
# by (num_folds - 1) / num_folds — presumably so their magnitude matches the
# out-of-fold statistics above, which see only num_folds - 1 parts of the data.
val_data_ctr = _add_all_ctr_features(val_data_ctr, train_data, len_items, items,
                                     fold_scale=num_folds, verbose=True)

data = pd.concat([train_data_ctr, val_data_ctr])
del train_data
del train_data_ctr
del val_data_ctr
gc.collect()

def encode_count(df, column_name, new_column_name=''):
    """Label-encode a column of ``df`` in place and return ``df``.

    A ``LabelEncoder`` is fitted on the values of ``column_name``; the encoded
    values are written to ``new_column_name``, or back over ``column_name``
    when ``new_column_name`` is the empty string.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(list(df[column_name].values))
    destination = column_name if new_column_name == '' else new_column_name
    df[destination] = encoder.transform(list(df[column_name].values))
    return df


# Add an integer-encoded `<col>_id` column for each text key column.
encoder_col = ['prefix', 'title', 'tag']
for col in encoder_col:
    print(col)
    data = encode_count(data, col, col + '_id')

# Columns excluded from the first feature set. Commented-out names are
# columns deliberately kept as features. Names missing from `data` are
# filtered out further down, so listing an absent column is harmless; several
# jaccard-distance names are listed more than once.
drop_feat = [
    'query_prediction_js',
    'prefix',
    'query_prediction',
    'title',
    'tag',
    'label',
    'prediction_num', # already present in query_prediction_feature
    'prefix_len_click',
    # 'prefix_len_count',
    # 'prefix_len_ctr',
    'prediction_num_click',
    # 'prediction_num_count',
    # 'prediction_num_ctr',
    'prefix_len_tag_click',
    # 'prefix_len_tag_count',
    # 'prefix_len_tag_ctr',
    'prediction_num_tag_click',
    # 'prediction_num_tag_count',
    # 'prediction_num_tag_ctr',
    'prefix_click',
    #  'prefix_count',
    #  'prefix_ctr',
    'title_click',
    #  'title_count',
    #  'title_ctr',
    'tag_click',
    #  'tag_count',
    #  'tag_ctr',
    'query_prediction_click',
#     'query_prediction_count',
#     'query_prediction_ctr',
    'prefix_title_click',
    #  'prefix_title_count',
    #  'prefix_title_ctr',
    'prefix_tag_click',
    #  'prefix_tag_count',
    #  'prefix_tag_ctr',
    'prefix_query_prediction_click',
    'prefix_query_prediction_count',
    'prefix_query_prediction_ctr',
    'title_tag_click',
    #  'title_tag_count',
    #  'title_tag_ctr',
    'title_query_prediction_click',
#      'title_query_prediction_count',
#     'title_query_prediction_ctr',
    'tag_query_prediction_click',
#      'tag_query_prediction_count',
#     'tag_query_prediction_ctr',
    'prefix_title_tag_click',
    #  'prefix_title_tag_count',
    #  'prefix_title_tag_ctr',
    # 'prefix_len_mul_pre_std',
    'title_prefix_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_prewmean_jaccard_distance',
    'title_prewmean_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_prewmean_jaccard_distance',
    'title_prewmean_jaccard_distance'
]

# Columns excluded from the second feature set. Compared with drop_feat it
# additionally lists prefix_id, the query_t_* overlap scores and every
# prefix-based click/count/ctr column. Commented-out names are columns
# deliberately kept as features.
drop_feat2 = [
    'query_prediction_js',
    'query_prediction_id',
    'prefix_id',
    'in_query_big',
    'query_t_recall_char',
    'query_t_precision_char',
    'query_t_acc_char',
    'query_t_recall_word',
    'query_t_precision_word',
    'query_t_acc_word',
    'query_t_recall_char_ngram',
    'query_t_precision_char_ngram',
    'query_t_acc_char_ngram',
    'query_t_recall_word_ngram',
    'query_t_precision_word_ngram',
    'query_t_acc_word_ngram',
    'prefix',
    'query_prediction',
    'title',
    'tag',
    'label',
    'prediction_num',
    'prefix_len_click',
    # 'prefix_len_count',
    # 'prefix_len_ctr',
    'prediction_num_click',
    # 'prediction_num_count',
    # 'prediction_num_ctr',
    'prefix_len_tag_click',
    # 'prefix_len_tag_count',
    # 'prefix_len_tag_ctr',
    'prediction_num_tag_click',
    # 'prediction_num_tag_count',
    # 'prediction_num_tag_ctr',
    'prefix_click',
    'prefix_count',
    'prefix_ctr',
    'title_click',
#     'title_count',
#     'title_ctr',
    'tag_click',
    #  'tag_count',
    #  'tag_ctr',
    'query_prediction_click',
    'query_prediction_count',
    'query_prediction_ctr',
    'prefix_title_click',
    'prefix_title_count',
    'prefix_title_ctr',
    'prefix_tag_click',
    'prefix_tag_count',
    'prefix_tag_ctr',
    'prefix_query_prediction_click',
    'prefix_query_prediction_count',
    'prefix_query_prediction_ctr',
    'title_tag_click',
    'title_tag_count',
    'title_tag_ctr',
    'title_query_prediction_click',
    'title_query_prediction_count',
    'title_query_prediction_ctr',
    'tag_query_prediction_click',
    'tag_query_prediction_count',
    'tag_query_prediction_ctr',
    'prefix_title_tag_click',
    'prefix_title_tag_count',
    'prefix_title_tag_ctr',
    # 'prefix_len_mul_pre_std',
    'title_prefix_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_prewmean_jaccard_distance',
    'title_prewmean_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_premean_jaccard_distance',
    'title_prewmean_jaccard_distance',
    'title_prewmean_jaccard_distance'
]

# Candidate categorical feature names (currently none).
categorical_columns_can = []

data_feat = list(data.columns)
_available_columns = set(data_feat)  # set for O(1) membership tests

# Keep only the drop candidates that actually exist in `data`, so the later
# DataFrame.drop calls never hit a missing label. List order is preserved.
drop_cols_can = drop_feat
drop_cols = [name for name in drop_cols_can if name in _available_columns]

drop_cols_can2 = drop_feat2
drop_cols2 = [name for name in drop_cols_can2 if name in _available_columns]

# Built after drop_cols so that the `not in drop_cols` filter sees the
# populated list (it was previously evaluated while drop_cols was still empty).
categorical_columns = [name for name in categorical_columns_can
                       if name in _available_columns and name not in drop_cols]

In [None]:
num_round = 100000
# Threshold
# NOTE(review): neither num_round nor online_threshold is used in the visible
# cells — presumably consumed by the training / prediction code further down.
online_threshold = 0.38

# data is ordered train | val | test; slice the three parts by row count and
# copy them so they no longer depend on `data`.
train_feat = data[:-(len_val + len_test)].copy()
val_feat = data[-(len_val + len_test):-len_test].copy()
test_feat = data[-len_test:].copy()

print(data['label'].dtype)

In [None]:
# Row/column counts of the three feature frames.
print(train_feat.shape)
print(val_feat.shape)
print('test shape:', test_feat.shape)

In [None]:
# The combined frame has been copied into train/val/test parts; release it.
del data
gc.collect()

In [None]:
# Feature names for model 1: every column that is not listed in drop_cols.
feat_colums = train_feat.drop(columns=drop_cols).columns.tolist()
print(len(feat_colums))

# Feature names for model 2: every column that is not listed in drop_cols2.
feat_colums2 = train_feat.drop(columns=drop_cols2).columns.tolist()
print(len(feat_colums2))

In [None]:
# # train model 1
data_x = train_feat.drop(drop_cols, axis=1)
data_y = train_feat['label'].values

# train model 2
data_x2 = train_feat.drop(drop_cols2, axis=1)
data_y2 = data_y

In [None]:
# Features and labels have been extracted from train_feat; release the frame.
del train_feat
gc.collect()

In [None]:
# Replace the feature frames with their underlying ndarrays. Column names are
# re-attached from feat_colums / feat_colums2 when the lgb.Dataset objects are built.
data_x = data_x.values
data_x2 = data_x2.values

In [None]:
# Split the validation rows by whether their prefix appears in `prefixs`.
val_prefix_seen = val_feat['prefix'].isin(prefixs)

# --- rows whose prefix is in `prefixs` ---
val_feat_in = val_feat[val_prefix_seen]
real_val_y_in = val_feat_in['label'].values
real_val_x_in = val_feat_in.drop(columns=drop_cols).values      # model-1 feature set
real_val_x_in_2 = val_feat_in.drop(columns=drop_cols2).values   # model-2 feature set

# --- rows whose prefix is NOT in `prefixs` ---
# Mind the naming: here the un-suffixed array carries the model-2 feature set
# and the `_2` array carries the model-1 feature set (the reverse of the block above).
val_feat_not_in = val_feat[~val_prefix_seen]
real_val_y_not_in = val_feat_not_in['label'].values
real_val_x_not_in = val_feat_not_in.drop(columns=drop_cols2).values
real_val_x_not_in_2 = val_feat_not_in.drop(columns=drop_cols).values

# Whole validation set, ordered as [in, not_in], once per feature set:
# real_val_x_1 uses the model-1 features, real_val_x the model-2 features.
real_val_y = np.concatenate([real_val_y_in, real_val_y_not_in])
real_val_x_1 = np.concatenate([real_val_x_in, real_val_x_not_in_2])
real_val_x = np.concatenate([real_val_x_in_2, real_val_x_not_in])

# Every test row is prepared under both feature sets.
test_x_in = test_feat.drop(columns=drop_cols).values
test_x_not_in = test_feat.drop(columns=drop_cols2).values
print ("test is in prefix : ", test_feat[test_feat['prefix'].isin(prefixs)].shape)

# Further split of the "prefix not in" rows by whether the title appears in `titles`
# (model-2 feature set only).
val_title_seen = val_feat_not_in['title'].isin(titles)

val_not_in_prefix_in_title_label = val_feat_not_in[val_title_seen]['label'].values
val_not_in_prefix_in_title = val_feat_not_in[val_title_seen].drop(columns=drop_cols2).values
print(val_not_in_prefix_in_title.shape)

val_not_in_prefix_not_in_title_label = val_feat_not_in[~val_title_seen]['label'].values
val_not_in_prefix_not_in_title = val_feat_not_in[~val_title_seen].drop(columns=drop_cols2).values
print(val_not_in_prefix_not_in_title.shape)

In [None]:
# Shape of the validation subset whose prefix is in `prefixs`.
print(val_feat_in.shape)

In [None]:
# The validation arrays have been extracted; drop the source frames.
del val_feat, val_feat_in, val_feat_not_in
gc.collect()

In [None]:
# `lr` is consumed by the model-2 parameters and embedded in output file names.
# NOTE(review): model 1 below hard-codes learning_rate=0.01 instead of `lr` — confirm this is deliberate.
lr = 0.05
SEED = 100

# LightGBM parameters for model 1.
# Options left disabled by the author: scale_pos_weight=0.899844466771833,
# min_child_weight=5, min_split_gain=0, subsample_for_bin=50000,
# lambda_l1=3, lambda_l2=2, device='gpu'.
params = {
    'num_leaves': 2 ** 7 - 1,  # original note listed 128 and 256 for num_leaves
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'min_data_in_leaf': 50,
    'learning_rate': 0.01,
    'feature_fraction': 0.65,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': {'auc'},
    'seed': SEED,
    'nthread': 15,
    'max_bin': 1023,
}

# Model 1 trains on the model-1 feature set and is validated only on the
# validation rows whose prefix is in `prefixs`.
train_matrix = lgb.Dataset(
    pd.DataFrame(data_x, columns=feat_colums),
    label=data_y,
    categorical_feature=categorical_columns,
)
valid_matrix = lgb.Dataset(
    pd.DataFrame(real_val_x_in, columns=feat_colums),
    label=real_val_y_in,
    categorical_feature=categorical_columns,
)

# Also reused by the model-2 training cell.
early_stopping_rounds = 100
model_auc = lgb.train(
    params,
    train_matrix,
    num_boost_round=num_round,
    valid_sets=[valid_matrix],
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=500,
)
best_iter_auc = model_auc.best_iteration

In [None]:
# Model 1 is trained; its Dataset wrappers are no longer needed.
del train_matrix, valid_matrix
gc.collect()

In [None]:
# LightGBM parameters for model 2: same as model 1 except that the learning
# rate comes from `lr`.
# Options left disabled by the author: scale_pos_weight=0.899844466771833,
# min_child_weight=5, min_split_gain=0, subsample_for_bin=50000,
# lambda_l1=3, lambda_l2=2, device='gpu'.
params = {
    'num_leaves': 2 ** 7 - 1,  # original note listed 128 and 256 for num_leaves
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'min_data_in_leaf': 50,
    'learning_rate': lr,
    'feature_fraction': 0.65,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': {'auc'},
    'seed': SEED,
    'nthread': 15,
    'max_bin': 1023,
}

# Model 2 trains on the model-2 feature set and is validated on the whole
# validation set (both prefix groups).
train_matrix2 = lgb.Dataset(
    pd.DataFrame(data_x2, columns=feat_colums2),
    label=data_y2,
    categorical_feature=categorical_columns,
)
valid_matrix2 = lgb.Dataset(
    pd.DataFrame(real_val_x, columns=feat_colums2),
    label=real_val_y,
    categorical_feature=categorical_columns,
)
model2_auc = lgb.train(
    params,
    train_matrix2,
    num_boost_round=num_round,
    valid_sets=[valid_matrix2],
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=500,
)
best_iter2_auc = model2_auc.best_iteration

In [None]:
# Model 2 is trained; its Dataset wrappers are no longer needed.
del train_matrix2, valid_matrix2
gc.collect()

In [None]:
def _best_single_threshold(y_true, scores, candidates):
    """Print the F1 score at each candidate cut-off and return (best_cutoff, best_f1).

    A prediction is positive when its score is strictly greater than the cut-off.
    The first candidate reaching the highest F1 wins; if no candidate scores
    above 0 the result is (0, 0).
    """
    best_f1 = 0
    best_cutoff = 0
    for cutoff in candidates:
        cutoff_f1 = f1_score(y_true, np.where(scores > cutoff, 1, 0))
        print(cutoff, ' f1 score: ', cutoff_f1)
        if cutoff_f1 > best_f1:
            best_f1 = cutoff_f1
            best_cutoff = cutoff
    return best_cutoff, best_f1


print(best_iter2_auc)

# ---- model 1: report on the full validation set and on the "prefix in" subset ----
real_val_pred_in = model_auc.predict(real_val_x_in, num_iteration=best_iter_auc)
real_val_pred_1 = model_auc.predict(real_val_x_1, num_iteration=best_iter_auc)
print(classification_report(real_val_y, np.where(real_val_pred_1 > online_threshold, 1, 0), digits=6))
print(classification_report(real_val_y_in, np.where(real_val_pred_in > online_threshold, 1, 0), digits=6))

print("different threshold for real val pred in")
max_threshold_in, max_f1 = _best_single_threshold(
    real_val_y_in, real_val_pred_in, np.arange(0.30, 0.60, 0.01))
print('best threshold: ', max_threshold_in, '  f1 :', max_f1)

# ---- model 2: report on the "prefix not in" subset and on the full validation set ----
real_val_pred_not_in = model2_auc.predict(real_val_x_not_in, num_iteration=best_iter2_auc)
real_val_pred_2 = model2_auc.predict(real_val_x, num_iteration=best_iter2_auc)
print(
    classification_report(real_val_y_not_in, np.where(real_val_pred_not_in > online_threshold, 1, 0), digits=6))
print(
    classification_report(real_val_y, np.where(real_val_pred_2 > online_threshold, 1, 0), digits=6))

print("different threshold for real val pred not in")
max_threshold_not_in, max_f1 = _best_single_threshold(
    real_val_y_not_in, real_val_pred_not_in, np.arange(0.30, 0.60, 0.01))
print('best threshold: ', max_threshold_not_in, '  f1 :', max_f1)

# ---- joint search: one cut-off per model, scored on the combined validation set ----
# The combined prediction is [in, not_in], the same order used to build real_val_y.
print("different threshold for real val pred in and not")
max_f1 = 0
max_threshold_in = 0
max_threshold_not_in = 0
threshold_grid = np.arange(0.30, 0.60, 0.01)
for threshold_in in threshold_grid:
    # Depends only on the outer cut-off, so compute it once per outer step.
    pred_in = np.where(real_val_pred_in > threshold_in, 1, 0)
    for threshold_not_in in threshold_grid:
        pred_not_in = np.where(real_val_pred_not_in > threshold_not_in, 1, 0)
        pred = np.concatenate([pred_in, pred_not_in])
        tmp_f1 = f1_score(real_val_y, pred)
        if tmp_f1 > max_f1:
            # Only improvements are printed.
            print(threshold_in, ' ', threshold_not_in, ' f1 score: ', tmp_f1)
            max_f1 = tmp_f1
            max_threshold_in = threshold_in
            max_threshold_not_in = threshold_not_in
print('best threshold for real val: ', max_threshold_in, ' ', max_threshold_not_in, '  f1 :', max_f1)

In [None]:
# Score each validation subset with the model assigned to it.
real_val_pred_in = model_auc.predict(real_val_x_in, num_iteration=model_auc.best_iteration)
real_val_pred_not_in = model2_auc.predict(real_val_x_not_in, num_iteration=model2_auc.best_iteration)
real_val_pred = np.concatenate([real_val_pred_in, real_val_pred_not_in])

# NOTE(review): the cut-offs are fixed literals (0.41 / 0.37), not the
# max_threshold_in / max_threshold_not_in values found by the search cell, and
# the file names below still embed online_threshold — confirm both are intended.
pred_in = np.where(real_val_pred_in > 0.41, 1, 0)
pred_not_in = np.where(real_val_pred_not_in > 0.37, 1, 0)
pred = np.concatenate([pred_in, pred_not_in])

print(f1_score(real_val_y,pred))

submission = pd.DataFrame()
submission['predicted_score'] = real_val_pred
submission['label'] = pred

# Both files share the same descriptive suffix:
# <online_threshold>_feat_<lr>_<#model-1 features>_<mean score>_<#positive labels>.csv
val_file_suffix = '_'.join([
    str(online_threshold),
    'feat',
    str(lr),
    str(len(feat_colums)),
    str(submission['predicted_score'].mean()),
    str(submission['label'].sum()),
]) + '.csv'

# Scores plus labels.
submission.to_csv('pred_val_online_threshold_' + val_file_suffix, sep=',', index=False)
# Labels only.
submission['label'].to_csv('result_val_online_threshold_' + val_file_suffix, sep=',', index=False)

In [None]:
# Score every unlabeled test row with both models; the per-row choice is made below.
print('test_pred')
test_pred_in = model_auc.predict(test_x_in, num_iteration=model_auc.best_iteration)

print('test_pred')
test_pred_not_in = model2_auc.predict(test_x_not_in, num_iteration=model2_auc.best_iteration)

# NOTE(review): same fixed cut-offs as the validation export (0.41 / 0.37) — confirm.
label_in = np.where(test_pred_in > 0.41, 1, 0)
label_not_in = np.where(test_pred_not_in > 0.37, 1, 0)

# Rows whose prefix is in `prefixs` take model 1's output, all others model 2's.
test_prefix_seen = test_feat['prefix'].isin(prefixs).values
label = np.where(test_prefix_seen, label_in, label_not_in)
predict = np.where(test_prefix_seen, test_pred_in, test_pred_not_in)

submission = pd.DataFrame()
submission['predicted_score'] = predict
submission['label'] = label

print(np.mean(submission['predicted_score']))
print(np.sum(submission['label']))

# File name: pred_online_threshold_<online_threshold>_feat_<lr>_<#model-1 features>_<mean score>_<#positive labels>.csv
test_file_suffix = '_'.join([
    str(online_threshold),
    'feat',
    str(lr),
    str(len(feat_colums)),
    str(submission['predicted_score'].mean()),
    str(submission['label'].sum()),
]) + '.csv'
submission.to_csv('pred_online_threshold_' + test_file_suffix, sep=',', index=False)