# 特征提取

In [1]:
import sys, os, gc
import numpy as np
import pandas as pd
import pickle
sys.path.append("./utils")
from helper import ReadCSV, Timer, ExtractFeature, ORI_TRAIN_NAMES, ORI_TEST_NAMES, AnalysisCSV, FuncMap2, FuncMap1
import dist_utils, ngram_utils
# Root folder under which each feature family writes into its own sub-folder.
base_feature_save_dir = "./outputs/features/"
test_file = "./inputs/test.csv"

# OFFLINE selects the run profile: True uses the "debug_" prefix with the
# smaller training file, False uses the "online_" prefix with the larger one.
OFFLINE = True
if not OFFLINE:
    base_prefix, train_file, CHUNK_SIZE = "online_", "./inputs/train_last_1000w.csv", 1000000
else:
    base_prefix, train_file, CHUNK_SIZE = "debug_", "./inputs/train_last_50w.csv", 100000
print(OFFLINE)

True


In [2]:
# ! pip install  --index "http://pypi/simple" --trusted-host pypi fuzzywuzzy
# ! pip install  --index "http://pypi/simple" --trusted-host pypi  pyemd
# ! pip install  --index "http://pypi/simple" --trusted-host pypi simhash
# ! pip install  --index "http://pypi/simple" --trusted-host pypi networkx

## 文本挖掘特征

In [5]:
from fuzzywuzzy import fuzz
feature_save_dir = base_feature_save_dir + "textMining/"
train_feature_prefix = base_prefix + "trainTextMining"
test_feature_prefix = base_prefix + "testTextMining"
! ls -alh ./outputs/features/textMining/

total 0
drwxr-xr-x  2 niudong  staff    64B  7 12 23:43 .
drwxr-xr-x  3 niudong  staff    96B  7 13 16:53 ..


### 提取长度特征

In [7]:
# Regeneration switch and feature-family name for the length features.
regen_len_feature, feature_name = False, "len"

In [8]:
def run_text_len(df, ngram, prefix):
    """Add token-count statistics of the query and title to ``df``.

    Args:
        df: frame with ``query`` and ``title`` text columns; modified in place.
        ngram: callable turning one text into a sized collection of tokens.
        prefix: string prepended to every generated column name.

    Returns:
        The same ``df`` with eight new columns: both lengths, both length
        ratios, absolute difference, max, min and average.
    """
    def ratio(a, b):
        # An empty text yields a length of 0; report 0.0 for the ratio
        # instead of letting ZeroDivisionError abort the whole extraction.
        return a / b if b else 0.0

    q_list = df['query'].apply(ngram)
    t_list = df['title'].apply(ngram)
    with Timer("extract length"):
        # FuncMap1 is the mapper this notebook uses for a single sequence
        # (see the diff-set length features); FuncMap2 is used for pairs.
        q_len = FuncMap1(len, q_list)
        t_len = FuncMap1(len, t_list)
        df['%s_qLen' % prefix] = q_len
        df['%s_tLen' % prefix] = t_len
        df['%s_qtLenRatio'%prefix] = FuncMap2(ratio, q_len, t_len)
        df['%s_tqLenRatio'%prefix] = FuncMap2(ratio, t_len, q_len)
        df['%s_qtDiff'%prefix] = FuncMap2(lambda a, b: abs(a-b), q_len, t_len)
        df['%s_qtMax'%prefix] = FuncMap2(lambda a, b: max(a, b), q_len, t_len)
        df['%s_qtMin'%prefix] = FuncMap2(lambda a, b: min(a, b), q_len, t_len)
        df['%s_qtAvg'%prefix] = FuncMap2(lambda a, b: (a+b)/2, q_len, t_len)
        # Release the intermediate sequences before the next tokenizer runs.
        del q_len, t_len, q_list, t_list
        gc.collect()
    return df
def process_text_len(df, save_dir, prefix, feature_name):
    """Compute length features for two tokenizations: unichars and unigrams.

    ``save_dir`` and ``feature_name`` are accepted but not used here.
    Returns ``df`` with the columns added by ``run_text_len`` for each
    tokenizer, named ``<prefix>_<tokenizer>_...``.
    """
    tokenizers = (
        ('unichars', ngram_utils.unichars),
        ('unigrams', ngram_utils.unigrams),
    )
    for label, tokenizer in tokenizers:
        df = run_text_len(df, tokenizer, '%s_%s' % (prefix, label))
    return df

In [9]:
# Extract the length features for the training data.
# Skipped unless regen_len_feature is set; process_text_len is the callback.
if regen_len_feature:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, process_func=process_text_len, 
        names=ORI_TRAIN_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id', 'label'], drop_last_cols=['query', 'title'])

In [7]:
# AnalysisCSV("./stage1/output/text_mining_feature/debug_trainTextMining_len.csv.gz")

In [10]:
# Extract the length features for the test data.
# Runs only when OFFLINE is False and regen_len_feature is set.
if not OFFLINE and regen_len_feature:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, process_func=process_text_len, 
    names=ORI_TEST_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
    drop_first_cols=['query_id', 'query_title_id'], drop_last_cols=['query', 'title'])

In [11]:
# AnalysisCSV("./stage1/output/text_mining_feature/debug_train_text_mining_feature_len.csv")

time: 989 µs


### 提取字符编辑距离特征

In [16]:
import Levenshtein
# Regeneration switch and feature-family name for the edit-distance features.
regen_edit_sim_feature, feature_name = False, "editSim"

In [17]:
# String-level similarity measures applied to each (query, title) pair.
# Levenshtein references:
#   https://www.jb51.net/article/98449.htm
#   http://www.coli.uni-saarland.de/courses/LT1/2011/slides/Python-Levenshtein.html#Levenshtein-inverse
# fuzzywuzzy reference:
#   https://blog.csdn.net/qq_43174128/article/details/82595317
# NOTE: later cells rebind the same global name `similar_arr` to other lists.
similar_arr = (
    [Levenshtein.distance, Levenshtein.jaro, Levenshtein.jaro_winkler]
    + [fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio,
       fuzz.partial_token_sort_ratio, fuzz.token_set_ratio]
)

In [18]:
def run_edit_sim_feature(df, prefix):
    """Add one column per entry of the global ``similar_arr`` to ``df``.

    Each metric is applied pairwise to the ``query`` and ``title`` columns
    and stored as ``<prefix>_<metric.__name__>``. The metric list is read
    from the module-level ``similar_arr`` at call time.
    """
    queries, titles = df['query'], df['title']
    with Timer("extract edit sim"):
        for metric in similar_arr:
            metric_name = metric.__name__
            with Timer("cal {} sim".format(metric_name)):
                column = "{}_{}".format(prefix, metric_name)
                df[column] = FuncMap2(metric, queries, titles)
        del queries, titles
        gc.collect()
    return df
def process_edit_sim_feature(df, save_dir, prefix, feature_name):
    """Callback wrapper: run the edit-similarity metrics on the raw text.

    ``save_dir`` and ``feature_name`` are accepted but not used here.
    """
    text_prefix = '%s_%s' % (prefix, 'text')
    return run_edit_sim_feature(df, text_prefix)

In [19]:
# Extract the edit-distance features for the training data.
# Skipped unless regen_edit_sim_feature is set.
if regen_edit_sim_feature:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_edit_sim_feature, names=ORI_TRAIN_NAMES, process_chunkly=False, 
        chunk_size=CHUNK_SIZE, drop_first_cols=['query_id', 'query_title_id', 'label'], 
        drop_last_cols=['query', 'title'])

In [20]:
# Extract the edit-distance features for the test data.
# Runs only when OFFLINE is False and regen_edit_sim_feature is set.
if not OFFLINE and regen_edit_sim_feature:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
        process_func=process_edit_sim_feature, names=ORI_TEST_NAMES, process_chunkly=False, 
        chunk_size=CHUNK_SIZE, drop_first_cols=['query_id', 'query_title_id'], 
        drop_last_cols=['query', 'title'])

### 提取集合相似度特征

In [21]:
import Levenshtein
# Regeneration switch and feature-family name for the n-gram set similarities.
regen_ngram_sim_feature, feature_name = False, "ngramSim"

In [22]:
# Set-overlap style measures from dist_utils.
# Per the original note they accept either a string such as "abc" or a
# sequence of items such as ['a', 'b', 'c'].
# NOTE: this rebinds the global `similar_arr` used by the edit-similarity cells.
similar_arr = [
    getattr(dist_utils, metric_name)
    for metric_name in ("dice_ratio", "jaccard_ratio", "edit_seq_ratio", "edit_set_ratio")
]

In [25]:
def run_ngram_similarity(df, ngram, prefix):
    """Tokenize query/title with ``ngram`` and score each pair.

    One column ``<prefix>_<metric.__name__>`` is added per entry of the
    module-level ``similar_arr`` (read at call time). Returns ``df``.
    """
    query_tokens = df['query'].apply(ngram)
    title_tokens = df['title'].apply(ngram)
    with Timer("extract ngram sim"):
        for metric in similar_arr:
            metric_name = metric.__name__
            with Timer("cal {} sim".format(metric_name)):
                column = "{}_{}".format(prefix, metric_name)
                df[column] = FuncMap2(metric, query_tokens, title_tokens)
        del query_tokens, title_tokens
        gc.collect()
    return df

def process_ngram_similarity(df, save_dir, prefix, feature_name):
    """Run the set-similarity metrics for six tokenizations in turn.

    Order: unichars, bichars, trichars, unigrams, bigrams, trigrams.
    ``save_dir`` and ``feature_name`` are accepted but not used here.
    """
    tokenizer_names = ('unichars', 'bichars', 'trichars',
                       'unigrams', 'bigrams', 'trigrams')
    for tokenizer_name in tokenizer_names:
        tokenizer = getattr(ngram_utils, tokenizer_name)
        df = run_ngram_similarity(df, tokenizer, '%s_%s' % (prefix, tokenizer_name))
    return df

In [23]:
# Extract the n-gram set-similarity features for the training data.
# Skipped unless regen_ngram_sim_feature is set.
if regen_ngram_sim_feature:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_ngram_similarity, names=ORI_TRAIN_NAMES, process_chunkly=False, 
        chunk_size=CHUNK_SIZE, drop_first_cols=['query_id', 'query_title_id', 'label'], 
        drop_last_cols=['query', 'title'])

In [41]:
# AnalysisCSV("./stage1/output/text_mining_feature/debug_trainTextMining_ngramSim.csv.gz")

In [24]:
# Extract the n-gram set-similarity features for the test data.
# Runs only when OFFLINE is False and regen_ngram_sim_feature is set.
if not OFFLINE and regen_ngram_sim_feature:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
    process_func=process_ngram_similarity, names=ORI_TEST_NAMES, process_chunkly=False, 
    chunk_size=CHUNK_SIZE, drop_first_cols=['query_id', 'query_title_id'], 
    drop_last_cols=['query', 'title'])

### SimHash特征

In [33]:
from simhash import Simhash
# https://github.com/cjauvin/simhash/blob/master/tests/test_simhash.py
# https://leons.im/posts/a-python-implementation-of-simhash-algorithm/
# 这个不错: http://yanyiwu.com/work/2014/01/30/simhash-shi-xian-xiang-jie.html
# Pickled TF-IDF model used for the SimHash weights, regeneration switch,
# and feature-family name.
tfidf_model_file, regen_simhash, feature_name = (
    "./outputs/models/online_tfidf_model.bin",
    False,
    "ngramHashsim",
)

In [35]:
def run_ngram_simhash(df, prefix):
    """Add the SimHash distance between query and title to ``df``.

    Both texts are vectorised with the pickled TF-IDF model at
    ``tfidf_model_file``; every row's non-zero (term, weight) pairs are fed
    to ``Simhash`` and the distance between the two hashes is stored in the
    column ``<prefix>Simhash``.

    This worker was originally also named ``process_ngram_simhash``, so the
    callback defined below shadowed it and then called a
    ``run_ngram_simhash`` that did not exist.
    """
    with Timer("cal tfidf vec"):
        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(tfidf_model_file, "rb") as ff:
            tfidf_model = pickle.load(ff)
            # Invert vocabulary_ (term -> column) into column -> term.
            voc = {i: w for w, i in tfidf_model.vocabulary_.items()}
        q_vecs = tfidf_model.transform(df['query'])
        t_vecs = tfidf_model.transform(df['title'])

    def row_simhash(row):
        # `row` is one sparse row: indices are column ids, data the weights.
        return Simhash(zip([voc[j] for j in row.indices], row.data))

    with Timer("cal sim hash"):
        q_simhash = [row_simhash(Di) for Di in q_vecs]
        t_simhash = [row_simhash(Di) for Di in t_vecs]
    with Timer("cal hash sim"):
        df["{}Simhash".format(prefix)] = FuncMap2(lambda x, y: x.distance(y), q_simhash, t_simhash)

    del q_vecs, t_vecs, q_simhash, t_simhash, tfidf_model, voc
    gc.collect()

    return df

def process_ngram_simhash(df, save_dir, prefix, feature_name):
    """Callback wrapper around ``run_ngram_simhash``.

    ``save_dir`` and ``feature_name`` are accepted but not used here.
    """
    df = run_ngram_simhash(df, prefix)
    return df

In [36]:
# Extract the SimHash feature for the training data.
# Skipped unless regen_simhash is set.
if regen_simhash:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
                   process_func=process_ngram_simhash, names=ORI_TRAIN_NAMES, process_chunkly=False, 
                   chunk_size=CHUNK_SIZE, drop_first_cols=['query_id', 'query_title_id', 'label'], 
                   drop_last_cols=['query', 'title'])

### 公共子序列

In [48]:
import py_common_subseq
# Regeneration switch and feature-family name for the common-subsequence features.
regen_subseq, feature_name = False, "commonSeq"
# Smoke test: prints the set of subsequences common to both strings.
print(py_common_subseq.find_common_subsequences("123", "13423"))

{'', '23', '123', '2', '3', '1', '13', '12'}


In [49]:
def run_ngram_subseq(df, ngram, prefix):
    """Store the common subsequences of the tokenized query and title.

    The column ``<prefix>LCSValue`` receives, per row, whatever
    ``py_common_subseq.find_common_subsequences`` returns for the two
    token sequences.

    NOTE(review): for two strings that call returns a *set* of subsequences
    (see the smoke test in the cell above), so this column holds sets rather
    than a number - confirm that this is what downstream code expects.
    """
    def common_subsequences(left, right):
        return py_common_subseq.find_common_subsequences(left, right)

    query_tokens = df["query"].apply(ngram)
    title_tokens = df["title"].apply(ngram)
    with Timer("cal sub seq"):
        column = "{}LCSValue".format(prefix)
        df[column] = FuncMap2(common_subsequences, query_tokens, title_tokens)
        del query_tokens, title_tokens
        gc.collect()
    return df

def process_ngram_subseq(df, save_dir, prefix, feature_name):
    """Run the common-subsequence feature for six tokenizations in turn.

    Order: unichars, bichars, trichars, unigrams, bigrams, trigrams.
    ``save_dir`` and ``feature_name`` are accepted but not used here.
    """
    tokenizer_names = ('unichars', 'bichars', 'trichars',
                       'unigrams', 'bigrams', 'trigrams')
    for tokenizer_name in tokenizer_names:
        tokenizer = getattr(ngram_utils, tokenizer_name)
        df = run_ngram_subseq(df, tokenizer, '%s_%s' % (prefix, tokenizer_name))
    return df

In [50]:
# Extract the common-subsequence features for the training data.
# Skipped unless regen_subseq is set.
if regen_subseq:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_ngram_subseq, names=ORI_TRAIN_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id', 'label'], drop_last_cols=['query', 'title'])

In [51]:
# Extract the common-subsequence features for the test data.
# Runs only when OFFLINE is False and regen_subseq is set.
if not OFFLINE and regen_subseq:
    # 'label' is not listed in drop_first_cols, matching the other test-set
    # cells in this notebook (the test file is read with ORI_TEST_NAMES and
    # presumably has no 'label' column - confirm against helper.py).
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
        process_func=process_ngram_subseq, names=ORI_TEST_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id'], drop_last_cols=['query', 'title'])

## 向量空间特征

In [64]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
feature_save_dir = base_feature_save_dir + "vectorSpace/"
train_feature_prefix = base_prefix + "trainVectorSpace"
test_feature_prefix = base_prefix + "testVectorSpace"
ngram_range, ngram_level = (1, 4), "word"
tfidf_model_file = "./oututs/models/tfidf_{}_ngram{}{}.bin".format(
    ngram_level, ngram_range[0], ngram_range[1])
! ls -alh ./outputs/features/vectorSpace/
print(tfidf_model_file)

total 0
drwxr-xr-x  2 niudong  staff    64B  7 13 18:32 .
drwxr-xr-x  4 niudong  staff   128B  7 13 18:32 ..
./oututs/models/tfidf_word_ngram14.bin


### 训练tfidf模型

In [65]:
# Switch for (re)training the TF-IDF model and the corpus file it is fit on.
regen_tfidf_model, tfidf_train_file = False, "./inputs/train_last_1000w.csv"

In [66]:
def train_tfidf(df, prefix, savefile, feature_name=None):
    """Fit a TfidfVectorizer on the queries and titles and pickle it.

    The corpus is the unique ``query`` values followed by all ``title``
    values. The fitted model is written to the module-level
    ``tfidf_model_file``; ``prefix`` and ``savefile`` are not used.

    ``feature_name`` is optional and ignored. It exists so that this function
    can also be invoked with four positional arguments, like every other
    ``process_func`` callback in this notebook (``df, save_dir, prefix,
    feature_name``); since none of the extra arguments are read, both
    calling conventions behave identically.

    Returns:
        None - no feature frame is produced.
    """
    word_tfidf = TfidfVectorizer(norm="l2",  # l2-normalised rows allow similarity computations
                                    strip_accents="unicode",
                                    analyzer=ngram_level,
                                    ngram_range=ngram_range,
                                    use_idf=True,
                                    smooth_idf=True,
                                    sublinear_tf=True, min_df=5, max_df=0.9)
    new_query = df["query"].unique()
    new_title = df["title"].values
    corpus = np.concatenate([new_query, new_title])
    # Drop the local references before fitting to reduce peak memory.
    del new_query, new_title, df
    gc.collect()
    with Timer("train tfidf"):
        word_tfidf.fit(corpus)
        with open(tfidf_model_file, "wb") as f:
            pickle.dump(word_tfidf, f)
            print("Model dumped to {}".format(tfidf_model_file))
    return None

In [5]:
# Train the TF-IDF model on tfidf_train_file (no feature name is passed).
# Skipped unless regen_tfidf_model is set.
if regen_tfidf_model:
    ExtractFeature(tfidf_train_file, feature_save_dir, train_feature_prefix, None, 
        process_func=train_tfidf, names=ORI_TRAIN_NAMES, process_chunkly=False, 
        chunk_size=CHUNK_SIZE, drop_first_cols=['query_id', 'query_title_id', 'label'], 
        drop_last_cols=['query', 'title'])

----->Started 'extract tfidf feature' block...
Mem. usage decreased to 76.29 Mb (0.0% reduction)
----->Started 'tf-idf fit' block...


### tfidf向量相似性

In [73]:
# Regeneration switch and feature-family name for the TF-IDF vector distances.
regen_tfidf_vec_sim, feature_name = False, "tfidfVecSim"
print(tfidf_model_file)

./oututs/models/tfidf_word_ngram14.bin


In [69]:
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
from sklearn.metrics.pairwise import *
# Row-wise (paired) distances between the query and title TF-IDF matrices.
# NOTE: a later word2vec cell rebinds the same global name `similar_dis`.
similar_dis = [paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances]

In [77]:
def process_tfidf_vec_sim(df, save_dir, prefix, feature_name):
    """Add paired distances between query and title TF-IDF vectors.

    Loads the pickled model at ``tfidf_model_file``, transforms both text
    columns and stores one column ``<prefix>_<fn.__name__>`` per entry of
    the module-level ``similar_dis`` (read at call time).
    ``save_dir`` and ``feature_name`` are accepted but not used here.
    """
    with Timer("cal tfidf vec"):
        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(tfidf_model_file, "rb") as model_fh:
            vectorizer = pickle.load(model_fh)
        q_vec, t_vec = vectorizer.transform(df["query"]), vectorizer.transform(df["title"])
    with Timer("cal tfidf sim"):
        for distance_fn in similar_dis:
            fn_name = distance_fn.__name__
            with Timer("cal {} sim".format(fn_name)):
                column = "{}_{}".format(prefix, fn_name)
                df[column] = distance_fn(q_vec, t_vec)
    return df

In [78]:
# Extract the TF-IDF vector-distance features for the training data.
# Skipped unless regen_tfidf_vec_sim is set.
if regen_tfidf_vec_sim:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_tfidf_vec_sim, names=ORI_TRAIN_NAMES,
        process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id', 'label'], 
        drop_last_cols=['query', 'title'])

In [79]:
# Extract the TF-IDF vector-distance features for the test data.
# Runs only when OFFLINE is False and regen_tfidf_vec_sim is set.
if not OFFLINE and regen_tfidf_vec_sim:
    # 'label' is not listed in drop_first_cols, matching the other test-set
    # cells in this notebook (the test file is read with ORI_TEST_NAMES and
    # presumably has no 'label' column - confirm against helper.py).
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
        process_func=process_tfidf_vec_sim, names=ORI_TEST_NAMES,
        process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id'], 
        drop_last_cols=['query', 'title'])

### tfidf长度特征

In [85]:
from np_utils import try_divide
# Switch for regenerating the TF-IDF vector-length features.
regen_tfidf_vec_len = False
import scipy, numpy
# Feature-family name passed to ExtractFeature by the cells below.
feature_name = "tfidfVecLen"
# Show which pickled TF-IDF model the following cells will load.
print(tfidf_model_file)

./oututs/models/tfidf_word_ngram14.bin


In [83]:
def process_tfidf_vec_len(df, save_dir, prefix, feature_name):
    """Add statistics of the summed TF-IDF weights of query and title.

    The "length" of a text is the sum of its TF-IDF row. Columns added (all
    prefixed with ``prefix``): ``_QLen``, ``_TLen``, both ratios (via
    ``try_divide``), ``_QTDiff``, ``_QTMax``, ``_QTMin``, ``_QTAvg`` and
    ``_QTMulti``. ``save_dir`` and ``feature_name`` are not used here.
    """
    with Timer("cal tfidf vec"):
        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(tfidf_model_file, "rb") as ff:
            tfidf_model = pickle.load(ff)
        # ravel() always yields a 1-D array with one entry per row; squeeze()
        # would collapse a single-row frame to a 0-d array, which cannot be
        # iterated by the element-wise maps below.
        q_vec_len = np.asarray(tfidf_model.transform(df["query"]).sum(1)).ravel()
        t_vec_len = np.asarray(tfidf_model.transform(df["title"]).sum(1)).ravel()
    with Timer("cal tfidf len"):
        df['%s_QLen' % prefix] = q_vec_len
        df['%s_TLen' % prefix] = t_vec_len
        df['%s_QTLenRatio'%prefix] = FuncMap2(lambda a, b: try_divide(a, b), q_vec_len, t_vec_len)
        df['%s_TQLenRatio'%prefix] = FuncMap2(lambda a, b: try_divide(b, a), q_vec_len, t_vec_len)
        df['%s_QTDiff'%prefix] = FuncMap2(lambda a, b: abs(a-b), q_vec_len, t_vec_len)
        df['%s_QTMax'%prefix] = FuncMap2(lambda a, b: max(a, b), q_vec_len, t_vec_len)
        df['%s_QTMin'%prefix] = FuncMap2(lambda a, b: min(a, b), q_vec_len, t_vec_len)
        df['%s_QTAvg'%prefix] = FuncMap2(lambda a, b: (a+b)/2, q_vec_len, t_vec_len)
        df['%s_QTMulti'%prefix] = FuncMap2(lambda a, b: a*b, q_vec_len, t_vec_len)
        # Disabled experiments, kept for reference:
        # df['%s_QSquare2'%prefix] = numpy.square(q_vec_len)
        # df['%s_TSquare2'%prefix] = numpy.square(t_vec_len)
        # df['%s_QSqrt'%prefix] = numpy.sqrt(q_vec_len)
        # df['%s_TSqrt'%prefix] = numpy.sqrt(t_vec_len)
        # df['%s_QLog'%prefix] = numpy.log(q_vec_len)
        # df['%s_TLog'%prefix] = numpy.log(t_vec_len)
        # p_corr = scipy.stats.pearsonr(q_vec_len, t_vec_len)[0]
        # df['%s_PCorr'%prefix] = [p_corr] * len(q_vec_len)
        del q_vec_len, t_vec_len
        gc.collect()
    return df

In [86]:
# Extract the TF-IDF vector-length features for the training data.
# Skipped unless regen_tfidf_vec_len is set.
if regen_tfidf_vec_len:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_tfidf_vec_len, names=ORI_TRAIN_NAMES, 
        process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id', 'label'], 
        drop_last_cols=['query', 'title'])

In [87]:
# Extract the TF-IDF vector-length features for the test data.
# Runs only when OFFLINE is False and regen_tfidf_vec_len is set.
# (The first argument used to read `test_ file`, which is a SyntaxError.)
if not OFFLINE and regen_tfidf_vec_len:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
        process_func=process_tfidf_vec_len, names=ORI_TEST_NAMES, 
        process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id'], 
        drop_last_cols=['query', 'title'])

## 提取word2vec特征

In [90]:
import pickle
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
from scipy.stats import skew, kurtosis
word2vec_model_file = "./outputs/models/word2vec.kv"
ngram_range, ngram_level = (1, 4), "word"
tfidf_model_file = "./oututs/models/tfidf_{}_ngram{}{}.bin".format(
    ngram_level, ngram_range[0], ngram_range[1])
feature_save_dir = base_feature_save_dir + "word2vec/"
train_feature_prefix = base_prefix + "trainWord2vec"
test_feature_prefix = base_prefix + "testWord2vec"
! ls -alh ./outputs/features/word2vec

total 0
drwxr-xr-x  2 niudong  staff    64B  7 13 20:41 .
drwxr-xr-x  5 niudong  staff   160B  7 13 20:41 ..


In [92]:
def sent2vec1(text):
    """Return a TF-IDF weighted sentence embedding for ``text``.

    Each whitespace token that is present in both the global
    ``word2vec_model`` and ``tfidf_vocab`` contributes its word vector
    scaled by ``idf / number_of_tokens``; the contributions are summed.

    When no token qualifies, a zero *vector* is returned. Previously the
    fallback ``[0.0] * 200`` was reduced along axis 0 as well, which
    produced the scalar ``0.0`` and therefore a different shape than every
    other sentence vector.

    Reads the module-level ``word2vec_model``, ``tfidf_idf`` and
    ``tfidf_vocab``, which must have been loaded beforehand.
    """
    words = text.split()
    words_num = len(words)
    weighted = [
        np.array(word2vec_model[w]) * ((1 / words_num) * tfidf_idf[tfidf_vocab[w]])
        for w in words
        if w in word2vec_model and w in tfidf_vocab
    ]
    if not weighted:
        # Use the model's own dimensionality; 200 is the size that was
        # hard-coded before and is kept as a fallback.
        return np.zeros(getattr(word2vec_model, "vector_size", 200))
    return np.nan_to_num(np.array(weighted).sum(axis=0))

In [91]:
def sent2vec(text):
    """Return the mean word vector of the known tokens in ``text``.

    Tokens are obtained with ``text.split()``; tokens missing from the
    global ``word2vec_model`` are skipped.

    When no token is known, a zero *vector* is returned. Previously the
    fallback ``[0.0] * 200`` was averaged along axis 0 as well, which
    produced the scalar ``0.0`` and therefore a different shape than every
    other sentence vector.

    Reads the module-level ``word2vec_model``, which must be loaded first.
    """
    vectors = [word2vec_model[w] for w in text.split() if w in word2vec_model]
    if not vectors:
        # Use the model's own dimensionality; 200 is the size that was
        # hard-coded before and is kept as a fallback.
        return np.zeros(getattr(word2vec_model, "vector_size", 200))
    return np.nan_to_num(np.array(vectors).mean(axis=0))

### 生成word2vec句子相似度

In [94]:
# word2vec to sent: https://www.zhihu.com/question/29978268
# Regeneration switch for the sentence-vector similarity features.
regen_word2vec_sen_vec_sim = False
# Sentence-embedding mode: "avg" or "tfidf".
process_word2vec_sen_vec_mode = "avg"
# The feature-family name carries the capitalised mode as a suffix.
feature_name = "senVecSim" + process_word2vec_sen_vec_mode.capitalize()

In [95]:
# Vector distances from dist_utils applied to each (query, title) pair of
# sentence vectors.
# NOTE: this rebinds the global `similar_dis` used by the TF-IDF cells.
similar_dis = [
    getattr(dist_utils, kind + "_distance")
    for kind in ("cosine", "jaccard", "braycurtis", "canberra",
                 "cityblock", "euclidean", "minkowski")
]

In [96]:
# Load the word vectors (memory-mapped read-only) into the global
# `word2vec_model` read by sent2vec / sent2vec1 / wmd. Only happens when the
# sentence-similarity features are being regenerated.
if regen_word2vec_sen_vec_sim:
    word2vec_model = KeyedVectors.load(word2vec_model_file, mmap='r')

In [97]:
# In "tfidf" mode, load the pickled TF-IDF model and expose its idf_ array and
# vocabulary_ mapping as the globals read by sent2vec1.
# NOTE: pickle.load executes arbitrary code; only load trusted files.
if regen_word2vec_sen_vec_sim and process_word2vec_sen_vec_mode == "tfidf":
    with open(tfidf_model_file, "rb") as ff:
        tfidf_model = pickle.load(ff)
    tfidf_idf = tfidf_model.idf_
    tfidf_vocab = tfidf_model.vocabulary_

In [100]:
def process_word2vec_sen_vec(df, save_dir, prefix, feature_name):
    """Add distances between the query and title sentence vectors.

    The embedding function is selected by the module-level
    ``process_word2vec_sen_vec_mode``: ``"avg"`` uses ``sent2vec`` and
    ``"tfidf"`` uses ``sent2vec1``. One column ``<prefix>_<fn.__name__>`` is
    added per entry of the module-level ``similar_dis``.
    ``save_dir`` and ``feature_name`` are accepted but not used here.

    Raises:
        ValueError: if the mode is neither "avg" nor "tfidf" (this used to
            surface later as an UnboundLocalError).
    """
    with Timer("cal sent vec"):
        if process_word2vec_sen_vec_mode == "avg":
            embed = sent2vec
        elif process_word2vec_sen_vec_mode == "tfidf":
            embed = sent2vec1
        else:
            raise ValueError(
                "process_word2vec_sen_vec_mode must be 'avg' or 'tfidf', got {!r}".format(
                    process_word2vec_sen_vec_mode))
        q_sen_vec = list(map(embed, df["query"]))
        t_sen_vec = list(map(embed, df["title"]))
        for _ in similar_dis:
            name = _.__name__
            with Timer("cal {} sim".format(name)):
                df["{}_{}".format(prefix, name)] = FuncMap2(_, q_sen_vec, t_sen_vec)
        del q_sen_vec, t_sen_vec
        gc.collect()
    return df

In [101]:
# Extract the sentence-vector similarity features for the training data.
# Skipped unless regen_word2vec_sen_vec_sim is set.
if regen_word2vec_sen_vec_sim:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_word2vec_sen_vec, names=ORI_TRAIN_NAMES,
        process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id', 'label'], 
        drop_last_cols=['query', 'title'])

In [102]:
# Extract the sentence-vector similarity features for the test data.
# Runs only when OFFLINE is False and regen_word2vec_sen_vec_sim is set.
if regen_word2vec_sen_vec_sim and not OFFLINE:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
        process_func=process_word2vec_sen_vec, names=ORI_TEST_NAMES, 
        process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id'], 
        drop_last_cols=['query', 'title'])

### word2vec其他特征

In [104]:
# Placeholder for the normalised word2vec model; it is filled in by
# process_word2vec_sen_vec_other.
word2vec_model_norm = None
# Regeneration switch and embedding mode ("avg" or "tfidf").
regen_word2vec_sen_vec_other, process_word2vec_sen_vec_mode = False, "avg"
# The feature-family name carries the capitalised mode as a suffix.
feature_name = "senVecOther" + process_word2vec_sen_vec_mode.capitalize()

In [111]:
def norm_wmd(s1, s2):
    """Word Mover's Distance from the normalised model, capped at 100.

    Uses the module-level ``word2vec_model_norm``. NaN results are replaced
    by ``np.nan_to_num`` before the cap is applied.
    """
    distance = np.nan_to_num(word2vec_model_norm.wmdistance(s1, s2))
    if distance < 100:
        return distance
    return 100

In [112]:
def wmd(s1, s2):
    """Word Mover's Distance from the plain model, capped at 100.

    Uses the module-level ``word2vec_model``. NaN results are replaced by
    ``np.nan_to_num`` before the cap is applied.
    """
    distance = np.nan_to_num(word2vec_model.wmdistance(s1, s2))
    if distance < 100:
        return distance
    return 100

In [113]:
def run_word2vec_sen_vec_wmd(df, ngram, prefix, name):
    """Add the plain and normalised WMD between tokenized query and title.

    Columns written: ``<prefix>_wmd<name>`` and ``<prefix>_normWmd<name>``.
    """
    query_tokens = df['query'].apply(ngram)
    title_tokens = df['title'].apply(ngram)
    for column_template, distance_fn in (('%s_wmd%s', wmd), ('%s_normWmd%s', norm_wmd)):
        df[column_template % (prefix, name)] = FuncMap2(distance_fn, query_tokens, title_tokens)
    del query_tokens, title_tokens
    gc.collect()
    return df

In [114]:
def run_word2vec_sen_vec_sk(df, func, q_sen_vec, t_sen_vec, prefix, name):
    """Apply ``func`` to every sentence vector and add summary columns.

    Columns written: ``<prefix>_q<name>``, ``<prefix>_t<name>`` and the
    pairwise ``Diff`` (absolute), ``Max``, ``Min`` and ``Avg`` of the two.
    """
    q_value, t_value = FuncMap1(func, q_sen_vec), FuncMap1(func, t_sen_vec)
    df['%s_q%s' % (prefix, name)] = q_value
    df['%s_t%s' % (prefix, name)] = t_value
    pairwise = (
        ("Diff", lambda a, b: abs(a-b)),
        ("Max", lambda a, b: max(a, b)),
        ("Min", lambda a, b: min(a, b)),
        ("Avg", lambda a, b: (a+b)/2),
    )
    for suffix, combine in pairwise:
        df['%s_%s%s' % (prefix, name, suffix)] = FuncMap2(combine, q_value, t_value)
    return df

In [115]:
def process_word2vec_sen_vec_other(df, save_dir, prefix, feature_name):
    """Add skewness and kurtosis statistics of the sentence vectors.

    The embedding function is selected by the module-level
    ``process_word2vec_sen_vec_mode`` ("avg" -> ``sent2vec``,
    "tfidf" -> ``sent2vec1``). ``save_dir`` and ``feature_name`` are
    accepted but not used here.

    Both word2vec models are loaded lazily into their module-level names:
    the normalised model is no longer reloaded on every call (this callback
    is run chunk by chunk), and ``word2vec_model`` - which the embedding
    functions read - is loaded here if no earlier cell did so.
    NOTE(review): "tfidf" mode additionally needs the globals ``tfidf_idf``
    and ``tfidf_vocab``, which this function does not load.

    Raises:
        ValueError: if the mode is neither "avg" nor "tfidf".
    """
    global word2vec_model, word2vec_model_norm
    if globals().get("word2vec_model_norm") is None:
        word2vec_model_norm = KeyedVectors.load(word2vec_model_file)
        word2vec_model_norm.init_sims(replace=True)
    if globals().get("word2vec_model") is None:
        word2vec_model = KeyedVectors.load(word2vec_model_file, mmap='r')

    # Disabled WMD features, kept for reference:
    # with Timer("cal uni-wmd dis"):
    #     df = run_word2vec_sen_vec_wmd(df, ngram_utils.unigrams, prefix, "Unigrams")
    # with Timer("cal bi-wmd dis"):
    #     df = run_word2vec_sen_vec_wmd(df, ngram_utils.bigrams, prefix, "Bigrams")

    with Timer("cal sent vec"):
        if process_word2vec_sen_vec_mode == "avg":
            embed = sent2vec
        elif process_word2vec_sen_vec_mode == "tfidf":
            embed = sent2vec1
        else:
            raise ValueError(
                "process_word2vec_sen_vec_mode must be 'avg' or 'tfidf', got {!r}".format(
                    process_word2vec_sen_vec_mode))
        q_sen_vec = FuncMap1(embed, df["query"])
        t_sen_vec = FuncMap1(embed, df["title"])

    with Timer("cal skew"):
        df = run_word2vec_sen_vec_sk(df, skew, q_sen_vec, t_sen_vec, prefix, "Skew")
    with Timer("cal kurtosis"):
        df = run_word2vec_sen_vec_sk(df, kurtosis, q_sen_vec, t_sen_vec, prefix, "Kurtosis")

    del q_sen_vec, t_sen_vec
    gc.collect()

    return df

In [117]:
# Extract the skew/kurtosis sentence-vector features for the training data.
# Unlike the other cells, this one passes process_chunkly=True.
if regen_word2vec_sen_vec_other:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name,
        process_func=process_word2vec_sen_vec_other, names=ORI_TRAIN_NAMES,
        process_chunkly=True, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_id', 'query_title_id', 'label'], 
        drop_last_cols=['query', 'title'])

## Magic特征

In [119]:
feature_save_dir = base_feature_save_dir + "magic/"
train_feature_prefix = base_prefix + "trainMagic"
test_feature_prefix = base_prefix + "testMagic"
! ls -alh ./outputs/magic

ls: ./outputs/magic: No such file or directory


### 提取query频率

In [120]:
# Regeneration switch and output file names for the query-frequency feature.
regen_query_freq = False
train_query_freq = "{}_queryFreq".format(train_feature_prefix)
train_freq_save_file = "{}{}.csv.gz".format(feature_save_dir, train_query_freq)
# The test-side names exist only in online mode.
if not OFFLINE:
    test_query_freq = "{}_queryFreq".format(test_feature_prefix)
    test_freq_save_file = "{}{}.csv.gz".format(feature_save_dir, test_query_freq)

In [121]:
def ExtractFreq(sourcefile, savefile, names, group_by, sort_col, feature_name):
    """Write, for every row, the size of the group the row belongs to.

    The file is read with ``ReadCSV``, rows are grouped by ``group_by`` and
    the non-null values of ``sort_col`` are counted per group. The count is
    saved as a single gzip-compressed CSV column named ``feature_name``.

    The count is broadcast back with ``transform`` so that every row gets
    the count of its own group. Repeating the per-group counts in group
    order, as was done before, is only row-aligned when the file happens to
    be sorted by ``group_by``, and it fails on an empty file.
    """
    df = ReadCSV(sourcefile, names=names, iterator=False)
    res = df.groupby(group_by)[sort_col].transform("count").values
    tmp = pd.DataFrame({
        feature_name: res
    })
    tmp.to_csv(savefile, compression="gzip", index=None)
    print("Freq feature saved to {}".format(savefile))
    del df, res, tmp
    gc.collect()

In [122]:
# Query-frequency feature for the training data: count query_title_id per
# query_id and save it as the column "query_freq".
if regen_query_freq:
    ExtractFreq(train_file, 
        train_freq_save_file, 
        ORI_TRAIN_NAMES, 
        "query_id",
        "query_title_id",
        "query_freq")

In [123]:
# Query-frequency feature for the test data (online mode only).
if regen_query_freq and not OFFLINE:
    ExtractFreq(test_file, 
        test_freq_save_file, 
        ORI_TEST_NAMES, 
        "query_id",
        "query_title_id",
        "query_freq")

### 提取差集Sim

In [155]:
# Regeneration switch and feature-family name for the diff-set similarities.
regen_diff_sim, feature_name = False, "diffSetSim"
# Set-overlap measures applied to (query tokens, title diff-set) pairs.
similar_arr = [
    getattr(dist_utils, metric_name)
    for metric_name in ("dice_ratio", "jaccard_ratio", "edit_seq_ratio", "edit_set_ratio")
]
# Path stem of the cached per-title diff sets; the tokenizer label and
# ".npy" are appended by the functions that use it.
diffSetTitle_baseDir = "{}{}Title".format(feature_save_dir, feature_name)

In [139]:
def findDiff(arr, ngram):
    """For each text, return the tokens that no differing text contains.

    Every text in ``arr`` is turned into ``set(ngram(text))``. From each
    set, the members of every other set that is not equal to it are removed;
    sets equal to it (including itself) are skipped. Returns one list of
    remaining tokens per input text, in input order.
    """
    token_sets = FuncMap1(lambda text: set(ngram(text)), arr)
    unique_tokens = []
    for own_tokens in token_sets:
        remaining = own_tokens
        for other_tokens in token_sets:
            if own_tokens != other_tokens:
                remaining = remaining.difference(other_tokens)
        unique_tokens.append(list(remaining))
    return unique_tokens

In [140]:
def GetDiffSet(query_id, title, ngram):
    """Compute per-title diff sets within runs of equal ``query_id``.

    Consecutive rows with the same ``query_id`` form one group; each group's
    titles are passed to ``findDiff``. The per-group results are flattened
    into one list with one entry per row, in row order.
    Both inputs are indexed with ``[0 .. len-1]``.
    """
    per_group, current_titles = [], []
    for pos in range(len(query_id)):
        # A change of query_id closes the running group.
        if pos > 0 and query_id[pos] != query_id[pos-1]:
            per_group.append(findDiff(current_titles, ngram))
            current_titles = []
        current_titles.append(title[pos])
    # Flush the last group.
    per_group.append(findDiff(current_titles, ngram))
    return [diff for group in per_group for diff in group]

In [143]:
def run_diff_set_sim(df, ngram, prefix, name):
    """Add similarities between the query tokens and the title diff sets.

    The title diff sets come from ``GetDiffSet`` and are cached in
    ``diffSetTitle_baseDir + name + ".npy"``. One column
    ``<prefix>_diffSetSim<metric.__name__>`` is added per entry of the
    module-level ``similar_arr``.

    Args:
        df: frame with ``query_id``, ``query`` and ``title`` columns.
        ngram: tokenizer applied to each text.
        prefix: column-name prefix.
        name: tokenizer label used in the cache file name.
    """
    with Timer("cal diff set"):
        t_savefile = diffSetTitle_baseDir + name +".npy"
        q_list = df['query'].apply(lambda x: set(ngram(x)))
        t_list = None
        if os.path.exists(t_savefile):
            cached = np.load(t_savefile, allow_pickle=True)
            # The cache path does not say which dataset produced it, so it is
            # only trusted when it has exactly one entry per row of this frame.
            if len(cached) == len(df):
                t_list = cached
        if t_list is None:
            t_list = GetDiffSet(df["query_id"], df["title"], ngram)
            # Pack into a 1-D object array explicitly: the per-title lists have
            # different lengths, and recent NumPy refuses to build such a
            # ragged array implicitly.
            packed = np.empty(len(t_list), dtype=object)
            for pos, diff in enumerate(t_list):
                packed[pos] = diff
            np.save(t_savefile, packed)
            print("Title sent saved to {}".format(t_savefile))

    with Timer("extract diffSet sim"):
        for metric in similar_arr:
            # Separate variable so the `name` parameter is not overwritten.
            metric_name = metric.__name__
            with Timer("cal {} sim".format(metric_name)):
                df["{}_diffSetSim{}".format(prefix, metric_name)] = FuncMap2(metric, q_list, t_list)

    del q_list, t_list
    gc.collect()
    return df
    
def process_diff_set_sim(df, save_dir, prefix, feature_name):
    """Callback wrapper: diff-set similarities on unigram tokens.

    ``save_dir`` and ``feature_name`` are accepted but not used here.
    """
    return run_diff_set_sim(df, ngram_utils.unigrams, prefix, "Unigram")

In [144]:
# Extract the diff-set similarity features for the training data.
# 'query_id' is listed in drop_last_cols here because process_diff_set_sim
# reads df["query_id"].
if regen_diff_sim:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_diff_set_sim, 
        names=ORI_TRAIN_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['label', 'query_title_id'], drop_last_cols=['query_id', 'query', 'title'])

In [145]:
# Extract the diff-set similarity features for the test data (online mode only).
if regen_diff_sim and not OFFLINE:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
        process_func=process_diff_set_sim, 
        names=ORI_TEST_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_title_id'], drop_last_cols=['query_id', 'query', 'title'])

### 提取差集Len

In [165]:
# Regeneration switch and feature-family name for the diff-set length features.
regen_diffSet_len, feature_name = False, "diffSetLen"

In [173]:
def run_diff_set_len(df, ngram, prefix, name):
    """Add size statistics of the query token set and the title diff set.

    The title diff sets come from ``GetDiffSet`` and are cached in
    ``diffSetTitle_baseDir + name + ".npy"``. Columns added (all prefixed
    with ``prefix``): ``_diffQLen``, ``_diffTLen``, both ratios,
    ``_diffQTDiff``, ``_diffQTMax``, ``_diffQTMin`` and ``_diffQTAvg``.
    """
    def ratio(a, b):
        # A diff set (or an empty query) can have size 0; report 0.0 for the
        # ratio instead of raising ZeroDivisionError.
        return a / b if b else 0.0

    with Timer("cal diff set"):
        t_savefile = diffSetTitle_baseDir + name +".npy"
        q_list = df['query'].apply(lambda x: set(ngram(x)))
        t_list = None
        if os.path.exists(t_savefile):
            cached = np.load(t_savefile, allow_pickle=True)
            # The cache path does not say which dataset produced it, so it is
            # only trusted when it has exactly one entry per row of this frame.
            if len(cached) == len(df):
                t_list = cached
        if t_list is None:
            t_list = GetDiffSet(df["query_id"], df["title"], ngram)
            # Pack into a 1-D object array explicitly: the per-title lists have
            # different lengths, and recent NumPy refuses to build such a
            # ragged array implicitly.
            packed = np.empty(len(t_list), dtype=object)
            for pos, diff in enumerate(t_list):
                packed[pos] = diff
            np.save(t_savefile, packed)
            print("Title sent saved to {}".format(t_savefile))
    with Timer("extract diffSet len"):
        q_len = FuncMap1(len, q_list)
        t_len = FuncMap1(len, t_list)
        df['%s_diffQLen' % prefix] = q_len
        df['%s_diffTLen' % prefix] = t_len
        df['%s_diffQTLenRatio'%prefix] = FuncMap2(ratio, q_len, t_len)
        df['%s_diffTQLenRatio'%prefix] = FuncMap2(lambda a, b: ratio(b, a), q_len, t_len)
        df['%s_diffQTDiff'%prefix] = FuncMap2(lambda a, b: abs(a-b), q_len, t_len)
        df['%s_diffQTMax'%prefix] = FuncMap2(lambda a, b: max(a, b), q_len, t_len)
        df['%s_diffQTMin'%prefix] = FuncMap2(lambda a, b: min(a, b), q_len, t_len)
        df['%s_diffQTAvg'%prefix] = FuncMap2(lambda a, b: (a+b)/2, q_len, t_len)
    del q_list, t_list, q_len, t_len
    gc.collect()
    return df
    
def process_diff_set_len(df, prefix, savefile, *extra):
    """Callback wrapper: diff-set length features on unigram tokens.

    Every other ``process_func`` callback in this notebook takes four
    arguments ``(df, save_dir, prefix, feature_name)``, while this one only
    declared ``(df, prefix, savefile)``. To work with both conventions:

    * three positional arguments: ``prefix`` is used as the column prefix
      (unchanged behaviour);
    * four positional arguments: they are read as ``(df, save_dir, prefix,
      feature_name)``, so the third one is the column prefix.

    NOTE(review): assumes ExtractFeature passes these arguments
    positionally - confirm against helper.py.
    """
    if extra:
        # Four-argument convention: the real prefix sits in the third slot.
        prefix = savefile
    df = run_diff_set_len(df, ngram_utils.unigrams, prefix, "Unigram")
    return df

In [174]:
# Extract the diff-set length features for the training data.
# Skipped unless regen_diffSet_len is set.
if regen_diffSet_len:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_diff_set_len, 
        names=ORI_TRAIN_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['label', 'query_title_id'], drop_last_cols=['query_id', 'query', 'title'])

In [175]:
# Extract the diff-set length features for the test data (online mode only).
if regen_diffSet_len and not OFFLINE: 
    # 'label' is not listed in drop_first_cols, matching the diff-set
    # similarity test cell (the test file is read with ORI_TEST_NAMES and
    # presumably has no 'label' column - confirm against helper.py).
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
        process_func=process_diff_set_len, 
        names=ORI_TEST_NAMES, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_title_id'], drop_last_cols=['query_id', 'query', 'title'])

### 提取差集tfidf

In [148]:
from np_utils import try_divide
# TF-IDF settings used to locate the pickled TF-IDF model.
ngram_range = (1, 4)
ngram_level = "word"
# NOTE(review): "./oututs/" looks like a typo for "./outputs/". The same
# spelling is used by the other tfidf_model_file assignments in this
# notebook, so change them together.
tfidf_model_file = "./oututs/models/tfidf_{}_ngram{}{}.bin".format(ngram_level, *ngram_range)
# Regeneration switch and feature-family name for the diff-set TF-IDF features.
regen_diffSet_tfidf_len, feature_name = False, "diffSetTfidfLen"

In [149]:
# Load the pickled TF-IDF model and expose its vocabulary_ mapping and idf_
# array as the globals read by get_uni_sum_tfidf / run_diff_set_tfidf_len.
# NOTE: pickle.load executes arbitrary code; only load trusted files.
if regen_diffSet_tfidf_len:
    with open(tfidf_model_file, "rb") as ff:
        tfidf_model = pickle.load(ff)
        tfidf_vocab = tfidf_model.vocabulary_
        tfidf_idf = tfidf_model.idf_

In [48]:
def get_uni_sum_tfidf(ngrams):
    """Sum ``idf / len(ngrams)`` over the tokens found in ``tfidf_vocab``.

    Tokens missing from the vocabulary contribute nothing but still count
    towards ``len(ngrams)``. Returns 0 when nothing matches. Reads the
    module-level ``tfidf_idf`` and ``tfidf_vocab``.
    """
    total_count = len(ngrams)
    weight_sum = 0
    for token in ngrams:
        if token in tfidf_vocab:
            weight_sum += tfidf_idf[tfidf_vocab[token]] / total_count
    return weight_sum

In [176]:
def run_diff_set_tfidf_len(df, ngram, prefix, savefile, name="Unigram"):
    """Add TF-IDF weight statistics of the query versus the title diff set.

    The title diff sets come from ``GetDiffSet`` and are cached in
    ``diffSetTitle_baseDir + name + ".npy"``. Columns added (all prefixed
    with ``prefix``): ``_TLen``, both ratios (via ``try_divide``),
    ``_QTDiff``, ``_QTMax``, ``_QTMin``, ``_QTAvg`` and ``_QTMulti``.

    Args:
        df: frame with ``query_id``, ``query`` and ``title`` columns.
        ngram: tokenizer applied to each title.
        prefix: column-name prefix.
        savefile: accepted for compatibility; not used.
        name: tokenizer label used in the cache file name. The body used to
            read an undefined ``name`` (NameError); it is now an optional
            parameter defaulting to "Unigram", the label the other diff-set
            features use.

    Reads the module-level ``tfidf_model`` (and, through
    ``get_uni_sum_tfidf``, ``tfidf_idf`` / ``tfidf_vocab``).
    """
    with Timer("cal diff set"):
        t_savefile = diffSetTitle_baseDir + name +".npy"
        t_list = None
        if os.path.exists(t_savefile):
            cached = np.load(t_savefile, allow_pickle=True)
            # The cache path does not say which dataset produced it, so it is
            # only trusted when it has exactly one entry per row of this frame.
            if len(cached) == len(df):
                t_list = cached
        if t_list is None:
            t_list = GetDiffSet(df["query_id"], df["title"], ngram)
            # Pack into a 1-D object array explicitly: the per-title lists have
            # different lengths, and recent NumPy refuses to build such a
            # ragged array implicitly.
            packed = np.empty(len(t_list), dtype=object)
            for pos, diff in enumerate(t_list):
                packed[pos] = diff
            np.save(t_savefile, packed)
            print("Title sent saved to {}".format(t_savefile))
    with Timer("extract similarity"):
        t_vec_len = list(map(get_uni_sum_tfidf, t_list))
        # ravel() keeps one entry per row even for a single-row frame, where
        # squeeze() would produce a 0-d array that cannot be iterated.
        q_vec_len = np.asarray(tfidf_model.transform(df["query"]).sum(1)).ravel()
        df['%s_TLen' % prefix] = t_vec_len
        df['%s_QTLenRatio'%prefix] = list(map(lambda a, b: try_divide(a, b), q_vec_len, t_vec_len))
        df['%s_TQLenRatio'%prefix] = list(map(lambda a, b: try_divide(b, a), q_vec_len, t_vec_len))
        df['%s_QTDiff'%prefix] = list(map(lambda a, b: abs(a-b), q_vec_len, t_vec_len))
        df['%s_QTMax'%prefix] = list(map(lambda a, b: max(a, b), q_vec_len, t_vec_len))
        df['%s_QTMin'%prefix] = list(map(lambda a, b: min(a, b), q_vec_len, t_vec_len))
        df['%s_QTAvg'%prefix] = list(map(lambda a, b: (a+b)/2, q_vec_len, t_vec_len))
        df['%s_QTMulti'%prefix] = list(map(lambda a, b: a*b, q_vec_len, t_vec_len))
    del t_list, t_vec_len, q_vec_len
    gc.collect()
    return df
    
def process_diff_set_tfidf_len(df, save_dir, prefix, feature_name):
    """Callback wrapper: diff-set TF-IDF features on unigram tokens.

    Builds ``<save_dir>/<prefix>_<feature_name>`` and passes it, with
    "Unigram" appended, as the ``savefile`` argument; the column prefix is
    ``<prefix>Unigram``.
    """
    label = "Unigram"
    savefile = os.path.join(save_dir, '%s_%s' % (prefix, feature_name))
    unigram_prefix = "{}{}".format(prefix, label)
    unigram_savefile = "{}{}".format(savefile, label)
    return run_diff_set_tfidf_len(df, ngram_utils.unigrams, unigram_prefix, unigram_savefile)

In [177]:
# Extract the diff-set TF-IDF features for the training data.
# Skipped unless regen_diffSet_tfidf_len is set.
if regen_diffSet_tfidf_len:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
        process_func=process_diff_set_tfidf_len, names=ORI_TRAIN_NAMES, process_chunkly=False, 
        chunk_size=CHUNK_SIZE, drop_first_cols=['label', 'query_title_id'], 
        drop_last_cols=['query_id', 'query', 'title'])

### 点击率特征

In [185]:
# Regeneration switch and feature-family name for the local CTR feature.
regen_ctr_rate, feature_name = False, "ctrLocal"
# Shared accumulator filled by ExtractLocalCTR on its first call:
# group size -> [number of groups, summed label].
stat = dict()

In [192]:
def ExtractLocalCTR(df, save_dir, prefix, feature_name):
    """Add a click-through rate that depends on the size of the query group.

    On the first call (while the module-level ``stat`` is empty) the
    statistics are collected from ``df``: for every group size ``n`` the
    number of ``query_id`` groups of that size and the sum of their
    ``label`` values. Every row then receives
    ``summed_label / (n * number_of_groups)`` for the size ``n`` of its own
    group, stored in ``<prefix>_localCTR``.

    Later calls reuse ``stat`` and do not read ``label``; a group size that
    was never seen raises ``KeyError``.
    ``save_dir`` and ``feature_name`` are accepted but not used here.

    Changes from the earlier version:
    * the group sizes are taken from a plain ``groupby`` -
      ``groupby(..., as_index=False).size()`` returns a DataFrame on
      pandas >= 1.1, which made ``.values`` two-dimensional;
    * the rate is looked up per row instead of repeating per-group values
      in group order, which is only row-aligned for data sorted by
      ``query_id``.
    """
    global stat
    group_sizes = df.groupby("query_id").size()
    count = group_sizes.values
    if not stat:
        label = df.groupby("query_id")["label"].sum().values.astype("int32")
        for _ in range(len(count)):
            item_len = count[_]
            if item_len not in stat:
                stat[item_len] = [1, label[_]]
            else:
                stat[item_len][0] += 1
                stat[item_len][1] += label[_]
        del label
    # Size of the group each individual row belongs to, in row order.
    row_sizes = df["query_id"].map(group_sizes)
    df["{}_localCTR".format(prefix)] = [
        stat[size][1] / (size * stat[size][0]) for size in row_sizes
    ]
    del row_sizes, group_sizes
    gc.collect()
    return df

In [189]:
# Extract the local CTR feature for the training data.
# No chunk_size is passed here; 'query_id' and 'label' are listed in
# drop_last_cols because ExtractLocalCTR reads both.
if regen_ctr_rate:
    ExtractFeature(train_file, feature_save_dir, train_feature_prefix, feature_name, 
    ExtractLocalCTR, names=ORI_TRAIN_NAMES,process_chunkly=False, 
    drop_first_cols=['query_title_id', 'query', 'title'], drop_last_cols=['query_id', 'label'])

In [190]:
# Extract the local CTR feature for the test data (online mode only).
# The callback was misspelled `ExtractLocalCRT`, which is not defined.
# ExtractLocalCTR reuses the `stat` table filled by the training cell above,
# so that cell has to run first.
if not OFFLINE and regen_ctr_rate:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, feature_name, 
    ExtractLocalCTR, names=ORI_TEST_NAMES,process_chunkly=False, 
    drop_first_cols=['query_title_id', 'query', 'title'], drop_last_cols=['query_id'])