# 特征提取

In [1]:
import sys, os, gc
import numpy as np
import pandas as pd
sys.path.append("./global")
from helper import ReadCSV, Timer, ExtractFeature, ORI_TRAIN_NAMES, ORI_TRAIN_DTYPE, ORI_TEST_NAMES, ORI_TEST_DTYPE, AnalysisCSV, OFFLINE, cal_sim
import dist_utils, ngram_utils
# Locations shared by both run modes.
test_file = "./stage1/input/test.csv"
base_feature_save_dir = "./stage1/output/"
# OFFLINE (imported from helper) picks the file-name prefix, the training
# file and the chunk size used by every section below.
base_prefix, train_file, CHUNK_SIZE = (
    ("debug_", "./stage1/input/train_last_50w.csv", 500000)
    if OFFLINE
    else ("online_", "./stage1/input/train_last_1000w.csv", 1000000)
)
print(OFFLINE)

True


In [11]:
# Scratch cell: loads a stage-2 feature array into `tmp`.
# NOTE(review): no visible feature-extraction cell reads this array, and `tmp`
# is rebound later in the notebook — confirm whether this load is still needed.
tmp = np.load("./stage2/input/online_train_concat_feature.npy")

In [2]:
# print(tmp[:5])

In [4]:
! pip install  --index "http://pypi/simple" --trusted-host pypi fuzzywuzzy

Collecting fuzzywuzzy
  Downloading http://pypi/simple/fuzzywuzzy/fuzzywuzzy-0.17.0-py2.py3-none-any.whl
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.17.0


## 文本挖掘特征

In [5]:
from fuzzywuzzy import fuzz
# Output directory and file-name prefixes for the text-mining features.
# These three globals are rebound by each later section of the notebook, so the
# extraction cells below depend on this cell having been the last one of its kind to run.
feature_save_dir = base_feature_save_dir + "text_mining_feature/"
train_feature_prefix = base_prefix + "trainTextMining"
test_feature_prefix = base_prefix + "testTextMining"
# List the feature files already present in that directory.
! ls -alh ./stage1/output/text_mining_feature/

total 945M
drwxr-xr-x 2 kesci root  4.0K Jun 20 16:39 .
drwxr-xr-x 7 kesci root  4.0K Jun 19 09:08 ..
-rw-r--r-- 1 kesci users 4.6M Jun 20 11:29 debug_trainTextMining_len.csv.gz
-rw-r--r-- 1 kesci users  26M Jun 20 11:41 debug_trainTextMining_ngramSim.csv.gz
-rw-r--r-- 1 kesci users  47M Jun 20 13:22 online_testTextMining_len.csv.gz
-rw-r--r-- 1 kesci users 262M Jun 20 16:46 online_testTextMining_ngramSim.csv.gz
-rw-r--r-- 1 kesci users  92M Jun 20 13:19 online_trainTextMining_len.csv.gz
-rw-r--r-- 1 kesci users 515M Jun 20 15:35 online_trainTextMining_ngramSim.csv.gz


提取长度特征

In [6]:
# Switch read by the length-feature extraction cells; they do nothing while it is False.
regen_len_feature = False

In [4]:
def run_text_len(df, ngram, prefix):
    """Add query/title length features and their ratios to ``df``.

    Args:
        df: DataFrame with 'query' and 'title' columns. It is modified in
            place and also returned.
        ngram: Optional callable applied to every query and title before the
            length is taken. When falsy, ``len`` is taken of the raw values.
        prefix: String prepended to every generated column name.

    Returns:
        ``df`` with four added columns: ``<prefix>_q-len``, ``<prefix>_t-len``,
        ``<prefix>_qt-len-radio`` (q-len / t-len) and ``<prefix>_tq-len-radio``
        (t-len / q-len). A ratio whose denominator is 0 is stored as 0.0
        instead of raising ZeroDivisionError.
    """
    if ngram:
        q_list = df['query'].apply(ngram)
        t_list = df['title'].apply(ngram)
    else:
        q_list = df['query']
        t_list = df['title']
    with Timer("extract length"):
        # Series.map(len) also works on an empty frame, where np.vectorize(len) raises.
        q_len = np.asarray(q_list.map(len), dtype=np.int64)
        t_len = np.asarray(t_list.map(len), dtype=np.int64)
        df['%s_q-len' % prefix] = q_len
        df['%s_t-len' % prefix] = t_len
        q_float = q_len.astype(np.float64)
        t_float = t_len.astype(np.float64)
        # `where=` skips rows with a zero denominator; those keep the 0.0 from `out`.
        df['%s_qt-len-radio' % prefix] = np.divide(
            q_float, t_float, out=np.zeros_like(q_float), where=t_len != 0)
        df['%s_tq-len-radio' % prefix] = np.divide(
            t_float, q_float, out=np.zeros_like(t_float), where=q_len != 0)
    del q_list, t_list
    gc.collect()
    return df
def process_text_len(df, prefix, savefile):
    """Add length features for the raw text and for its unigrams.

    Calls ``run_text_len`` twice: once without a tokenizer (column tag
    'text') and once with ``ngram_utils.unigrams`` (column tag 'unigrams').
    ``savefile`` is accepted but not used here.
    """
    variants = (
        ('text', None),
        ('unigrams', ngram_utils.unigrams),
    )
    for tag, tokenizer in variants:
        df = run_text_len(df, tokenizer, '%s_%s' % (prefix, tag))
    return df

In [5]:
# Extract the length features for the training data (only when regeneration is switched on).
if regen_len_feature:
    ExtractFeature(
        train_file,
        feature_save_dir,
        train_feature_prefix,
        "len",
        process_func=process_text_len,
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['query_id', 'query_title_id', 'label'],
        drop_last_cols=['query', 'title'],
    )

----->Started 'extract len feature' block...
Mem. usage decreased to 152.59 Mb (0.0% reduction)
----->Started 'extract length' block...
----->Finished 'extract length' block, time used:10.52s.
----->Started 'extract length' block...
----->Finished 'extract length' block, time used:10.98s.
saved len feature to ./stage1/output/text_mining_feature/online_trainTextMining_len.csv.gz
----->Finished 'extract len feature' block, time used:318.4s.


In [15]:
# AnalysisCSV("./stage1/output/text_mining_feature/debug_trainTextMining_feature_len.csv")

In [6]:
# Extract the length features for the test data (online runs only).
if regen_len_feature and not OFFLINE:
    ExtractFeature(
        test_file,
        feature_save_dir,
        test_feature_prefix,
        "len",
        process_func=process_text_len,
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['query_id', 'query_title_id'],
        drop_last_cols=['query', 'title'],
    )

----->Started 'extract len feature' block...
Mem. usage decreased to 76.29 Mb (0.0% reduction)
----->Started 'extract length' block...
----->Finished 'extract length' block, time used:5.65s.
----->Started 'extract length' block...
----->Finished 'extract length' block, time used:5.59s.
saved len feature to ./stage1/output/text_mining_feature/online_testTextMining_len.csv.gz
----->Finished 'extract len feature' block, time used:165.49s.


In [11]:
# AnalysisCSV("./stage1/output/text_mining_feature/debug_train_text_mining_feature_len.csv")

time: 989 µs


提取集合相似度特征

In [10]:
# Switch read by the n-gram similarity extraction cells.
regen_ngram_similarity_feature = False
# Similarity functions applied to every (query, title) pair; each function's
# __name__ becomes part of the generated column name.
# NOTE(review): `similar_arr` is rebound by later sections (word2vec, diff-set),
# so run_ngram_similarity uses whichever list was assigned last.
similar_arr = [
    dist_utils.dice_ratio,
    # dist_utils.jaccard_ratio,
    dist_utils.edit_seq_ratio,
    dist_utils.edit_set_ratio,
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.token_sort_ratio
]

In [8]:
def run_ngram_similarity(df, ngram, prefix):
    """Add one similarity column per function in the global ``similar_arr``.

    Args:
        df: DataFrame with 'query' and 'title' columns; modified in place and returned.
        ngram: Optional callable applied to each query/title before comparison.
            When falsy the raw column values are compared.
        prefix: Column-name prefix; each column is named ``<prefix>_<function __name__>``.

    Returns:
        ``df`` with the similarity columns added.
    """
    queries, titles = df['query'], df['title']
    if ngram:
        queries = queries.apply(ngram)
        titles = titles.apply(ngram)
    # Similarity features (each function is timed separately).
    with Timer("extract similarity"):
        for sim_func in similar_arr:
            sim_name = sim_func.__name__
            with Timer("cal {} sim".format(sim_name)):
                df["{}_{}".format(prefix, sim_name)] = cal_sim(queries, titles, sim_func)
    del queries, titles
    gc.collect()
    return df

def process_ngram_similarity(df, prefix, savefile):
    """Add similarity features for raw text, unigrams, bigrams and trigrams.

    Runs ``run_ngram_similarity`` once per representation, in that order,
    tagging the columns 'text', 'unigrams', 'bigrams' and 'trigrams'.
    ``savefile`` is accepted but not used here.
    """
    variants = (
        ('text', None),
        ('unigrams', ngram_utils.unigrams),
        ('bigrams', ngram_utils.bigrams),
        ('trigrams', ngram_utils.trigrams),
    )
    for tag, tokenizer in variants:
        df = run_ngram_similarity(df, tokenizer, '%s_%s' % (prefix, tag))
    return df

In [11]:
# Extract the n-gram similarity features for the training data.
if regen_ngram_similarity_feature:
    ExtractFeature(
        train_file,
        feature_save_dir,
        train_feature_prefix,
        "ngramSim",
        process_func=process_ngram_similarity,
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['query_id', 'query_title_id', 'label'],
        drop_last_cols=['query', 'title'],
    )

----->Started 'extract ngramSim feature' block...
Mem. usage decreased to 152.59 Mb (0.0% reduction)
----->Started 'extract similarity' block...
----->Started 'cal dice_ratio sim' block...
----->Finished 'cal dice_ratio sim' block, time used:35.5s.
----->Started 'cal edit_seq_ratio sim' block...
----->Finished 'cal edit_seq_ratio sim' block, time used:132.57s.
----->Started 'cal edit_set_ratio sim' block...
----->Finished 'cal edit_set_ratio sim' block, time used:176.85s.
----->Started 'cal ratio sim' block...
----->Finished 'cal ratio sim' block, time used:70.39s.
----->Started 'cal partial_ratio sim' block...
----->Finished 'cal partial_ratio sim' block, time used:330.6s.
----->Started 'cal token_sort_ratio sim' block...
----->Finished 'cal token_sort_ratio sim' block, time used:272.39s.
----->Finished 'extract similarity' block, time used:1018.32s.
----->Started 'extract similarity' block...
----->Started 'cal dice_ratio sim' block...
----->Finished 'cal dice_ratio sim' block, time 

In [21]:
# AnalysisCSV("./stage1/output/text_mining_feature/debug_trainTextMining_feature_ngramSimilar.csv")

In [12]:
# Extract the n-gram similarity features for the test data (online runs only).
if regen_ngram_similarity_feature and not OFFLINE:
    ExtractFeature(
        test_file,
        feature_save_dir,
        test_feature_prefix,
        "ngramSim",
        process_func=process_ngram_similarity,
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['query_id', 'query_title_id'],
        drop_last_cols=['query', 'title'],
    )

----->Started 'extract ngramSim feature' block...
Mem. usage decreased to 76.29 Mb (0.0% reduction)
----->Started 'extract similarity' block...
----->Started 'cal dice_ratio sim' block...
----->Finished 'cal dice_ratio sim' block, time used:17.78s.
----->Started 'cal edit_seq_ratio sim' block...
----->Finished 'cal edit_seq_ratio sim' block, time used:61.66s.
----->Started 'cal edit_set_ratio sim' block...
----->Finished 'cal edit_set_ratio sim' block, time used:90.05s.
----->Started 'cal ratio sim' block...
----->Finished 'cal ratio sim' block, time used:33.92s.
----->Started 'cal partial_ratio sim' block...
----->Finished 'cal partial_ratio sim' block, time used:171.25s.
----->Started 'cal token_sort_ratio sim' block...
----->Finished 'cal token_sort_ratio sim' block, time used:136.22s.
----->Finished 'extract similarity' block, time used:510.89s.
----->Started 'extract similarity' block...
----->Started 'cal dice_ratio sim' block...
----->Finished 'cal dice_ratio sim' block, time us

## 向量空间特征

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pickle
# Output directory and file-name prefixes for the vector-space section.
# Rebinds the same globals that the text-mining section set earlier.
feature_save_dir = base_feature_save_dir + "vector_space/"
train_feature_prefix = base_prefix + "trainVectorSpace"
test_feature_prefix = base_prefix + "testVectorSpace"
# List what already exists in that directory.
! ls -alh ./stage1/output/vector_space/

total 89M
drwxr-xr-x 2 kesci root  4.0K Jun 20 11:43 .
drwxr-xr-x 7 kesci root  4.0K Jun 19 09:08 ..
-rw-r--r-- 1 kesci users  89M Jun 20 11:43 debug_tfidf_model.bin


训练tfidf模型

In [14]:
# Switch read by the TF-IDF training cell.
regen_tfidf_model = True
# Path the fitted vectorizer is pickled to; train_tfidf reads this global.
if OFFLINE:
    tfidf_model_file = feature_save_dir+"debug_tfidf_model.bin"
else:
    tfidf_model_file = feature_save_dir+"online_tfidf_model.bin"
! ls -lsh ./stage1/output/vector_space

total 89M
89M -rw-r--r-- 1 kesci users 89M Jun 20 11:43 debug_tfidf_model.bin


In [15]:
def train_tfidf(df, prefix, savefile):
    """Fit a word-level TF-IDF vectorizer and pickle it to ``tfidf_model_file``.

    The corpus is the unique queries followed by every title of ``df``.
    The output path comes from the global ``tfidf_model_file``; ``prefix``
    and ``savefile`` are accepted but not used.

    Returns:
        None.
    """
    vectorizer_options = dict(
        norm="l2",
        strip_accents="unicode",
        analyzer="word",
        ngram_range=(1, 2),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        min_df=2,
        max_df=0.9,
    )
    word_tfidf = TfidfVectorizer(**vectorizer_options)
    unique_queries = df["query"].unique()
    all_titles = df["title"].values
    print(unique_queries.shape)
    print(all_titles.shape)
    corpus = np.concatenate([unique_queries, all_titles])
    # Drop the local reference to the frame before fitting.
    del df
    gc.collect()
    with Timer("tf-idf fit"):
        word_tfidf.fit(corpus)
        with open(tfidf_model_file, "wb") as model_out:
            pickle.dump(word_tfidf, model_out)
            print("Model dumped to {}".format(tfidf_model_file))
    return None

In [16]:
# Fit the TF-IDF model on the training data.
if regen_tfidf_model:
    ExtractFeature(
        train_file,
        feature_save_dir,
        train_feature_prefix,
        "tfidf",
        process_func=train_tfidf,
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['query_id', 'query_title_id', 'label'],
        drop_last_cols=['query', 'title'],
    )

----->Started 'extract tfidf feature' block...
Mem. usage decreased to 152.59 Mb (0.0% reduction)
(1512239,)
(10000000,)
----->Started 'tf-idf fit' block...
Model dumped to ./stage1/output/vector_space/online_tfidf_model.bin
----->Finished 'tf-idf fit' block, time used:748.26s.
----->Finished 'extract tfidf feature' block, time used:773.66s.


## 提取word2vec特征

In [2]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
import pickle
# Output directory and file-name prefixes for the word2vec section
# (rebinds the globals set by the earlier sections).
feature_save_dir = base_feature_save_dir + "word2vec/"
train_feature_prefix = base_prefix + "trainWord2vec"
test_feature_prefix = base_prefix + "testWord2vec"
# List what already exists in that directory.
! ls -alh ./stage1/output/word2vec

total 874M
drwxr-xr-x 2 kesci root  4.0K Jun 22 01:22 .
drwxr-xr-x 7 kesci root  4.0K Jun 19 09:08 ..
-rw-r--r-- 1 kesci users  29M Jun 22 01:17 debug_trainWord2vec_senVecavgSim.csv.gz
-rw-r--r-- 1 kesci users 282M Jun 21 12:33 online_testWord2vec_senVecavgSim.csv.gz
-rw-r--r-- 1 kesci users 564M Jun 21 09:21 online_trainWord2vec_senVecavgSim.csv.gz


### 生成word2vec句子相似度

In [3]:
# Switch read by the sentence-vector similarity cells.
regen_word2vec_sen_vec_sim = True
word2vec_model_file = "./stage1/input/word2vec.kv"
# NOTE(review): this rebinds the global `tfidf_model_file` to the online model
# regardless of OFFLINE; train_tfidf writes to the same global, so re-running
# the TF-IDF training cell after this one would target this path — confirm intent.
tfidf_model_file = "./stage1/output/vector_space/online_tfidf_model.bin"
# Mode string read by process_word2vec_sen_vec ("avg" or "tfidf").
process_word2vec_sen_vec_mode = "tfidf"
from scipy.spatial.distance import braycurtis, canberra, chebyshev, cityblock, correlation, cosine, euclidean, sqeuclidean
# Distance functions applied to each (query vector, title vector) pair.
# `correlation` is imported above but not included in this list.
similar_dis = [
    braycurtis, 
    canberra, chebyshev, 
    cityblock, 
    cosine, euclidean, 
    sqeuclidean
]
# similar_radio = [
#     fuzz.ratio,
#     fuzz.partial_ratio,
#     fuzz.token_sort_ratio
# ]
# Rebinds the global list that the earlier text-mining section also uses.
similar_arr = similar_dis


In [28]:
# print(similar_arr)

In [4]:
# Load the word vectors (memory-mapped read-only) and the pickled TF-IDF model.
if regen_word2vec_sen_vec_sim:
    word2vec_model = KeyedVectors.load(word2vec_model_file, mmap='r')
    # pickle.load executes arbitrary code from the file; only load model files this project produced.
    with open(tfidf_model_file, "rb") as ff:
        tfidf_model = pickle.load(ff)
    # Term -> column index mapping of the fitted vectorizer.
    tfidf_vocab = tfidf_model.vocabulary_

In [38]:
def getTfidfWeights1(raw_docs):
    """Return the TF-IDF weight of every whitespace token of each document.

    Uses the global fitted ``tfidf_model`` and its vocabulary ``tfidf_vocab``.

    Args:
        raw_docs: Iterable of strings.

    Returns:
        A list with one dict per document, mapping each token from
        ``doc.split()`` that is in ``tfidf_vocab`` to its weight in that
        document's TF-IDF row. A token the vectorizer did not emit for the
        document gets 0.0.
    """
    global tfidf_model, tfidf_vocab
    with Timer("get tfidf weights"):
        X = tfidf_model.transform(raw_documents=raw_docs)
        print(X)
        X = X.tocsr()
        res = []
        for i, doc in enumerate(raw_docs):
            # Read the row's non-zero entries straight from the CSR buffers
            # instead of indexing the sparse matrix once per token.
            start, end = X.indptr[i], X.indptr[i + 1]
            row_weights = dict(zip(X.indices[start:end].tolist(),
                                   X.data[start:end].tolist()))
            res.append({tok: row_weights.get(tfidf_vocab[tok], 0.0)
                        for tok in doc.split() if tok in tfidf_vocab})
    del X
    gc.collect()
    return res

In [40]:
# Manual smoke test of getTfidfWeights1 on a single document.
raw_docs = ["20 20 21 123"]
tmp = getTfidfWeights1(raw_docs)

----->Started 'get tfidf weights' block...
  (0, 2881256)	0.2312325554483072
  (0, 2695414)	0.44227262437367776
  (0, 2695257)	0.6890901406254859
  (0, 2692668)	0.43173484618363295
  (0, 798245)	0.29947659627090745
----->Finished 'get tfidf weights' block, time used:0.7s.


In [30]:
def sent2vec(text):
    """Return the mean embedding of the whitespace tokens of ``text``.

    Tokens missing from the global ``word2vec_model`` are skipped. When no
    token is known, a zero vector is returned. Its length is the model's
    ``vector_size`` when that attribute exists, otherwise 200. The original
    code averaged ``[0.0] * 200`` over axis 0, which gave the scalar 0.0.

    Args:
        text: Whitespace-separated string.

    Returns:
        A 1-D numpy array; NaN/inf entries are replaced via ``np.nan_to_num``.
    """
    global word2vec_model
    vectors = [word2vec_model[w] for w in text.split() if w in word2vec_model]
    if not vectors:
        return np.zeros(getattr(word2vec_model, "vector_size", 200))
    return np.nan_to_num(np.mean(vectors, axis=0))

In [31]:
def sent2vec1(text, weights):
    """Return the mean of the weight-scaled embeddings of the tokens of ``text``.

    For each whitespace token found in the global ``word2vec_model`` the
    vector is multiplied by ``weights[token]``. A token missing from
    ``weights`` contributes a zero vector. The result is the plain mean over
    those tokens; it is not normalised by the sum of weights, as before.

    When no token is known, a zero vector is returned (length
    ``word2vec_model.vector_size`` if available, otherwise 200) instead of
    the scalar 0.0 the original expression produced.

    Args:
        text: Whitespace-separated string.
        weights: Mapping token -> weight (supports ``in`` and ``[]``).

    Returns:
        A 1-D float numpy array with NaN/inf replaced via ``np.nan_to_num``.
    """
    global word2vec_model
    weighted = []
    for w in text.split():
        if w not in word2vec_model:
            continue
        vec = np.asarray(word2vec_model[w], dtype=np.float64)
        # Scale the whole vector at once; the zero vector matches this word's
        # own dimension rather than assuming 200.
        weighted.append(vec * weights[w] if w in weights else np.zeros_like(vec))
    if not weighted:
        return np.zeros(getattr(word2vec_model, "vector_size", 200))
    return np.nan_to_num(np.mean(weighted, axis=0))

In [32]:
def process_word2vec_sen_vec(df, prefix, savefile):
    """Add one distance column per function in ``similar_arr`` between the
    sentence vectors of 'query' and 'title'.

    The sentence vectors are built according to the global
    ``process_word2vec_sen_vec_mode``:
      * "avg"   - plain mean embedding (``sent2vec``);
      * "tfidf" - TF-IDF weighted embedding (``sent2vec1``).

    Args:
        df: DataFrame with 'query' and 'title' columns; modified in place and returned.
        prefix: Column-name prefix; columns are named ``<prefix>_<function __name__>``.
        savefile: Accepted but not used here.

    Returns:
        ``df`` with the distance columns added.

    Raises:
        ValueError: if the mode is neither "avg" nor "tfidf". Previously this
            surfaced later as an unbound-variable error.
    """
    global process_word2vec_sen_vec_mode
    mode = process_word2vec_sen_vec_mode
    if mode not in ("avg", "tfidf"):
        raise ValueError(
            "Unknown process_word2vec_sen_vec_mode: {!r} (expected 'avg' or 'tfidf')".format(mode))
    with Timer("cal sent vec"):
        if mode == "avg":
            q_sen_vec = list(map(sent2vec, df["query"]))
            t_sen_vec = list(map(sent2vec, df["title"]))
        else:
            with Timer("cal weights"):
                # NOTE(review): `getTfidfWeights` is not defined in any cell visible
                # in this notebook (only `getTfidfWeights1` is) — confirm where it
                # comes from before running the "tfidf" mode.
                q_sen_weights = getTfidfWeights(df["query"])
                t_sen_weights = getTfidfWeights(df["title"])
            q_sen_vec = list(map(sent2vec1, df["query"], q_sen_weights))
            t_sen_vec = list(map(sent2vec1, df["title"], t_sen_weights))
        # savefile += "{}.npz".format(process_word2vec_sen_vec_mode)
        # np.savez(savefile, q_sent_vec, t_sent_vec)
        # print("Sent vec saved to {}".format(savefile))
        for sim_func in similar_arr:
            name = sim_func.__name__
            with Timer("cal {} sim".format(name)):
                df["{}_{}".format(prefix, name)] = cal_sim(q_sen_vec, t_sen_vec, sim_func)
    return df

In [24]:
# Extract the sentence-vector similarity features for the training data, chunk by chunk.
if regen_word2vec_sen_vec_sim:
    ExtractFeature(
        train_file,
        feature_save_dir,
        train_feature_prefix,
        "senVec{}Sim".format(process_word2vec_sen_vec_mode),
        process_func=process_word2vec_sen_vec,
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=True,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['query_id', 'query_title_id', 'label'],
        drop_last_cols=['query', 'title'],
    )
# Recorded from an earlier run:
# Finished 'extract senVecavgSim feature' block, time used:4160.86s.

In [17]:
# AnalysisCSV("./stage1/output/word2vec/debug_trainWork2vec_senVecavgSim.csv.gz")

In [27]:
# AnalysisCSV("./stage1/input/test.csv")

In [36]:
# Extract the sentence-vector similarity features for the test data (online runs only).
if regen_word2vec_sen_vec_sim and not OFFLINE:
    ExtractFeature(
        test_file,
        feature_save_dir,
        test_feature_prefix,
        "senVec{}Sim".format(process_word2vec_sen_vec_mode),
        process_func=process_word2vec_sen_vec,
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=True,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['query_id', 'query_title_id'],
        drop_last_cols=['query', 'title'],
    )

### Magic特征

In [6]:
# Output directory and file-name prefixes for the "magic" features
# (rebinds the globals set by the earlier sections).
feature_save_dir = base_feature_save_dir + "magic/"
train_feature_prefix = base_prefix + "trainMagic"
test_feature_prefix = base_prefix + "testMagic"
# List what already exists in that directory.
! ls -alh ./stage1/output/magic/

total 8.0G
drwxr-xr-x 2 kesci root  4.0K Jun 22 10:24 .
drwxr-xr-x 7 kesci root  4.0K Jun 19 09:08 ..
-rw-r--r-- 1 kesci users 5.8K Jun 22 01:22 debug_trainMagic_ctrRateGlobal.csv.gz
-rw-r--r-- 1 kesci users 122K Jun 22 01:23 debug_trainMagic_ctrRateLocal.csv.gz
-rw-r--r-- 1 kesci users 5.4M Jun 22 05:25 debug_trainMagic_diffSetSim.csv.gz
-rw-r--r-- 1 kesci users  62K Jun 20 12:49 debug_trainMagic_queryFreq.csv.gz
-rw-r--r-- 1 kesci users  95K Jun 21 13:59 online_testMagic_ctrRateGlobal.csv.gz
-rw-r--r-- 1 kesci users 1.2M Jun 21 14:09 online_testMagic_ctrRateLocal.csv.gz
-rw-r--r-- 1 kesci users 931M Jun 22 09:34 online_testMagic_diffSetSimBigramsTitle.npy
-rw-r--r-- 1 kesci users 201M Jun 22 10:29 online_testMagic_diffSetSim.csv.gz
-rw-r--r-- 1 kesci users 1.2G Jun 22 10:01 online_testMagic_diffSetSimTrigramsTitle.npy
-rw-r--r-- 1 kesci users 505M Jun 22 09:13 online_testMagic_diffSetSimUnigramsTitle.npy
-rw-r--r-- 1 kesci users 602K Jun 21 13:27 online_testMagic_queryFr

#### 提取query频率

In [3]:
# Switch read by the query-frequency extraction cells.
regen_query_freq = False
# Output file for the training-set query frequencies.
train_query_freq = "{}_queryFreq".format(train_feature_prefix)
train_freq_save_file = "{}{}.csv.gz".format(feature_save_dir, train_query_freq)
# The test-set names are only defined for online runs.
if not OFFLINE:
    test_query_freq = "{}_queryFreq".format(test_feature_prefix)
    test_freq_save_file = "{}{}.csv.gz".format(feature_save_dir, test_query_freq)

In [4]:
def ExtractFreq(sourcefile, savefile, names, dtype, group_by, sort_col, feature_name):
    """Write, for every row of ``sourcefile``, the size of the row's group.

    The file is read with ``ReadCSV``. Rows are grouped by ``group_by`` and
    the non-null values of ``sort_col`` are counted per group. That count is
    written for each row to ``savefile`` (gzip CSV, no index) under the
    single column ``feature_name``.

    The counts come from ``groupby(...).transform("count")``, so they follow
    the row order of the input. The previous implementation expanded the
    counts in sorted group order, which only matches the rows when the file
    is already sorted and contiguous by ``group_by``. It also failed on an
    empty frame.

    Args:
        sourcefile: Path of the CSV to read.
        savefile: Path of the gzip CSV to write.
        names: Column names forwarded to ``ReadCSV``.
        dtype: Column dtypes forwarded to ``ReadCSV``.
        group_by: Column to group on.
        sort_col: Column whose non-null entries are counted per group.
        feature_name: Name of the output column.
    """
    df = ReadCSV(sourcefile, names=names, dtype=dtype, iterator=False)
    # One count per input row, aligned with the input row order.
    counts = df.groupby(group_by)[sort_col].transform("count")
    tmp = pd.DataFrame({
        feature_name: counts.values
    })
    tmp.to_csv(savefile, compression="gzip", index=None)
    print("Feature saved to {}".format(savefile))
    del df, counts, tmp
    gc.collect()

In [5]:
# Query frequency for the training data.
if regen_query_freq:
    ExtractFreq(
        train_file, train_freq_save_file,
        ORI_TRAIN_NAMES, ORI_TRAIN_DTYPE,
        "query_id", "query_title_id", "query_freq",
    )

In [6]:
# Query frequency for the test data (online runs only).
if not OFFLINE and regen_query_freq:
    ExtractFreq(
        test_file, test_freq_save_file,
        ORI_TEST_NAMES, ORI_TEST_DTYPE,
        "query_id", "query_title_id", "query_freq",
    )

In [24]:
# AnalysisCSV("./stage1/output/magic/debug_trainMagic_titleFreq.csv.gz")

In [25]:
# AnalysisCSV("./stage1/output/magic/debug_trainMagic_queryFreq.csv.gz")

#### 提取差集特征

In [3]:
from fuzzywuzzy import fuzz
# Similarity functions used by the diff-set features. This rebinds the global
# `similar_arr` that the word2vec section pointed at distance functions.
similar_arr = [
    dist_utils.dice_ratio,
    # dist_utils.jaccard_ratio,
    dist_utils.edit_seq_ratio,
    dist_utils.edit_set_ratio,
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.token_sort_ratio
]
# Switch read by the diff-set extraction cells.
regen_diff_sim = True

In [4]:
def findDiff(arr, ngram):
    """For each item of ``arr`` return the tokens no differing item shares.

    Every item is turned into ``set(ngram(item))``. From each set, every
    other set that is not equal to it is subtracted. Items whose token sets
    are equal are therefore not subtracted from one another.

    Args:
        arr: Sequence of items accepted by ``ngram``.
        ngram: Callable returning an iterable of hashable tokens.

    Returns:
        A list with one list of remaining tokens per input item, in set
        iteration order.
    """
    token_sets = [set(ngram(item)) for item in arr]
    unique_tokens = []
    for current in token_sets:
        remaining = current
        for other in token_sets:
            if other != current:
                remaining = remaining.difference(other)
        unique_tokens.append(list(remaining))
    return unique_tokens

In [5]:
def GetDiffSet(query_id, title, ngram):
    """Apply ``findDiff`` to each run of consecutive rows with the same query id.

    ``query_id`` and ``title`` are walked in parallel by position, so a
    pandas Series whose index does not start at 0 works too. The previous
    ``series[i]`` lookups were label-based.

    Args:
        query_id: Iterable of query ids, one per row.
        title: Iterable of titles, parallel to ``query_id``.
        ngram: Tokenizer forwarded to ``findDiff``.

    Returns:
        A flat list with one ``findDiff`` result per input row, in input order.
    """
    result = []
    group = []
    previous_id = None
    for position, (current_id, current_title) in enumerate(zip(query_id, title)):
        # A change of query id closes the current run of titles.
        if position > 0 and not (current_id == previous_id):
            result.extend(findDiff(group, ngram))
            group = []
        group.append(current_title)
        previous_id = current_id
    # Flush the last run. With empty input this calls findDiff([]), as before.
    result.extend(findDiff(group, ngram))
    return result

In [6]:
def run_diff_set_sim(df, ngram, prefix, savefile, sign):
    """Add similarity features between each query and its title's diff-set.

    The diff-set of a title is what ``GetDiffSet`` returns for it. The
    diff-sets are saved to ``savefile + sign + "Title.npy"``. Then every
    function in the global ``similar_arr`` is applied to the
    (query n-grams, diff-set) pairs.

    Args:
        df: DataFrame with 'query_id', 'query' and 'title' columns; modified
            in place and returned.
        ngram: Tokenizer applied to queries and forwarded to ``GetDiffSet``.
        prefix: Column-name prefix; columns are named
            ``<prefix>_diffSetSim<function __name__>``.
        savefile: Path prefix for the saved diff-set array.
        sign: Tag inserted into the saved file name.

    Returns:
        ``df`` with the similarity columns added.
    """
    with Timer("cal diff set"):
        t_list = GetDiffSet(df["query_id"], df["title"], ngram)
        q_list = df['query'].apply(ngram)
        # Store the processed title token lists.
        t_savefile = savefile + sign +"Title.npy"
        # The rows have different lengths. Recent NumPy refuses to build an
        # array from such a ragged list implicitly, so fill a 1-D object array.
        t_array = np.empty(len(t_list), dtype=object)
        for row, tokens in enumerate(t_list):
            t_array[row] = tokens
        np.save(t_savefile, t_array)
        del t_array
        print("File saved to {}".format(t_savefile))
    # Similarity features.
    with Timer("extract similarity"):
        for sim_func in similar_arr:
            name = sim_func.__name__
            with Timer("cal {} sim".format(name)):
                df["{}_diffSetSim{}".format(prefix, name)] = cal_sim(q_list, t_list, sim_func)
    del q_list, t_list
    gc.collect()
    return df

def process_diff_set_sim(df, prefix, savefile):
    """Add diff-set similarity features for unigrams, bigrams and trigrams.

    Runs ``run_diff_set_sim`` once per n-gram size, in that order, passing
    the matching column tag and saved-file tag.
    """
    variants = (
        (ngram_utils.unigrams, 'unigrams', "Unigrams"),
        (ngram_utils.bigrams, 'bigrams', "Bigrams"),
        (ngram_utils.trigrams, 'trigrams', "Trigrams"),
    )
    for tokenizer, tag, sign in variants:
        df = run_diff_set_sim(df, tokenizer, '%s_%s' % (prefix, tag), savefile, sign)
    return df

In [7]:
# Extract the diff-set similarity features for the training data.
if regen_diff_sim:
    ExtractFeature(
        train_file,
        feature_save_dir,
        train_feature_prefix,
        "diffSetSim",
        process_func=process_diff_set_sim,
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        chunk_size=CHUNK_SIZE,
        drop_first_cols=['label', 'query_title_id'],
        drop_last_cols=['query_id', 'query', 'title'],
    )

----->Started 'extract diffSetSim feature' block...
Mem. usage decreased to 190.73 Mb (0.0% reduction)
----->Started 'cal diff set' block...
File saved to ./stage1/output/magic/online_trainMagic_diffSetSimUnigramsTitle.npy
----->Finished 'cal diff set' block, time used:1102.2s.
----->Started 'extract similarity' block...
----->Started 'cal dice_ratio sim' block...
----->Finished 'cal dice_ratio sim' block, time used:23.49s.
----->Started 'cal edit_seq_ratio sim' block...
----->Finished 'cal edit_seq_ratio sim' block, time used:44.68s.
----->Started 'cal edit_set_ratio sim' block...
----->Finished 'cal edit_set_ratio sim' block, time used:57.86s.
----->Started 'cal ratio sim' block...
----->Finished 'cal ratio sim' block, time used:95.7s.
----->Started 'cal partial_ratio sim' block...
----->Finished 'cal partial_ratio sim' block, time used:496.68s.
----->Started 'cal token_sort_ratio sim' block...
----->Finished 'cal token_sort_ratio sim' block, time used:339.07s.
----->Finished 'extrac

In [19]:
# tmp = np.load("./stage1/output/magic/debug_trainMagic_diffSetSimUnigrams.npy", allow_pickle=True)
# print(tmp.shape, tmp[:2])

(500000,) [list(['1017', '663', '87', '59', '18225', '37136', '3567', '33'])
 list(['14514', '9669', '4126', '15457', '799', '12875', '147', '660'])]


In [24]:
# AnalysisCSV(train_file, print_rows=12)

In [80]:
# AnalysisCSV("./stage1/output/magic/debug_trainMagic_diffSetSim1.csv.gz")

In [None]:
# Extract the diff-set similarity features for the test data.
# Guarded with `not OFFLINE` like every other test-set cell in this notebook,
# so an offline (debug) run does not process the full test file.
if regen_diff_sim and not OFFLINE:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, "diffSetSim", 
        process_func=process_diff_set_sim, 
        names=ORI_TEST_NAMES, dtype=ORI_TEST_DTYPE, process_chunkly=False, chunk_size=CHUNK_SIZE, 
        drop_first_cols=['query_title_id'], drop_last_cols=['query_id', 'query', 'title'])

----->Started 'extract diffSetSim feature' block...
Mem. usage decreased to 95.37 Mb (0.0% reduction)
----->Started 'cal diff set' block...
File saved to ./stage1/output/magic/online_testMagic_diffSetSimUnigramsTitle.npy
----->Finished 'cal diff set' block, time used:546.98s.
----->Started 'extract similarity' block...
----->Started 'cal dice_ratio sim' block...
----->Finished 'cal dice_ratio sim' block, time used:11.75s.
----->Started 'cal edit_seq_ratio sim' block...
----->Finished 'cal edit_seq_ratio sim' block, time used:22.84s.
----->Started 'cal edit_set_ratio sim' block...
----->Finished 'cal edit_set_ratio sim' block, time used:29.95s.
----->Started 'cal ratio sim' block...
----->Finished 'cal ratio sim' block, time used:53.76s.
----->Started 'cal partial_ratio sim' block...
----->Finished 'cal partial_ratio sim' block, time used:274.85s.
----->Started 'cal token_sort_ratio sim' block...
----->Finished 'cal token_sort_ratio sim' block, time used:176.2s.
----->Finished 'extract 

In [1]:
# AnalysisCSV("./stage1/output/magic/online_trainMagic_diffSetSim.csv.gz")

#### 点击率特征

In [37]:
# Switch read by the click-through-rate extraction cells.
regen_ctr_rate = True
# Shared by ExtractLocalCTR: group size -> [number of groups, summed labels].
# Filled on the first call and reused by later calls while it stays non-empty.
stat = {}

In [42]:
def ExtractGlobalCTR(df, prefix, savefile, test_rows=5000000):
    """Add the global click rate as a constant column.

    The rate is ``count(label == 1) / (count(label == 0) + count(label == 1))``.
    A label value that does not occur counts as 0 instead of raising
    KeyError. If neither occurs the rate is 0.0.

    When ``OFFLINE`` is false the same rate is also written as a test-set
    feature file of ``test_rows`` rows. The column name and path are derived
    by replacing "train" with "test", and ".csv.gz" is appended to the path.

    Args:
        df: DataFrame with a 'label' column; modified in place and returned.
        prefix: Column-name prefix; the column is ``<prefix>_globalCTR``.
        savefile: Path prefix of the training feature file.
        test_rows: Number of rows written to the test feature file. Defaults
            to the previously hard-coded 5000000.

    Returns:
        ``df`` with the constant click-rate column added.
    """
    label_counts = df["label"].value_counts()
    positives = label_counts.get(1, 0)
    total = label_counts.get(0, 0) + positives
    global_click_rate = positives / total if total else 0.0
    feature_name = "{}_globalCTR".format(prefix)
    # A scalar assignment broadcasts the rate to every row.
    df[feature_name] = global_click_rate
    if not OFFLINE:
        test_feature_name = feature_name.replace("train", "test")
        test_savefile = savefile.replace("train", "test")+".csv.gz"
        pd.DataFrame({
            test_feature_name: [global_click_rate] * test_rows
        }).to_csv(test_savefile, index=None, compression="gzip")
        print("Feature saved to {}".format(test_savefile))
    del label_counts
    gc.collect()
    return df

In [43]:
# Global click-rate feature for the training data.
if regen_ctr_rate:
    ExtractFeature(
        train_file,
        feature_save_dir,
        train_feature_prefix,
        "ctrRateGlobal",
        ExtractGlobalCTR,
        names=ORI_TRAIN_NAMES,
        dtype=None,
        process_chunkly=False,
        drop_first_cols=['query_title_id', 'query', 'title'],
        drop_last_cols=['query_id', 'label'],
    )

----->Started 'extract ctrRateGlobal feature' block...
Mem. usage decreased to  2.38 Mb (68.7% reduction)
saved ctrRateGlobal feature to ./stage1/output/magic/debug_trainMagic_ctrRateGlobal.csv.gz
----->Finished 'extract ctrRateGlobal feature' block, time used:4.79s.


In [45]:
def ExtractLocalCTR(df, prefix, savefile):
    """Add the click rate of all queries that have as many rows as this row's query.

    The global dict ``stat`` maps a group size to
    ``[number of query groups of that size, summed labels of those groups]``.
    It is filled from ``df`` only when it is empty, so a later call (e.g. on
    test data without labels) reuses the buckets of the first call. Each row
    gets ``clicks / (size * groups)`` for its group size.

    Changes from the previous implementation:
      * group sizes come from ``groupby(...).transform``, so values follow
        the rows of ``df`` rather than the sorted group order;
      * ``groupby("query_id").size()`` is taken with the default
        ``as_index=True``, so it is a Series on every pandas version;
      * a group size missing from ``stat`` gives NaN and a printed note
        instead of a KeyError.

    Args:
        df: DataFrame with 'query_id' (and 'label' when ``stat`` is empty);
            modified in place and returned.
        prefix: Column-name prefix; the column is ``<prefix>_localCTR``.
        savefile: Accepted but not used here.

    Returns:
        ``df`` with the local click-rate column added.
    """
    global stat
    # Size of each row's query group, one value per row in row order.
    group_size = df.groupby("query_id")["query_id"].transform("count")
    if not stat:
        by_query = df.groupby("query_id")
        sizes = by_query.size()
        clicks = by_query["label"].sum()
        for item_len, clicked in zip(sizes.values, clicks.values):
            bucket = stat.setdefault(int(item_len), [0, 0])
            bucket[0] += 1
            bucket[1] += int(clicked)
        del sizes, clicks, by_query
        print(stat)
    rate_by_size = {size: clicked / (size * groups)
                    for size, (groups, clicked) in stat.items()}
    local_ctr = group_size.map(rate_by_size)
    unseen = int(local_ctr.isna().sum())
    if unseen:
        print("{} rows belong to a group size missing from stat; localCTR left as NaN".format(unseen))
    df["{}_localCTR".format(prefix)] = local_ctr.values
    del group_size, local_ctr
    gc.collect()
    return df

In [46]:
# Local (per group size) click-rate feature for the training data.
if regen_ctr_rate:
    ExtractFeature(
        train_file,
        feature_save_dir,
        train_feature_prefix,
        "ctrRateLocal",
        ExtractLocalCTR,
        names=ORI_TRAIN_NAMES,
        dtype=None,
        process_chunkly=False,
        drop_first_cols=['query_title_id', 'query', 'title'],
        drop_last_cols=['query_id', 'label'],
    )

----->Started 'extract ctrRateLocal feature' block...
Mem. usage decreased to  2.38 Mb (68.7% reduction)
{11: [1834, 4011], 17: [658, 1727], 13: [1236, 2857], 3: [33728, 44915], 20: [4540, 12425], 4: [5026, 7386], 5: [4753, 7609], 7: [4878, 8538], 6: [4425, 7450], 8: [4544, 8524], 9: [2336, 4671], 12: [1523, 3377], 14: [1023, 2473], 19: [517, 1450], 16: [795, 2057], 15: [886, 2202], 10: [1983, 4125], 18: [577, 1578]}
saved ctrRateLocal feature to ./stage1/output/magic/debug_trainMagic_ctrRateLocal.csv.gz
----->Finished 'extract ctrRateLocal feature' block, time used:5.63s.


In [47]:
# Local click-rate feature for the test data (online runs only).
# The function name was misspelled as `ExtractLocalCRT`, which raises NameError.
# ExtractLocalCTR reuses the `stat` buckets filled by the training run above.
if not OFFLINE and regen_ctr_rate:
    ExtractFeature(test_file, feature_save_dir, test_feature_prefix, "ctrRateLocal", 
    ExtractLocalCTR, names=ORI_TEST_NAMES,
    dtype=None, process_chunkly=False, 
    drop_first_cols=['query_title_id', 'query', 'title'], drop_last_cols=['query_id'])