# Baseline

本文档使用 TF-IDF + LR（逻辑回归），在随机采样的 1000 万条数据上建立 Baseline 模型。

In [14]:
! ls -al

total 64
drwxr-xr-x  5 niudong  staff    160  5 29 15:51 .
drwxr-xr-x  9 niudong  staff    288  5 29 15:20 ..
-rw-r--r--@ 1 niudong  staff   6148  5 29 15:51 .DS_Store
drwxr-xr-x  3 niudong  staff     96  5 29 14:31 .ipynb_checkpoints
-rw-r--r--@ 1 niudong  staff  21989  5 29 15:51 baseline.ipynb


## 导入各种库

In [15]:
import pandas as pd
import numpy as np
import time
import random
import gc
import sys

In [16]:
# Whether to run in offline (local) debug mode; the dataset cell below picks its paths from this flag.
debug = True

In [17]:
# Dataset parameters
if debug:
    # Local run: list the data directory and point at the local training CSV.
    !ls -lh ../data
    data_dir = "../data/"
    train_data_file = data_dir + "train_data.csv"
else:
    # Non-debug run: list the competition input directory and point at its train/test CSVs.
    # NOTE: test_data_file is only assigned in this branch.
    !ls -lh /home/kesci/input/bytedance/first-round/
    data_dir = "/home/kesci/input/bytedance/first-round/"
    train_data_file = data_dir + "train.csv"
    test_data_file = data_dir + "test.csv"
# Column names passed as `names` when reading the CSV files
ori_train_names = ["query_id", "query", "query_title_id", "title", "label"]
ori_test_names = ["query_id", "query", "query_title_id", "title"]
train_names = ["feature", "label"]
test_names = ["query_id", "query_title_id", "feature"]
submit_names = ["query_id", "query_title_id", "label"]

total 9392
-rw-r--r--@ 1 niudong  staff   1.7M  5 29 14:28 train_data.csv
-rw-r--r--@ 1 niudong  staff   1.6M  5 29 15:41 train_processed.csv
-rw-r--r--@ 1 niudong  staff   282K  5 13 12:17 大数据挑战赛_sample.txt


## 辅助函数

In [18]:
# Context manager that reports how long a code block took to run
class Timer(object):
    """
    Print start/finish messages around a ``with`` block and report its duration.

    Parameters
    ----------
    block_name : str
        Label inserted into the printed messages.
    prefix : str
        Text printed in front of every message.

    Attributes
    ----------
    time_start : float
        ``time.time()`` value captured when the block was entered.
    elapsed_time : float or None
        Seconds spent inside the block, rounded to two decimals; ``None``
        until the block has finished.
    """
    def __init__(self, block_name, prefix="----->"):
        self.block_name = block_name
        self.prefix = prefix
        self.time_start = None
        self.elapsed_time = None

    def __enter__(self):
        print(self.prefix+"Started '"+self.block_name+"' block...")
        self.time_start = time.time()
        # Return the instance so ``with Timer(...) as t`` gives access to the timing.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Keep the measurement on the instance instead of discarding it after printing.
        self.elapsed_time = round(time.time() - self.time_start, 2)
        print(self.prefix+"Finished '"+self.block_name+"' block, time used:", str(self.elapsed_time)+"s.")
        # Implicitly returns None: exceptions raised inside the block still propagate.

In [19]:
# Read a header-less CSV file with explicit column names
def ReadCSV(filename, names, sep=",", iterator=True):
    """
    Thin wrapper around ``pandas.read_csv``.

    ``names`` supplies the column names, ``sep`` the field delimiter, and
    ``iterator`` is forwarded unchanged: with the default ``True`` the call
    returns a reader object that serves rows through ``get_chunk``; with
    ``False`` it returns the parsed DataFrame.
    """
    # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#pandas.read_csv
    read_options = dict(names=names, sep=sep, iterator=iterator)
    return pd.read_csv(filename, **read_options)

In [20]:
# Read a CSV file chunk by chunk and apply a handler to every chunk
def ProcessChunk(filename, func, names, chunk_size=5000000):
    """
    Feed ``filename`` to ``func`` in chunks of at most ``chunk_size`` rows.

    Parameters
    ----------
    filename : path of the CSV file, opened through ``ReadCSV``.
    func : callable invoked once per chunk with the chunk DataFrame.
    names : column names handed to ``ReadCSV``.
    chunk_size : maximum number of rows per chunk.

    Returns ``None`` once the file is exhausted. Any exception raised by
    ``func`` propagates to the caller.
    """
    reader = ReadCSV(filename, names)
    try:
        while True:
            print("Reading chunk...")
            # Only the read is guarded: a StopIteration raised inside ``func``
            # must not be mistaken for the end of the file.
            try:
                chunk = reader.get_chunk(chunk_size)
            except StopIteration:
                print("Finished process.")
                return
            func(chunk)
    finally:
        # Release the underlying file handle even when ``func`` aborts early.
        reader.close()
# def handle(x):
#     print(x)
#     print()
# ProcessChunk(train_data_file, handle, names=ori_train_names, chunk_size=5)

In [21]:
# Reservoir sampling over a CSV file. Very slow: rows are read one at a time.
def ReservoirSample(filename, count, names=ori_train_names):
    """
    Draw ``count`` rows uniformly at random from ``filename`` in a single pass.

    The first ``count`` rows fill the reservoir. Each later row, the i-th row
    seen overall, replaces a uniformly chosen reservoir slot with probability
    ``count / i``.

    Parameters
    ----------
    filename : path of the CSV file, opened through ``ReadCSV``.
    count : number of rows to keep.
    names : column names handed to ``ReadCSV``.

    Returns
    -------
    DataFrame holding the sampled rows. If nothing can be read, an empty
    DataFrame with columns ``names`` is returned.
    """
    print("Reservoir sample...")
    reader = ReadCSV(filename, names)
    try:
        try:
            # The first ``count`` rows are always selected.
            res = reader.get_chunk(count)
        except StopIteration:
            print("Count exceeds the file size.")
            # Nothing was read, so hand back an empty frame instead of an unbound name.
            return pd.DataFrame(columns=names)
        seen = len(res)
        while True:
            try:
                datapoint = reader.get_chunk(1).iloc[0]
            except StopIteration:
                print("Finished sample.")
                return res
            seen += 1
            # random.randint includes BOTH endpoints (unlike numpy.random.randint),
            # so the upper bound is the number of rows seen so far.
            slot = random.randint(1, seen)
            if slot <= count:
                res.iloc[slot - 1] = datapoint
    finally:
        # Release the underlying file handle.
        reader.close()
# tmp = ReservoirSample(train_data_file, 5)
# print(tmp)

In [22]:
# Randomly sample a `rate` share of the rows of every chunk
def RandomSample(filename, rate, chunk_size=1000000, random_state=None, names=ori_train_names):
    """
    Read ``filename`` in chunks and keep a random ``rate`` share of each chunk.

    Parameters
    ----------
    filename : path of the CSV file, opened through ``ReadCSV``.
    rate : fraction of every chunk to keep (e.g. ``0.1``).
    chunk_size : maximum number of rows read per chunk.
    random_state : forwarded to ``DataFrame.sample`` for every chunk.
    names : column names handed to ``ReadCSV``.

    Returns
    -------
    DataFrame with the sampled rows of all chunks and a fresh 0..n-1 index.
    If no chunk could be read, an empty DataFrame with columns ``names``.

    Notes
    -----
    The sample size is derived from the number of rows actually present in
    each chunk. A full chunk yields ``int(chunk_size * rate)`` rows as before.
    A shorter final chunk yields ``int(len(chunk) * rate)`` rows, so it can no
    longer request more rows than it holds, which made ``sample`` raise.
    """
    print("Random sample...")
    reader = ReadCSV(filename, names)
    sampled = []
    while True:
        try:
            chunk = reader.get_chunk(chunk_size)
        except StopIteration:
            print("Finished sample.")
            break
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
        sampled.append(chunk.sample(
            n=int(len(chunk) * rate),
            random_state=random_state))
    if not sampled:
        # pd.concat refuses an empty list, so return an empty frame with the right columns.
        return pd.DataFrame(columns=names)
    return pd.concat(sampled, ignore_index=True)
# tmp = RandomSample(train_data_file, .5, chunk_size=5)
# print(tmp)

In [27]:
# Recombine the raw columns into a single text feature and save the result
def ProcessFeatures(filename, train=True, names=ori_train_names, datadir=data_dir):
    """
    Build the ``feature`` text column and write the processed CSV.

    The feature is the second column, the literal separator ``" s "`` and the
    fourth column joined together (columns are addressed by position).

    Parameters
    ----------
    filename : path of the raw CSV file, opened through ``ReadCSV``.
    train : when true, write ``feature,label`` rows to ``train_processed.csv``;
        otherwise write ``query_id,query_title_id,feature`` rows to
        ``test_processed.csv``.
    names : column names handed to ``ReadCSV``; must contain ``label`` when
        ``train`` is true.
    datadir : kept for interface compatibility; this function does not use it.
        Output is written to the current working directory.

    Returns ``None``. The output files have no header row and no index column.
    """
    reader = ReadCSV(filename, names)
    chunk_size = 10000000
    print("Start processing...")
    chunks = []
    while True:
        try:
            chunks.append(reader.get_chunk(chunk_size))
        except StopIteration:
            print("Finished process.")
            break
    # pd.concat refuses an empty list, so fall back to an empty frame.
    if chunks:
        df = pd.concat(chunks, ignore_index=True)
    else:
        df = pd.DataFrame(columns=names)
    # Vectorised, position-based concatenation. astype(str) keeps the join
    # working when a column was parsed as numbers rather than text.
    feature = df.iloc[:, 1].astype(str) + " s " + df.iloc[:, 3].astype(str)
    if train:
        fname = "train_processed.csv"
        processed = pd.DataFrame({
            "feature": feature,
            "label": df.label})
    else:
        fname = "test_processed.csv"
        processed = pd.DataFrame({
            "query_id": df.iloc[:, 0],
            "query_title_id": df.iloc[:, 2],
            "feature": feature})
    processed.to_csv(fname, header=False, index=False)

    # Drop the large intermediates before asking the collector to run.
    del chunks, df, feature, processed
    gc.collect()

    print("Saved to the file.")

# with Timer("Process feature"):
#     ProcessFeatures(train_data_file)
#     # 线上处理test数据时候需要调用
#     if not debug:
#         ProcessFeatures(test_data_file, train=False, names=ori_test_names)

----->Started 'Process feature' block...
Start processing...
Finished process.
Saved to the file.
----->Finished 'Process feature' block, time used: 0.75s.


## 数据预处理

In [24]:
def pr(o):
    """Print the first five items of ``o``, then stop execution by raising ``SystemExit(0)``."""
    head = o[:5]
    print(head)
    raise SystemExit(0)

In [28]:
# Names of the processed CSV files; ProcessFeatures writes files with these names
# into the current working directory.
train_file = "train_processed.csv"
test_file = "test_processed.csv"

In [29]:
# Sanity-check the processed training data. `pr` prints the first rows of the
# first chunk and then raises SystemExit, so only one chunk is ever read.
ProcessChunk(train_file, pr, names=train_names, chunk_size=500)

Reading chunk...
                                             feature  label
0  1427 5661 29788 1427 387 2299 372 22 1586 1025...      1
1  1427 5661 29788 1427 387 2299 372 22 1586 1025...      0
2  1427 5661 29788 1427 387 2299 372 22 1586 1025...      1
3  1427 5661 29788 1427 361 22 1374 279 1196 27 7...      0
4  1427 5661 29788 1427 361 22 1374 279 1196 27 7...      1


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [30]:
# Sanity-check the processed test data (skipped in debug mode).
if not debug:
    ProcessChunk(test_file, pr, names=test_names, chunk_size=500)

In [31]:
# Randomly sample the training data: RandomSample is asked for a 0.1 share of
# every chunk (the original note targets about 10 million rows).
with Timer("Random sample"):
    # Small chunks for the debug run, large chunks otherwise.
    chunk_size = 1000 if debug else 5000000
    train_random_data = RandomSample(
        train_file,
        .1,
        names=train_names,
        chunk_size=chunk_size,
    )
    train_data_num = len(train_random_data)

----->Started 'Random sample' block...
Random sample...
Finished sample.
----->Finished 'Random sample' block, time used: 0.12s.


In [32]:
# Load the entire processed test set in one go (non-debug runs only).
if not debug:
    test_data = ReadCSV(test_file, names=test_names, iterator=False)
    test_data_num = len(test_data)

## 模型

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
from sklearn.naive_bayes import GaussianNB

In [34]:
def tokenizer(text):
    """
    Split ``text`` into whitespace-separated tokens.

    Runs of whitespace count as one separator, and leading or trailing
    whitespace is ignored, so no empty-string tokens are produced. An empty
    string yields an empty list.
    """
    return text.split()

In [35]:
# Shared TF-IDF vectorizer: word-level unigrams produced by the custom `tokenizer`.
tfidf_model = TfidfVectorizer(
    tokenizer=tokenizer, 
    analyzer="word",
    ngram_range=(1,1))

In [36]:
def getTfidf(docs, train=True):
    """
    Vectorise ``docs`` with the shared ``tfidf_model``.

    With ``train`` true the vectorizer is fitted on ``docs`` and the
    transformed matrix is returned. Otherwise ``docs`` is only transformed
    with the vectorizer as it currently is.
    """
    vectorize = tfidf_model.fit_transform if train else tfidf_model.transform
    return vectorize(docs)

In [37]:
# Build the TF-IDF matrix X. In debug mode only the sampled training texts are
# used; otherwise the vectorizer is fitted on the training texts followed by the
# test texts, so the first train_data_num rows of X belong to the training set.
with Timer("Process tfidf"):
    if debug:
        X = getTfidf(train_random_data.feature)
    else:
        X = getTfidf(pd.concat(
                    [train_random_data.feature, 
                    test_data.feature], ignore_index=True))
# Labels of the sampled training rows
train_labels = train_random_data.label.to_list()

----->Started 'Process tfidf' block...
----->Finished 'Process tfidf' block, time used: 0.07s.


In [38]:
# Peek at the first row of the TF-IDF matrix and at the first five labels.
print(X[0])
print(train_labels[:5])

  (0, 1034)	0.613449250305667
  (0, 7040)	0.5819970194722318
  (0, 10465)	0.038785372901981974
  (0, 5721)	0.264114537862141
  (0, 7837)	0.2529566814048837
  (0, 1888)	0.07957795061152707
  (0, 6824)	0.2798406532788586
  (0, 3800)	0.2372305659881661
  (0, 463)	0.09375407621041987
[0, 0, 1, 0, 0]


In [39]:
def lrTrain(train=True):
    """
    Fit a logistic-regression model on the TF-IDF features.

    Reads the notebook globals ``X``, ``train_data_num`` and ``train_labels``.
    In prediction mode it also reads ``test_data``.

    Parameters
    ----------
    train : bool
        True: use the first 70% of the training rows for fitting, the
        remaining 30% for validation, and print the validation AUC.
        False: fit on all training rows, score the rows of ``X`` that follow
        the training rows, and write ``submit.csv`` (no header, no index)
        with the columns query_id, query_title_id, predicted probability.

    Returns ``None``.
    """
    print("Prepare features...")
    train_features = X[:train_data_num]
    test_features = X[train_data_num:]

    if train:
        # Positional 70/30 split; the rows are not shuffled here.
        split_train = int(train_data_num*0.7)
        train_X, test_X = train_features[:split_train], train_features[split_train:]
        train_y, test_y = train_labels[:split_train], train_labels[split_train:]
    else:
        train_X, test_X = train_features, test_features
        train_y = train_labels

    model = LogisticRegression(C=5, solver="liblinear")
    print("Fitting model...")
    # Probability of the positive class (second column of predict_proba).
    pred = model.fit(train_X, train_y).predict_proba(test_X)[:, 1]

    if train:
        fpr, tpr, thresholds = metrics.roc_curve(test_y, pred, pos_label=1)
        res = metrics.auc(fpr, tpr)
        print("AUC:", res)
    else:
        # Build the submission column-wise so each column keeps its own dtype.
        # Going through one numpy array of mixed rows turned the ids into floats.
        res = pd.DataFrame({
            "query_id": test_data.iloc[:, 0].to_numpy(),
            "query_title_id": test_data.iloc[:, 1].to_numpy(),
            "label": pred,
        })
        print(res.head(10))
        res.to_csv("submit.csv", index=False, header=False)

    # Drop the large intermediates explicitly, then collect. Deleting entries
    # from locals() does not release local variables.
    del train_features, test_features, train_X, test_X, train_y, model, pred, res
    gc.collect()

In [40]:
# Run the training / validation phase
with Timer("lr train"):
    lrTrain(True)

----->Started 'lr train' block...
Prepare features...
Fitting model...
AUC: 0.5241347616632861
----->Finished 'lr train' block, time used: 0.08s.


## 提交结果

In [41]:
# 第一次执行需要
# !wget -nv -O kesci_submit https://www.heywhale.com/kesci_submit&&chmod +x kesci_submit
!./kesci_submit -token 490475a1ae106f67 -file submit.csv

/bin/sh: ./kesci_submit: No such file or directory
