# Baseline

本文档是使用tfIdf+lr并随机采样1000万条数据建立的Baseline模型。

In [6]:
! ls -al
! mkdir submits
! mv kesci_submit submits/

total 26837936
drwxr-xr-x 6 kesci root        4096 Jun  5 11:33 .
drwxrwxrwx 8 root  root        4096 Jun  5 11:29 ..
-rw-r--r-- 1 kesci users 1143995991 Jun  2 16:03 fastText_classify.bin
-rw-r--r-- 1 kesci users 9027388177 Jun  2 12:56 labeled_content
drwx------ 2 root  root       16384 May 26 04:29 lost+found
drwxr-xr-x 2 kesci users       4096 Jun  1 12:52 __pycache__
-rw-r--r-- 1 root  root          38 May 26 04:30 .sidecarDownloadOnce
drwxr-xr-x 2 kesci users       4096 Jun  2 07:41 stats
drwxr-xr-x 2 kesci users       4096 Jun  5 11:33 submits
-rw-r--r-- 1 kesci users  455810675 May 28 13:36 test_processed.csv
-rw-r--r-- 1 kesci users 8126569615 Jun  2 13:12 train.txt
-rw-r--r-- 1 kesci users 7827388177 Jun  4 15:31 unlabeled_content
-rw-r--r-- 1 kesci root        2425 Jun  1 12:51 utils.py
-rw-r--r-- 1 kesci users  900818562 Jun  2 13:12 valid.txt
mkdir: cannot create directory ‘submits’: File exists
mv: cannot stat 'kesci_submit': No such file or directory


## 导入各种库

In [2]:
import pandas as pd
import numpy as np
import time
import random
import gc
import sys

In [3]:
# 是否为线下调试
debug = False

In [4]:
# 数据集参数
if debug:
    !ls -lh ../data
    data_dir = "../data/"
    train_data_file = data_dir + "train_data.csv"
else:
    !ls -lh /home/kesci/input/bytedance/first-round/
    data_dir = "/home/kesci/input/bytedance/first-round/"
    train_data_file = data_dir + "train.csv"
    test_data_file = data_dir + "test.csv"
# csv的header
ori_train_names = ["query_id", "query", "query_title_id", "title", "label"]
ori_test_names = ["query_id", "query", "query_title_id", "title"]
train_names = ["feature", "label"]
test_names = ["query_id", "query_title_id", "feature"]
submit_names = ["query_id", "query_title_id", "label"]

total 9.0G
-rw-r--r-- 1 kesci 1000 426M May 22 04:05 test.csv
-rw-r--r-- 1 kesci 1000 8.6G May 16 20:15 train.csv


## 辅助函数

In [5]:
# 计算执行某个函数需要的时间
class Timer(object):
    """
    Record the consumed time when ran a code block.
    """
    def __init__(self, block_name, prefix="----->"):
        self.block_name = block_name
        self.prefix = prefix

    def __enter__(self):
        print(self.prefix+"Started '"+self.block_name+"' block...")
        self.time_start = time.time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed_time = round(time.time() - self.time_start, 2)
        print(self.prefix+"Finished '"+self.block_name+"' block, time used:", str(elapsed_time)+"s.")

In [6]:
# 读取CSV文件
def ReadCSV(filename, names, sep=",", iterator=True):
    # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#pandas.read_csv
    return pd.read_csv(
        filename, 
        names=names,
        sep=sep,
        iterator=iterator
    )

In [11]:
# 批量读入数据，并apply处理函数
def ProcessChunk(filename, func, names, chunk_size=5000000):
    reader = ReadCSV(filename, names)
    while True:
        try:
            print("Reading chunk...")
            func(reader.get_chunk(chunk_size))
        except StopIteration:
            print("Finished process.")
            return
# def handle(x):
#     print(x)
#     print()
# ProcessChunk(train_data_file, handle, names=ori_train_names, chunk_size=5)

In [12]:
# 蓄水池读文件, 本方法巨慢无比!
def ReservoirSample(filename, count, names=ori_train_names):
    print("Reservoir sample...")
    reader = ReadCSV(filename, names)
    try:
        # 对于前面count个元素完全选择
        res = reader.get_chunk(count)
    except StopIteration:
        print("Count exceeds the file size.")
        return res
    i = count
    while True:
        try:
            i += 1
            # https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.randint.html
            tmp = random.randint(1, i+1)
            datapoint = reader.get_chunk(1).iloc[0]
            if tmp <= count:
                res.iloc[tmp - 1] = datapoint
        except StopIteration:
            print("Finished sample.")
            return res
# tmp = ReservoirSample(train_data_file, 5)
# print(tmp)

In [13]:
# 按照rate比例从每个chunk中随机采样样本
def RandomSample(filename, rate, chunk_size=1000000, random_state=None, names=ori_train_names):
    print("Random sample...")
    reader = ReadCSV(filename, names)
    chunks = []
    while True:
        try:
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
            chunks.append(reader.get_chunk(chunk_size).sample(
                n=int(chunk_size*rate), 
                random_state=random_state)
            )
        except StopIteration:
            print("Finished sample.")
            break
    # 删除无用引用, https://blog.csdn.net/jiangjiang_jian/article/details/79140742
    # for x in locals().keys():
    #     if x != "res":
    #         # 会被放入内存池？
    #         del locals()[x]
    # gc.collect()
    return pd.concat(chunks, ignore_index=True)
# tmp = RandomSample(train_data_file, .5, chunk_size=5)
# print(tmp)

In [14]:
# 重新组合特征
def ProcessFeatures(filename, train=True, names=ori_train_names, datadir=data_dir):
    reader = ReadCSV(filename, names)
    f = lambda x: x[1] + " s " + x[3]
    count, chunk_size = 1, 10000000
    print("Start processing...")
    chunks = []
    while True:
        try:
            chunks.append(reader.get_chunk(chunk_size))
        except StopIteration:
            print("Finished process.")
            break
    df = pd.concat(chunks, ignore_index=True)
    if train:
        fname = "train_processed.csv"
        pd.DataFrame({
            "feature": df.apply(f, axis=1), 
            "label": df.label}).to_csv(
            fname, header=None, index=None)
    else:
        fname = "test_processed.csv"
        pd.DataFrame({
            "query_id":df.iloc[:, 0],
            "query_title_id": df.iloc[:, 2],
            "feature": df.apply(f, axis=1)
        }).to_csv(
            fname, header=None, index=None)
    
    del df
    gc.collect()
    
    print("Saved to the file.")

# with Timer("Process feature"):
#     ProcessFeatures(train_data_file)
#     # 线上处理test数据时候需要调用
#     if not debug:
#         ProcessFeatures(test_data_file, train=False, names=ori_test_names)

## 数据预处理

In [15]:
def pr(o):
    print(o[:5])
    sys.exit(0)

In [8]:
train_file = "train_processed.csv"
test_file = "test_processed.csv"

In [17]:
# 看一下训练数据对不对
ProcessChunk(train_file, pr, names=train_names, chunk_size=500)

Reading chunk...
                                             feature  label
0  1427 5661 29788 1427 387 2299 372 22 1586 1025...      1
1  1427 5661 29788 1427 387 2299 372 22 1586 1025...      0
2  1427 5661 29788 1427 387 2299 372 22 1586 1025...      1
3  1427 5661 29788 1427 361 22 1374 279 1196 27 7...      0
4  1427 5661 29788 1427 361 22 1374 279 1196 27 7...      1


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [18]:
# 看一下测试数据对不对
if not debug:
    ProcessChunk(test_file, pr, names=test_names, chunk_size=500)

Reading chunk...
   query_id  query_title_id                                            feature
0         1               3  11202 184 50256 s 11202 184 2346 2527 274 383 ...
1         1               1  11202 184 50256 s 11202 184 21479 808 383 34 1...
2         1               4  11202 184 50256 s 11202 184 21479 15 227 383 3...
3         1               2  11202 184 50256 s 11202 184 274 383 34 1033 15...
4         2               1  1013 6811 14038 1156 s 361 628 1020 513 126 15...


SystemExit: 0

In [19]:
# 随机采样1000万数据
with Timer("Random sample"):
    if debug:
        chunk_size = 1000
    else:
        chunk_size = 5000000
    train_random_data = RandomSample(train_file, .1, names=train_names, chunk_size=chunk_size)
    train_data_num = len(train_random_data)

Random sample...
Finished sample.


In [10]:
# 全部采样所有的测试数据集
if not debug:
    test_data = ReadCSV(test_file, names=test_names, iterator=False)
    test_data_num = len(test_data)

## 模型

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
from sklearn.naive_bayes import GaussianNB

In [23]:
def tokenizer(text):
    return text.split(" ")

In [24]:
tfidf_model = TfidfVectorizer(
    tokenizer=tokenizer, 
    analyzer="word",
    ngram_range=(1,1))

In [25]:
def getTfidf(docs, train=True):
    model = tfidf_model
    if train:
        X = model.fit_transform(docs)
    else:
        X = model.transform(docs)
    return X

In [26]:
with Timer("Process tfidf"):
    if debug:
        X = getTfidf(train_random_data.feature)
    else:
        X = getTfidf(pd.concat(
                    [train_random_data.feature, 
                    test_data.feature], ignore_index=True))
# 获得训练集的labels
train_labels = train_random_data.label.to_list()

----->Started 'Process tfidf' block...
----->Finished 'Process tfidf' block, time used: 294.24s.


In [27]:
print(X[0])
print(train_labels[:5])

  (0, 68333)	0.6124487441553906
  (0, 587607)	0.39283261251154483
  (0, 201393)	0.34001356533545896
  (0, 751798)	0.023164603119026277
  (0, 58011)	0.1267661282622556
  (0, 348180)	0.18883673981744695
  (0, 148526)	0.13439756192819877
  (0, 25349)	0.060688019472363484
  (0, 154437)	0.157491915288034
  (0, 90847)	0.22868015706451533
  (0, 169005)	0.15909368115083394
  (0, 176935)	0.16504170475470367
  (0, 672644)	0.2116534036060197
  (0, 745365)	0.32637938105251185
[0, 0, 1, 1, 0]


In [31]:
def lrTrain(train=True, savefile="submit.csv"):
    
    global X
    
    if train:
        times = 1
    else:
        times = 1
    for i in range(times):
        
        print("Prepare features...")
        train_features = X[:train_data_num]
        test_features = X[train_data_num:]
        
        if train:
            split_train = int(train_data_num*0.7)
            train_X, test_X = train_features[:split_train], train_features[split_train:]
            train_y, test_y = train_labels[:split_train], train_labels[split_train:]
        else:
            train_X, test_X = train_features, test_features
            train_y = train_labels
            
        model = LogisticRegression(C=1, solver="liblinear")
        print("Fitting model...")
        pred = model.fit(train_X, train_y).predict_proba(test_X)[:, 1]
        
        if train:
            fpr, tpr, thresholds = metrics.roc_curve(test_y, pred, pos_label=1)
            res = metrics.auc(fpr, tpr)
            print("AUC:", res)
        else:
            res = []
            for i in range(test_data_num):
                tmp = test_data.iloc[i]
                res.append([tmp[0], tmp[1], pred[i]])
            print(res[:10])
            pd.DataFrame(np.array(res)).to_csv(savefile, index=False, header=None)
            
    for x in locals().keys():
        del locals()[x]
    gc.collect()

In [35]:
# 执行训练阶段
with Timer("lr train"):
    lrTrain(False, "submit1.csv")

----->Started 'lr train' block...
Prepare features...
Fitting model...
[[1, 3, 0.2428209424290729], [1, 1, 0.24676146383766], [1, 4, 0.2325881796989098], [1, 2, 0.23973552429059442], [2, 1, 0.3223512759810728], [2, 2, 0.3023735898439416], [2, 3, 0.35346710014962174], [3, 6, 0.05916494239325773], [3, 5, 0.10308273138490058], [3, 8, 0.11302303084530227]]
----->Finished 'lr train' block, time used: 2421.55s.


## 提交结果

In [36]:
!ls -al

total 16839588
drwxr-xr-x 3 kesci root        4096 May 29 09:14 .
drwxrwxrwx 8 root  root        4096 May 29 08:34 ..
-rwxr-xr-x 1 kesci users    7842088 May 25 15:15 kesci_submit
drwx------ 2 root  root       16384 May 26 04:29 lost+found
-rw-r--r-- 1 root  root          38 May 26 04:30 .sidecarDownloadOnce
-rw-r--r-- 1 kesci users  162605370 May 29 09:14 submit1.csv
-rw-r--r-- 1 kesci users  162604756 May 28 17:25 submit.csv
-rw-r--r-- 1 kesci users  455810675 May 28 13:36 test_processed.csv
-rw-r--r-- 1 kesci users  820818562 May 28 09:32 train_10.csv
-rw-r--r-- 1 kesci users  819693364 May 28 08:38 train_1.csv
-rw-r--r-- 1 kesci users  817448885 May 28 08:44 train_2.csv
-rw-r--r-- 1 kesci users  824399775 May 28 08:50 train_3.csv
-rw-r--r-- 1 kesci users  827260894 May 28 08:56 train_4.csv
-rw-r--r-- 1 kesci users  826309777 May 28 09:02 train_5.csv
-rw-r--r-- 1 kesci users  825891483 May 28 09:08 train_6.csv
-rw-r--r-- 1 kesci users  824105834 May 28 09:14 train_7.

In [22]:
res1 = pd.DataFrame({
    "query_id": test_data["query_id"],
    "query_title_id": test_data["query_title_id"],
    "label": 0.5
})
res1.to_csv("submit_baseline_0.5.csv", header=None, index=None)

In [23]:
submit_baseline_1 = ReadCSV("submit_baseline_0.5.csv", names=submit_names, iterator=False)
print(submit_baseline_1[:5])

   query_id  query_title_id  label
0         1               3    0.5
1         1               1    0.5
2         1               4    0.5
3         1               2    0.5
4         2               1    0.5


In [24]:
# 第一次执行需要
# !wget -nv -O kesci_submit https://www.heywhale.com/kesci_submit&&chmod +x kesci_submit
!./kesci_submit -token 490475a1ae106f67 -file submit_baseline_0.5.csv

Kesci Submit Tool
Result File: submit_baseline_0.5.csv (61.91 MiB)
Uploaded.       
Submit Success.
{"Stage":0,"Status":0,"ShownInHistory":true,"IsAucResult":true,"Selected":false,"_id":"5cf1218ef649f6002b61e431","Competition":"5cc51043f71088002c5b8840","Team":"5cda7d99b4de58002b94c3f8","UploadDate":"2019-05-31T12:43:58.536Z","Final":true,"Response":"","SubmissionResults":[],"IP":"52.83.252.251","FingerPrint":"","UserAgent":"Go-http-client/1.1","ResultFileName":"1559306632859435da4.csv","ResultFileRealName":"submit_baseline_0.5.csv","ResultFileSize":0,"ReviewInfos":[],"__v":0}

