In [1]:
# 查看当前挂载的数据集目录
!ls /home/kesci/input/

bytedance


In [2]:
# 查看数据集
!ls -lh /home/kesci/input/bytedance/first-round/

total 9.0G
-rw-r--r-- 1 kesci 1000 426M May 22 04:05 test.csv
-rw-r--r-- 1 kesci 1000 8.6G May 16 20:15 train.csv


In [3]:
# 查看个人持久化工作区文件
!ls /home/kesci/work/

kesci_submit	    train_10.csv  train_3.csv  train_6.csv  train_9.csv
lost+found	    train_1.csv   train_4.csv  train_7.csv  train_processed.csv
test_processed.csv  train_2.csv   train_5.csv  train_8.csv


In [4]:
# 查看当前kernerl下的package
# !pip list --format=columns

In [5]:
# 显示cell运行时长
%load_ext klab-autotime

## 导入各种库

In [3]:
import pandas as pd
import numpy as np
import time
import random
import gc
import sys

In [18]:
!ls -lh /home/kesci/input/bytedance/first-round/
data_dir = "/home/kesci/input/bytedance/first-round/"
train_data_file = data_dir + "train.csv"
test_data_file = data_dir + "test.csv"
ori_train_names = ["query_id", "query", "query_title_id", "title", "label"]
ori_test_names = ["query_id", "query", "query_title_id", "title"]

total 9.0G
-rw-r--r-- 1 kesci 1000 426M May 22 04:05 test.csv
-rw-r--r-- 1 kesci 1000 8.6G May 16 20:15 train.csv
time: 428 ms


## 辅助函数

In [19]:
# 快速打印
def pr(o):
    print(o)
    print()

time: 670 µs


In [4]:
# 读取CSV文件
def ReadCSV(filename, names, sep=",", iterator=True):
    # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#pandas.read_csv
    return pd.read_csv(
        filename, 
        names=names,
        sep=sep,
        iterator=iterator
    )

In [21]:
# 批量读入数据，并apply处理函数
def ProcessChunk(filename, func, names, chunk_size=5000000):
    reader = ReadCSV(filename, names)
    while True:
        try:
            print("Reading chunk...")
            func(reader.get_chunk(chunk_size))
        except StopIteration:
            print("Finished process.")
            return
# def handle(x):
#     print(x)
#     print()
# ProcessChunk(train_data_file, handle, names=ori_train_names, chunk_size=5)

time: 1.92 ms


In [22]:
# 蓄水池读文件, 本方法巨慢无比!
def ReservoirSample(filename, count, names=ori_train_names):
    print("Reservoir sample...")
    reader = ReadCSV(filename, names)
    try:
        # 对于前面count个元素完全选择
        res = reader.get_chunk(count)
    except StopIteration:
        print("Count exceeds the file size.")
        return res
    i = count
    while True:
        try:
            i += 1
            # https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.randint.html
            tmp = random.randint(1, i+1)
            datapoint = reader.get_chunk(1).iloc[0]
            if tmp <= count:
                res.iloc[tmp - 1] = datapoint
        except StopIteration:
            print("Finished sample.")
            return res
# tmp = ReservoirSample(train_data_file, 5)
# print(tmp)

time: 1.75 ms


In [23]:
# 按照rate比例从每个chunk中随机采样样本
def RandomSample(filename, rate, chunk_size=1000000, random_state=None, names=ori_train_names):
    print("Random sample...")
    reader = ReadCSV(filename, names)
    chunks = []
    while True:
        try:
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
            chunks.append(reader.get_chunk(chunk_size).sample(
                n=int(chunk_size*rate), 
                random_state=random_state)
            )
        except StopIteration:
            print("Finished sample.")
            break
    # 删除无用引用, https://blog.csdn.net/jiangjiang_jian/article/details/79140742
    # for x in locals().keys():
    #     if x != "res":
    #         # 会被放入内存池？
    #         del locals()[x]
    # gc.collect()
    return pd.concat(chunks, ignore_index=True)
# tmp = RandomSample(train_data_file, .5, chunk_size=5)
# print(tmp)

time: 2.14 ms


In [24]:
# 重新组合特征
def ProcessFeatures(filename, train=True, names=ori_train_names):
    reader = ReadCSV(filename, names)
    f = lambda x: x[1] + " s " + x[3]
    count, chunk_size = 1, 10000000
    print("Start processing...")
    chunks = []
    while True:
        try:
            chunks.append(reader.get_chunk(chunk_size))
        except StopIteration:
            print("Finished process.")
            break
    df = pd.concat(chunks, ignore_index=True)
    if train:
        fname = "train_processed.csv"
        pd.DataFrame({
            "feature": df.apply(f, axis=1), 
            "label": df.label}).to_csv(
            fname, header=None, index=None)
    else:
        fname = "test_processed.csv"
        pd.DataFrame({
            "query_id":df.iloc[:, 0],
            "query_title_id": df.iloc[:, 2],
            "feature": df.apply(f, axis=1)
        }).to_csv(
            fname, header=None, index=None)
    
    del df
    gc.collect()
    
    print("Saved to the file.")

# ProcessFeatures(train_data_file)
# ProcessFeatures(test_data_file, train=False, names=ori_test_names)

time: 3.89 ms


In [7]:
train_file = "train_processed.csv"
test_file = "test_processed.csv"
train_names = ["feature", "label"]
test_names = ["query_id", "query_title_id", "feature"]
submit_names = ["query_id", "query_title_id", "label"]
!ls -al

total 16680792
drwxr-xr-x 3 kesci root        4096 May 28 17:25 .
drwxrwxrwx 7 root  root        4096 May 29 00:35 ..
-rwxr-xr-x 1 kesci users    7842088 May 25 15:15 kesci_submit
drwx------ 2 root  root       16384 May 26 04:29 lost+found
-rw-r--r-- 1 root  root          38 May 26 04:30 .sidecarDownloadOnce
-rw-r--r-- 1 kesci users  162604756 May 28 17:25 submit.csv
-rw-r--r-- 1 kesci users  455810675 May 28 13:36 test_processed.csv
-rw-r--r-- 1 kesci users  820818562 May 28 09:32 train_10.csv
-rw-r--r-- 1 kesci users  819693364 May 28 08:38 train_1.csv
-rw-r--r-- 1 kesci users  817448885 May 28 08:44 train_2.csv
-rw-r--r-- 1 kesci users  824399775 May 28 08:50 train_3.csv
-rw-r--r-- 1 kesci users  827260894 May 28 08:56 train_4.csv
-rw-r--r-- 1 kesci users  826309777 May 28 09:02 train_5.csv
-rw-r--r-- 1 kesci users  825891483 May 28 09:08 train_6.csv
-rw-r--r-- 1 kesci users  824105834 May 28 09:14 train_7.csv
-rw-r--r-- 1 kesci users  821288738 May 28 09:20 train_8.

In [26]:
def pc(o):
    print(o[:5])
    sys.exit(0)

time: 829 µs


In [27]:
# 看一下训练数据对不对
ProcessChunk(train_file, pc, names=train_names, chunk_size=500)

Reading chunk...
                                             feature  label
0  1427 5661 29788 1427 387 2299 372 22 1586 1025...      1
1  1427 5661 29788 1427 387 2299 372 22 1586 1025...      0
2  1427 5661 29788 1427 387 2299 372 22 1586 1025...      1
3  1427 5661 29788 1427 361 22 1374 279 1196 27 7...      0
4  1427 5661 29788 1427 361 22 1374 279 1196 27 7...      1


SystemExit: 0

time: 15.3 ms


In [28]:
# 看一下测试数据对不对
ProcessChunk(test_file, pc, names=test_names, chunk_size=500)

Reading chunk...
   query_id  query_title_id                                            feature
0         1               3  11202 184 50256 s 11202 184 2346 2527 274 383 ...
1         1               1  11202 184 50256 s 11202 184 21479 808 383 34 1...
2         1               4  11202 184 50256 s 11202 184 21479 15 227 383 3...
3         1               2  11202 184 50256 s 11202 184 274 383 34 1033 15...
4         2               1  1013 6811 14038 1156 s 361 628 1020 513 126 15...


SystemExit: 0

time: 20 ms


## 数据预处理

In [None]:
# train_data.label.describe()
# num_label_1 = len(train_data[train_data.label == 1])
# num_label_0 = len(train_data[train_data.label == 0])
# print("Number of label == 1:", num_label_1)
# print("Number of label == 0:", num_label_0)

In [29]:
# 随机采样1000万数据
train_random_data = RandomSample(train_file, .1, names=train_names, chunk_size=5000000)

Random sample...
Finished sample.
time: 3min 1s


In [30]:
# 全部采样所有的测试数据集
test_data = ReadCSV(test_file, names=test_names, iterator=False)

time: 11.9 s


In [31]:
train_data_num = len(train_random_data)
test_data_num = len(test_data)
print(train_data_num, test_data_num)
print(train_random_data[:10])
print(test_data[:10])

10000000 5000000
                                             feature  label
0  874 8 54 20737 54679 46081 s 874 36 2274 24 20...      0
1  6139 26927 41527 s 1301 6139 49504 585 5043 27...      0
2  1257 944 8071 s 2228 13641 15 419 36 195 502 1...      0
3  325 296093 1332 9776 s 4156 2216 7951 37 8135 ...      0
4  119694 53 48 250 s 698 119694 15 53 10 698 217...      0
5  47973 27108 2727 s 309 3192 389 79 103 256 155...      0
6  974060 532814 s 62 20198 1470 36 209194 206990...      1
7  2097 687 1005 3264 16141 32644 4590 423 627 s ...      0
8           46 478 7231 s 486 28 27 12 82 16 478 602      0
9  96 53384 s 96 357 50212 7939 953 14078 27 96 5...      0
   query_id  query_title_id                                            feature
0         1               3  11202 184 50256 s 11202 184 2346 2527 274 383 ...
1         1               1  11202 184 50256 s 11202 184 21479 808 383 34 1...
2         1               4  11202 184 50256 s 11202 184 21479 15 227 383 3...
3      

## 模型

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
from sklearn.naive_bayes import GaussianNB

time: 956 ms


In [33]:
def tokenizer(text):
  return text.split(" ")

time: 983 µs


In [34]:
tfidf_model = TfidfVectorizer(
    tokenizer=tokenizer, 
    analyzer="word",
    ngram_range=(1,1))

time: 7.58 ms


In [35]:
def getTfidf(docs, train=True):
    model = tfidf_model
    if train:
      X = model.fit_transform(docs)
    else:
      X = model.transform(docs)
    return X

time: 1.62 ms


In [None]:
# def nbTrain(df):
#     print("Training...")
#     X = df.feature.map(lambda x: np.array(x.split(" "))).to_numpy()
#     y = df.label.to_numpy()
#     print(X[:5])
#     print(y[:5])
#     nb_clf.partial_fit(X, y)
# chunkHandle("./train_processed.csvnames=["feature", "label"], func=nbTrain, chunk_size=10)

In [36]:
X = getTfidf(pd.concat(
            [train_random_data.feature, 
            test_data.feature], ignore_index=True))

time: 5min 41s


In [37]:
train_labels = train_random_data.label.to_list()

time: 175 ms


In [38]:
print(X[0])
print(train_labels[:5])

  (0, 712992)	0.2873778372856431
  (0, 684916)	0.10008331735207013
  (0, 538213)	0.10971830232974773
  (0, 193073)	0.4769898120143884
  (0, 543286)	0.5164040257318053
  (0, 473279)	0.2626808054323113
  (0, 752339)	0.023854450297103133
  (0, 375202)	0.06298545607809798
  (0, 219404)	0.18871640017748803
  (0, 235569)	0.09147385424625833
  (0, 222793)	0.09254083735601909
  (0, 534403)	0.1582716777842799
  (0, 726996)	0.17268244494339047
  (0, 55895)	0.13877930212603423
  (0, 684396)	0.23094166139323344
  (0, 674031)	0.16667534925048588
  (0, 272795)	0.03690570783929572
  (0, 125381)	0.14187660540414757
  (0, 528107)	0.2118296051490332
  (0, 54980)	0.23125287231441868
[0, 0, 0, 0, 0]
time: 2.35 ms


In [39]:
def lrTrain(train=True):
    
    global X
    
    if train:
        times = 1
    else:
        times = 1
    for i in range(times):
        
        print("Prepare features...")
        train_features = X[:train_data_num]
        test_features = X[train_data_num:]
        
        if train:
            split_train = int(train_data_num*0.7)
            train_X, test_X = train_features[:split_train], train_features[split_train:]
            train_y, test_y = train_labels[:split_train], train_labels[split_train:]
        else:
            train_X, test_X = train_features, test_features
            train_y = train_labels
            
        model = LogisticRegression(C=5, solver="liblinear")
        print("Fitting model...")
        pred = model.fit(train_X, train_y).predict_proba(test_X)[:, 1]
        
        if train:
            fpr, tpr, thresholds = metrics.roc_curve(test_y, pred, pos_label=1)
            res = metrics.auc(fpr, tpr)
            print("AUC:", res)
        else:
            res = []
            for i in range(test_data_num):
                tmp = test_data.iloc[i]
                res.append([tmp[0], tmp[1], pred[i]])
            print(res[:10])
            pd.DataFrame(np.array(res)).to_csv("submit.csv", index=False, header=None)
            
    for x in locals().keys():
        del locals()[x]
    gc.collect()

time: 5.69 ms


In [40]:
start_time = time.time()
lrTrain(False)
end_time = time.time()
print("Cost time:", end_time-start_time)

Prepare features...
Fitting model...
[[1, 3, 0.24107588031361993], [1, 1, 0.23910313117610016], [1, 4, 0.23452963764325457], [1, 2, 0.24272918398353285], [2, 1, 0.33945773901157755], [2, 2, 0.31615734778734095], [2, 3, 0.33748379548334057], [3, 6, 0.06051411402195174], [3, 5, 0.1143809328712366], [3, 8, 0.111900702631097]]
Cost time: 5115.495464801788
time: 1h 25min 15s


In [1]:
!ls -al

total 16680792
drwxr-xr-x 3 kesci root        4096 May 28 17:25 .
drwxrwxrwx 6 root  root        4096 May 29 00:33 ..
-rwxr-xr-x 1 kesci users    7842088 May 25 15:15 kesci_submit
drwx------ 2 root  root       16384 May 26 04:29 lost+found
-rw-r--r-- 1 root  root          38 May 26 04:30 .sidecarDownloadOnce
-rw-r--r-- 1 kesci users  162604756 May 28 17:25 submit.csv
-rw-r--r-- 1 kesci users  455810675 May 28 13:36 test_processed.csv
-rw-r--r-- 1 kesci users  820818562 May 28 09:32 train_10.csv
-rw-r--r-- 1 kesci users  819693364 May 28 08:38 train_1.csv
-rw-r--r-- 1 kesci users  817448885 May 28 08:44 train_2.csv
-rw-r--r-- 1 kesci users  824399775 May 28 08:50 train_3.csv
-rw-r--r-- 1 kesci users  827260894 May 28 08:56 train_4.csv
-rw-r--r-- 1 kesci users  826309777 May 28 09:02 train_5.csv
-rw-r--r-- 1 kesci users  825891483 May 28 09:08 train_6.csv
-rw-r--r-- 1 kesci users  824105834 May 28 09:14 train_7.csv
-rw-r--r-- 1 kesci users  821288738 May 28 09:20 train_8.

In [8]:
df = ReadCSV("submit.csv", names=submit_names, iterator=False)

In [14]:
# print(df.describe())

           query_id  query_title_id         label
count  5.000000e+06    5.000000e+06  5.000000e+06
mean   4.900062e+05    4.316898e+00  2.582458e-01
std    2.826971e+05    3.747229e+00  8.450985e-02
min    1.000000e+00    1.000000e+00  7.327361e-03
25%    2.453730e+05    2.000000e+00  2.015213e-01
50%    4.900410e+05    3.000000e+00  2.512828e-01
75%    7.347750e+05    6.000000e+00  3.063874e-01
max    9.795640e+05    2.000000e+01  9.180106e-01


In [15]:
!wget -nv -O kesci_submit https://www.heywhale.com/kesci_submit&&chmod +x kesci_submit

wget: /opt/conda/lib/libcrypto.so.1.0.0: no version information available (required by wget)
wget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)
wget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)
2019-05-29 00:40:19 URL:https://www.heywhale.com/kesci_submit [7842088/7842088] -> "kesci_submit" [1]


In [16]:
!./kesci_submit -token 490475a1ae106f67 -file submit.csv

Kesci Submit Tool
Result File: submit.csv (155.07 MiB)
Uploaded.       
Submit Success.
{"Stage":0,"Status":0,"ShownInHistory":true,"IsAucResult":true,"Selected":false,"_id":"5cedd5383ad144002b5948a8","Competition":"5cc51043f71088002c5b8840","Team":"5cda7d99b4de58002b94c3f8","UploadDate":"2019-05-29T00:41:28.021Z","Final":true,"Response":"","SubmissionResults":[],"IP":"52.83.230.159","FingerPrint":"","UserAgent":"Go-http-client/1.1","ResultFileName":"1559090468906e8a90c.csv","ResultFileRealName":"submit.csv","ResultFileSize":0,"ReviewInfos":[],"__v":0}

