In [1]:
from pathlib import Path
import re
import warnings

import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    import pandas as pd
    
    from scipy import sparse
    
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import StratifiedKFold, cross_validate
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.preprocessing import LabelBinarizer

import mailparse

np.random.seed(42)

In [2]:
def print_score(cv_result):
    """スコア表示."""
    print('fit time:  ', cv_result['fit_time'])
    print('recall:    ', cv_result['test_r'], np.mean(cv_result['test_r']))
    print('precision: ', cv_result['test_p'], np.mean(cv_result['test_p']))
    print('f1:        ', cv_result['test_f'], np.mean(cv_result['test_f']))


def load_mails(data_type):
    """mailを読み込む."""
    mails = []
    for path in sorted(Path('.').glob('./{0}2/{0}_*.txt'.format(data_type))):
        with open(path, 'r') as f:
            mail = mailparse.parse(f.readlines())
            mail['file_name'] = path.name
        mails.append(mail)
    return mails

In [3]:
body_vec = CountVectorizer(
    stop_words='english',
)

subject_vec = CountVectorizer(
    stop_words='english',
)

attachment_vec = LabelBinarizer()

In [4]:
skf = StratifiedKFold(n_splits=5, shuffle=True)
cv_metrics = {'p': 'precision', 'r': 'recall', 'f': 'f1'}

In [5]:
train_master = pd.read_table('train_master.tsv')
train_datum = pd.merge(
    train_master,
    pd.DataFrame.from_dict(load_mails('train')),
    on='file_name',
    how='left'
)

In [6]:
train_datum.head()

Unnamed: 0,file_name,label,attachment_ext,body,body_length,subject,subject_length
0,train_0000.txt,0,no attachment,the equistar deal 156337 is the sale and swing...,199,re : buyback / deficiency deals worksheet\n,7
1,train_0001.txt,0,exe,- - - - - original message - - - - -\nfrom : a...,129,fw : stress relief\n,4
2,train_0002.txt,1,no attachment,"dear friend ,\nplease don ' t be surprised to ...",533,from mrs . juliana\n,4
3,train_0003.txt,1,no attachment,our offer :\nwindows xp pro office xp professi...,96,[ wrenches ] 68 % off dreamweaver mx 2004 flie...,11
4,train_0004.txt,0,no attachment,name home pager\ngeorge grant 281 - 282 - 9084...,123,y 2 k - texas log\n,6


In [6]:
body_features = body_vec.fit_transform(train_datum.body.values)
body_length = train_datum.body_length.values.reshape(-1, 1)

subject_features = subject_vec.fit_transform(train_datum.subject.values)
subject_length = train_datum.subject_length.values.reshape(-1, 1)

attachment_ext_features = attachment_vec.fit_transform(train_datum.attachment_ext.values)


train_X = sparse.hstack(
    (
        body_features,
        subject_features,
        body_length,
        subject_length,
        attachment_ext_features,
    ), 'csr')
train_y = train_datum.label.values

### そのまま単純ベイズ(一番良さそう)

In [7]:
mnb = MultinomialNB()

In [9]:
result = cross_validate(
    X=train_X,
    y=train_y,
    estimator=mnb,
    scoring=cv_metrics,
    cv=skf
)
print_score(result)

fit time:   [0.03351665 0.01115704 0.01160693 0.00970101 0.0108645 ]
recall:     [0.96666667 0.95333333 0.97315436 0.95973154 0.95973154] 0.962523489932886
precision:  [0.96666667 0.97278912 0.97315436 0.9862069  0.99305556] 0.9783745193672624
f1:         [0.96666667 0.96296296 0.97315436 0.97278912 0.97610922] 0.9703364645418121


### パラメータ最適化

In [10]:
from sklearn.model_selection import cross_val_score

from skopt import gp_minimize
from skopt.space import Categorical
from skopt.space import Real
from skopt.space import Space

def objective(params):
    alpha, fit_prior = params
    mnb.set_params(alpha=alpha, fit_prior=fit_prior)
    return -np.mean(
        cross_val_score(
            estimator=mnb,
            X=train_X,
            y=train_y,
            cv=skf,
            scoring='f1'
        ))

In [11]:
space = Space(
    [
        Real(low=10**-5, high=10**5, prior='log-uniform', name='alpha'),
        Categorical(categories=[True, False], name='fit_prior')
    ]
)
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    local_opt_params = gp_minimize(
        func=objective, dimensions=space, n_calls=100, n_jobs=-1)
print(local_opt_params.x)

[0.06174894983917462, False]


In [12]:
print(local_opt_params.fun)

-0.9802887842939411


In [8]:
mnb.set_params(alpha=0.06174894983917462, fit_prior=False)

MultinomialNB(alpha=0.06174894983917462, class_prior=None, fit_prior=False)

In [14]:
result = cross_validate(
    X=train_X,
    y=train_y,
    estimator=mnb,
    scoring=cv_metrics,
    cv=skf
)
print_score(result)

fit time:   [0.03855538 0.01277256 0.012007   0.01228666 0.0089612 ]
recall:     [0.99333333 1.         0.99328859 1.         0.99328859] 0.9959821029082774
precision:  [0.94904459 0.96774194 0.96732026 0.94904459 0.97368421] 0.9613671158845236
f1:         [0.97068404 0.98360656 0.98013245 0.97385621 0.9833887 ] 0.9783335920530772


In [9]:
mnb.fit(X=train_X, y=train_y)

MultinomialNB(alpha=0.06174894983917462, class_prior=None, fit_prior=False)

In [10]:
test_datum = pd.DataFrame.from_dict(load_mails('test'))

In [11]:
test_body_features = body_vec.transform(test_datum.body.values)
test_body_length = test_datum.body_length.values.reshape(-1, 1)

test_subject_features = subject_vec.transform(test_datum.subject.values)
test_subject_length = test_datum.subject_length.values.reshape(-1, 1)

test_attachment_ext_features = attachment_vec.transform(test_datum.attachment_ext.values)

test_X = sparse.hstack(
    (
        test_body_features,
        test_subject_features,
        test_body_length,
        test_subject_length,
        test_attachment_ext_features,
    ), 'csr')

In [12]:
predict_y = mnb.predict(test_X)

In [13]:
submit_table = pd.concat(
    [test_datum.file_name, pd.Series(predict_y).rename('predict')],
    axis=1
)
submit_table.to_csv('submit.csv', index=False, header=False)

### まとめ

#### 特徴量

以下を結合した特徴ベクトルを使用した。

- 件名（BoW）
- 本文（BoW）
- 件名長（単語数）
- 本文長（単語数）
- 添付ファイル拡張子（one-hot）

今回のタスクにおいては n-gram 表現は精度向上に寄与しなかった。

#### 前処理

stopword による不要語除去のみ行った。
以下を試みたが精度向上は見られなかった。

- 数値表現のシンボル化
- 特徴選択（ロジスティクス回帰, RandomForest）
- SVDによる次元圧縮

#### 学習アルゴリズム

多項モデル単純ベイズ分類器による分類が一番高い精度を見せた。
NBSVM, SVM, RandomForest, LightGBMに依る分類も試みたが思ったとおりの精度が出なかった。
（もしかしたら特徴量の標準化を行えば精度が出たかも知れない）