In [2]:
from pathlib import Path
import re

import numpy as np
import pandas as pd

subject_pattern = re.compile(r'^Subject: ')
number_pattern = re.compile(r'[+-]?(?:\d+\.?\d*|\.\d+)')

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
np.random.seed(42)

In [4]:
def replace_number_to_symbol(string):
    return number_pattern.sub('__NUMBER__', string)

In [5]:
def print_score(cv_result):
    print('fit time:  ', cv_result['fit_time'])
    print('recall:    ', cv_result['test_r'], np.mean(cv_result['test_r']))
    print('precision: ', cv_result['test_p'], np.mean(cv_result['test_p']))
    print('f1:        ', cv_result['test_f'], np.mean(cv_result['test_f']))

In [6]:
train_master = pd.read_table('train_master.tsv')

In [7]:
mails = []
for path in sorted(Path('.').glob('./train2/train_*.txt')):
    with open(path, 'r') as f:
        subject = subject_pattern.sub(
            '', f.readline().strip()),
        body = f.read()
    mails.append(
        {
            'file_name': path.name,
            'subject': subject[0],
            'body': body
        }
    )

In [8]:
train_datum = pd.merge(
    train_master,
    pd.DataFrame.from_dict(mails),
    on='file_name',
    how='left'
)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

In [10]:
body_vec = CountVectorizer(
    stop_words='english',
    preprocessor=replace_number_to_symbol,
)

subject_vec = CountVectorizer(
    stop_words='english',
    preprocessor=replace_number_to_symbol
)

In [11]:
from sklearn.model_selection import StratifiedKFold, cross_validate

In [12]:
body_features = body_vec.fit_transform(train_datum.body.values)
subject_features = subject_vec.fit_transform(train_datum.subject.values)

train_X = hstack([body_features, subject_features], 'csr')
train_y = train_datum.label.values

In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True)
cv_metrics = {'p': 'precision', 'r': 'recall', 'f': 'f1'}

### 単純ベイズでやってみる

In [13]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
result = cross_validate(
    X=train_X,
    y=train_y,
    estimator=mnb,
    scoring=cv_metrics,
    cv=skf
)
print_score(result)

fit time:   [0.00953674 0.00993919 0.00957203 0.00536442 0.00531197]
recall:     [0.95333333 0.96666667 0.99328859 0.96644295 0.98657718] 0.9732617449664429
precision:  [0.92857143 0.96666667 0.94871795 0.97297297 0.99324324] 0.9620344520344519
f1:         [0.94078947 0.96666667 0.9704918  0.96969697 0.98989899] 0.9675087806451049


### NBSvm

In [14]:
from nbsvm import NbSvmClassifier

In [15]:
nbsvm = NbSvmClassifier()

In [16]:
result = cross_validate(
    X=train_X,
    y=train_y,
    estimator=nbsvm,
    scoring=cv_metrics,
    cv=skf
)
print_score(result)

fit time:   [0.26386189 0.17120886 0.11504126 0.1214292  0.13589835]
recall:     [0.96666667 0.94       0.97315436 0.98657718 0.95973154] 0.9652259507829978
precision:  [0.9602649  0.9527027  0.97315436 0.96078431 0.90506329] 0.9503939141291585
f1:         [0.96345515 0.94630872 0.97315436 0.97350993 0.93159609] 0.9576048523460059


In [17]:
from sklearn.model_selection import cross_val_score

from skopt import gp_minimize
from skopt.space import Categorical
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Space

def objective(params):
    C, dual = params
    nbsvm.set_params(C=C, dual=dual)
    return -np.mean(
        cross_val_score(
            estimator=nbsvm,
            X=train_X,
            y=train_y,
            cv=skf,
            scoring='f1'
        ))

  from numpy.core.umath_tests import inner1d


In [18]:
space = Space(
    [
        Real(low=10**-5, high=10**5, prior='log-uniform', name='C'),
        Categorical(categories=[True, False], name='dual')
    ]
)

In [19]:
local_opt_params = gp_minimize(
    func=objective, dimensions=space, n_calls=100, n_jobs=-1)
print(local_opt_params.x)

[0.043328547181116804, True]


In [20]:
local_opt_params.fun

-0.9659039791200043

In [21]:
nbsvm = NbSvmClassifier(
    C=0.12658588843758548, dual=True)

In [22]:
mnb.fit(X=train_X, y=train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
test_mails = []
for path in sorted(Path('.').glob('./test2/test_*.txt')):
    with open(path, 'r') as f:
        subject = subject_pattern.sub(
            '', f.readline().strip()),
        body = f.read()
    test_mails.append(
        {
            'file_name': path.name,
            'subject': subject[0],
            'body': body
        }
    )
test_datum = pd.DataFrame.from_dict(test_mails)

In [24]:
body_features = body_vec.transform(test_datum.body.values)
subject_features = subject_vec.transform(test_datum.body.values)

test_X = hstack([body_features, subject_features], 'csr')

In [25]:
predict_y = mnb.predict(test_X)

In [26]:
submit_table = pd.concat(
    [test_datum.file_name, pd.Series(predict_y).rename('predict')],
    axis=1
)

In [27]:
submit_table.head()

Unnamed: 0,file_name,predict
0,test_0000.txt,1
1,test_0001.txt,0
2,test_0002.txt,1
3,test_0003.txt,0
4,test_0004.txt,0


In [28]:
submit_table.to_csv('submit.csv', index=False, header=False)

### LightGBM を使ってみる

In [14]:
from lightgbm import LGBMClassifier

In [15]:
lgbmc = LGBMClassifier(n_jobs=-1)

def objective_lgbmc(params):
    n_estimators, learning_rate, num_leaves, \
    colsample_bytree, subsample, max_depth, reg_alpha, \
    reg_lambda, min_split_gain, min_child_weight = params
    
    lgbmc.set_params(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        max_depth=max_depth,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
    )
    return -np.mean(
        cross_val_score(
            estimator=lgbmc,
            X=train_X.astype(np.float32),
            y=train_y,
            cv=skf,
            scoring='f1'
        ))

In [31]:
space = Space(
    [
        Integer(low=100, high=5000),
        Real(low=10**-5, high=10**5, prior='log-uniform'),
        Integer(low=10, high=500),
        Real(low=10**-5, high=10**0, prior='log-uniform'),
        Real(low=10**-5, high=10**0, prior='log-uniform'),
        Integer(low=3, high=50),
        Real(low=10**-5, high=10**1, prior='log-uniform'),
        Real(low=10**-5, high=10**1, prior='log-uniform'),
        Real(low=10**-5, high=10**1, prior='log-uniform'),
        Real(low=10**-5, high=10**3, prior='log-uniform'),
    ]
)

x0 = [
    10000, 0.02, 34, 0.94, 0.87, 8,
    0.0412, 0.073, 0.02, 39.0
]

In [33]:
import warnings
warnings.filterwarnings('ignore')

lgbmc_params = gp_minimize(
    func=objective_lgbmc, dimensions=space, n_calls=100, n_jobs=-1)
print(lgbmc_params.x)

[100, 100000.0, 375, 1.0, 0.04359337870787355, 24, 1e-05, 10.0, 0.0014360946758322727, 0.0003135254916689524]


In [20]:
import warnings
warnings.filterwarnings('ignore')

lgbmc.set_params(
    n_estimators=100,
    learning_rate=100000.0,
    num_leaves=375,
    colsample_bytree=1.0,
    subsample=0.04359337870787355,
    max_depth=24,
    reg_alpha=1e-05,
    reg_lambda=10,
    min_split_gain=0.0014360946758322727,
    min_child_weight=0.0003135254916689524,
)

result = cross_validate(
    X=train_X.astype(np.float32),
    y=train_y,
    estimator=lgbmc,
    scoring=cv_metrics,
    cv=skf
)
print_score(result)

fit time:   [0.23914623 0.18578434 0.19642329 0.18406606 0.19972396]
recall:     [0.85333333 0.89333333 0.87919463 0.87248322 0.90604027] 0.8808769574944071
precision:  [0.73142857 0.71657754 0.71195652 0.72222222 0.72972973] 0.7223829170453212
f1:         [0.78769231 0.79525223 0.78678679 0.79027356 0.80838323] 0.7936776219524639
