In [1]:
import sys

sys.path.append('../src/')

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

from preprocessing import clean_html, tokenize
from utils import load_dataset

### データロード

In [3]:
x, y = load_dataset('../data/amazon_reviews_multilingual_JP_v1_00.tsv', n=5000)
x = [clean_html(text, strip=True) for text in x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
vectorizer = TfidfVectorizer(tokenizer=tokenize)
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

### チューニング

In [5]:
parameters ={
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.03, 0.1, 0.3, 0.7, 1, 1.01, 1.03, 1.07, 1.1, 1.3, 1.7, 3]
}
lr = LogisticRegression(solver='liblinear')
clf = GridSearchCV(lr, parameters, cv=5, n_jobs=-1)
clf.fit(x_train_vec, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'), n_jobs=-1,
             param_grid={'C': [0.01, 0.03, 0.1, 0.3, 0.7, 1, 1.01, 1.03, 1.07,
                               1.1, 1.3, 1.7, 3],
                         'penalty': ['l1', 'l2']})

In [7]:
best_clf = clf.best_estimator_
print(clf.best_params_)
print('Accuracy(best): {:.4f}'.format(clf.best_score_))

y_pred = best_clf.predict(x_test_vec)
score = accuracy_score(y_test, y_pred)
print('Accuracy(test): {:.4f}'.format(score))

{'C': 3, 'penalty': 'l2'}
Accuracy(best): 0.8331
Accuracy(test): 0.8540


### Optunaによるパラメータチューニング

In [11]:
import optuna

def objective(trial):
    # データロード
    x, y = load_dataset('../data/amazon_reviews_multilingual_JP_v1_00.tsv', n=5000)
    x = [clean_html(text, strip=True) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    
    vectorizer = TfidfVectorizer(tokenizer=tokenize)
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)
    
    # ハイパーパラメータの設定
    C = trial.suggest_uniform('C', 0.01, 3)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    
    # モデル
    clf = LogisticRegression(solver='liblinear', penalty=penalty, C=C)
    scores = cross_val_score(clf, x_train_vec, y_train, cv=5)
    
    return score

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3)

[32m[I 2021-03-27 22:01:17,541][0m A new study created in memory with name: no-name-8938a92e-b5e3-42cd-a11a-9f9d16088221[0m
[32m[I 2021-03-27 22:04:30,075][0m Trial 0 finished with value: 0.854 and parameters: {'C': 0.8268377291792394, 'penalty': 'l2'}. Best is trial 0 with value: 0.854.[0m
[32m[I 2021-03-27 22:07:40,844][0m Trial 1 finished with value: 0.854 and parameters: {'C': 0.11348850328069093, 'penalty': 'l1'}. Best is trial 0 with value: 0.854.[0m
[32m[I 2021-03-27 22:10:51,068][0m Trial 2 finished with value: 0.854 and parameters: {'C': 0.4669577707899371, 'penalty': 'l1'}. Best is trial 0 with value: 0.854.[0m


In [14]:
trial = study.best_trial
print('Accuracy: {:.4f}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

Accuracy: 0.8540
Best hyperparameters: {'C': 0.8268377291792394, 'penalty': 'l2'}


In [15]:
clf = LogisticRegression(solver='liblinear', **trial.params)
clf.fit(x_train_vec, y_train)

y_pred = clf.predict(x_test_vec)
score = accuracy_score(y_test, y_pred)
print('Accuracy(test): {:.4f}'.format(score))

Accuracy(test): 0.8420
