# 3-4. モデル作成

In [None]:
%matplotlib inline
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score

In [None]:
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## データの準備

In [None]:
# Load the pre-converted train / test splits from the local data directory.
train_path = "data/converted_train.csv"
test_path = "data/converted_test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training frame as a quick sanity check on the load.
train.head()

In [None]:
# Preview the first rows of the test frame as a quick sanity check on the load.
test.head()

In [None]:
# Split each frame into its feature matrix (every column except "label")
# and its target vector (the "label" column).
target = "label"
X_train, y_train = train.drop(columns=[target]), train[target]
X_test, y_test = test.drop(columns=[target]), test[target]

## モデルの訓練

In [None]:
# LightGBM via its scikit-learn wrapper: binary objective, GBDT boosting,
# 50 estimators with at most 10 leaves each.
param = dict(
    objective="binary",
    boosting_type="gbdt",
    num_leaves=10,
    n_estimators=50,
)
lgb_model = lgb.LGBMClassifier(**param)

In [None]:
# Random Forest
# A random forest is stochastic; pinning random_state makes the fitted model
# (and therefore the reported accuracy) identical on every
# "Restart Kernel -> Run All".
random_forest = RandomForestClassifier(random_state=42)

In [None]:
# Logistic Regression
# Instantiated with no arguments, i.e. scikit-learn's default hyperparameters.
logistic_regression = LogisticRegression()

In [None]:
# Support Vector Machine
# probability=True is required: the test section calls model.predict_proba(),
# which SVC only provides when probability estimates are enabled at
# construction time (the default, probability=False, makes that call fail).
# Enabling it adds an internal cross-validated calibration step, so fitting
# is slower than with a plain SVC().
svc = SVC(probability=True)

In [None]:
# Choose the classifier that the following cells train and evaluate.
# To try another model, comment out the active assignment and uncomment
# exactly one of the alternatives below.
model = lgb_model
# model = random_forest
# model = logistic_regression
# model = svc

In [None]:
# Fit the selected classifier on the training features and labels
# (the original training set; undersampling is only applied further below).
model.fit(X_train, y_train)

## テスト

In [None]:
# Keep only the second column of predict_proba, i.e. the probability of the
# second class in model.classes_ (the positive class if labels are 0/1 —
# TODO confirm the label encoding).
y_pred = model.predict_proba(X_test)[:, 1]

In [None]:
# Display the predicted probabilities for inspection.
y_pred

In [None]:
# Turn probabilities into hard predictions with a 0.5 cut-off, then score them.
predicted_positive = y_pred > 0.5
accuracy_score(y_test, predicted_positive)

## アンダーサンプリング
labelの偏りが見られるため、アンダーサンプリングを試します。

In [None]:
# Count training rows per label value to inspect the class balance.
y_train.value_counts()

In [None]:
# Randomly drop rows so the label distribution is rebalanced.
# random_state is fixed so the same rows are kept on every run; otherwise the
# resampled set, the trained booster and the saved model file would all
# change between "Restart Kernel -> Run All" executions.
sampler = RandomUnderSampler(random_state=42)
X_us, y_us = sampler.fit_resample(X_train, y_train)

In [None]:
# Count rows per label value again to confirm the effect of undersampling.
y_us.value_counts()

上ではLightGBMのscikit-learn APIを使用しましたが、ここではLightGBM独自のAPIを使用しています。

In [None]:
# Wrap the undersampled training data in LightGBM's native Dataset container.
train_data = lgb.Dataset(X_us, label=y_us)
# Alternative: train on the original (non-resampled) data instead.
# train_data = lgb.Dataset(X_train, label=y_train)
# The test split is used as the validation set and is tied to the training
# Dataset through reference=train_data.
# NOTE(review): metrics recorded on this set are test-set metrics; using them
# for model selection or early stopping would leak test information.
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [None]:
# Hyperparameters for LightGBM's native training API; both AUC and binary
# log-loss are evaluated on the validation set every round.
param = dict(
    objective="binary",
    num_leaves=10,
    metric=["auc", "binary_logloss"],
)
num_boost_round = 50

# record_evaluation stores the per-iteration metric history in this dict,
# keyed by validation-set name and then by metric name.
train_result = dict()
callbacks = [
    lgb.log_evaluation(),
    lgb.record_evaluation(train_result),
    # lgb.early_stopping(stopping_rounds=5),
]

booster = lgb.train(
    params=param,
    train_set=train_data,
    num_boost_round=num_boost_round,
    valid_sets=[valid_data],
    valid_names=["Test"],
    callbacks=callbacks,
)

In [None]:
# Score the test set with the trained booster, threshold the outputs at 0.5,
# and report accuracy against the true labels.
y_pred = booster.predict(X_test)
predicted_positive = y_pred > 0.5
accuracy_score(y_test, predicted_positive)

In [None]:
# Plot the trained booster's feature importance on a 16x6 figure.
lgb.plot_importance(booster, figsize=(16, 6))

In [None]:
# Binary log-loss recorded on the "Test" validation set, one point per
# boosting iteration.
fig = plt.figure(figsize=(16, 6))
ax1 = fig.add_subplot(1, 1, 1)

test_logloss = train_result["Test"]["binary_logloss"]
ax1.plot(test_logloss, label="test loss")

# Axis labels
ax1.set_ylabel("binary logloss")
ax1.set_xlabel("Iteration")

plt.legend()
plt.show()

In [None]:
# AUC recorded on the "Test" validation set, one point per boosting iteration.
fig = plt.figure(figsize=(16, 6))
ax1 = fig.add_subplot(111)

# Set labels (the y-axis shows AUC here, not log-loss)
ax1.set_xlabel("Iteration")
ax1.set_ylabel("AUC")

# Plot the result
ax1.plot(train_result["Test"]["auc"], label="auc")
plt.legend()
plt.show()

次回以降も使うため、LightGBMモデルを保存します。

In [None]:
# Persist the trained booster in LightGBM's text format for later notebooks.
# Create the target directory first so the save also works on a fresh
# checkout where "model/" does not exist yet.
model_path = Path("model/lightgbm_model.txt")
model_path.parent.mkdir(parents=True, exist_ok=True)
booster.save_model(str(model_path))