In [1]:
import numpy as np
import lightgbm as lgb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by pandas / LightGBM.
warnings.simplefilter('ignore')

In [2]:
# Directory holding the input CSV files; file names are appended directly,
# so the trailing slash is required.
path = "input/"

# `wine` carries the `quality` target column; `wine_test` is the set to predict on.
wine, wine_test = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

## 1. 前処理  

In [3]:
# Fill missing values with the column mean.
# The mean is always taken from the training frame so that the test frame is
# imputed with the same statistic the model is trained on.
# The filled column is assigned back to the frame: calling
# `fillna(..., inplace=True)` on a column selection (`df[col]`) is chained
# assignment and does not update the parent DataFrame under pandas
# Copy-on-Write.
for column in ('density', 'pH'):
    train_mean = wine[column].mean()  # NaNs are skipped by `mean()`
    wine[column] = wine[column].fillna(train_mean)
    wine_test[column] = wine_test[column].fillna(train_mean)

In [4]:
# Per-column count of remaining missing values in the training frame.
wine.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

## 2. モデル作成

In [5]:
# Feature matrix: every training column except the target.
X = wine.drop(columns='quality').values
# Regression target.
y = wine['quality'].values

# Features of the unlabelled set that predictions are generated for.
# Kept under its own name because `X_test` is bound to the hold-out part of
# the training data by the train/test split that follows, which would
# otherwise silently overwrite this array.
X_submission = wine_test.values

In [6]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Wrap the training split for LightGBM.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
# The validation set references the training Dataset.
lgb_eval = lgb.Dataset(
    data=X_test,
    label=y_test,
    reference=lgb_train,
)

In [8]:
# LightGBM training configuration, passed unchanged to `lgb.train`.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'mse'},
    num_leaves=30,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
)

In [9]:
# Train for at most 100 rounds, stopping once the validation score has not
# improved for 10 consecutive rounds.
# Early stopping is requested through a callback: the `early_stopping_rounds=`
# keyword of `lgb.train` was removed in LightGBM 4.0, while the callback is
# accepted by both older and current releases.
train_callbacks = [lgb.early_stopping(stopping_rounds=10)]
if hasattr(lgb, 'log_evaluation'):
    # Releases that expose `log_evaluation` need it to keep printing the
    # validation score of every round.
    train_callbacks.append(lgb.log_evaluation(period=1))

gbm = lgb.train(params,
            lgb_train,
            num_boost_round=100,
            valid_sets=[lgb_eval],
            callbacks=train_callbacks)

[1]	valid_0's l2: 0.947771
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.909509
[3]	valid_0's l2: 0.883641
[4]	valid_0's l2: 0.856905
[5]	valid_0's l2: 0.841117
[6]	valid_0's l2: 0.829531
[7]	valid_0's l2: 0.815502
[8]	valid_0's l2: 0.804693
[9]	valid_0's l2: 0.794684
[10]	valid_0's l2: 0.786709
[11]	valid_0's l2: 0.779041
[12]	valid_0's l2: 0.767304
[13]	valid_0's l2: 0.76594
[14]	valid_0's l2: 0.761005
[15]	valid_0's l2: 0.756511
[16]	valid_0's l2: 0.748882
[17]	valid_0's l2: 0.747803
[18]	valid_0's l2: 0.743151
[19]	valid_0's l2: 0.743334
[20]	valid_0's l2: 0.740392
[21]	valid_0's l2: 0.742712
[22]	valid_0's l2: 0.745317
[23]	valid_0's l2: 0.748539
[24]	valid_0's l2: 0.753608
[25]	valid_0's l2: 0.755138
[26]	valid_0's l2: 0.752677
[27]	valid_0's l2: 0.752911
[28]	valid_0's l2: 0.753066
[29]	valid_0's l2: 0.753999
[30]	valid_0's l2: 0.752365
Early stopping, best iteration is:
[20]	valid_0's l2: 0.740392


In [10]:
# Predict on the unlabelled frame, using only the trees up to the round that
# training recorded as best.
best_round = gbm.best_iteration
y_pred = gbm.predict(wine_test, num_iteration=best_round)

In [12]:
# Load the submission template and set its `quality` column to the predictions.
submission = pd.read_csv("model/submission.csv").assign(quality=y_pred)
submission

Unnamed: 0,quality
0,5.487625
1,5.335005
2,5.648344
3,6.005948
4,6.335438
5,5.261724
6,5.397087
7,5.922401
8,6.054386
9,5.156132


In [14]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='model/model_gbm.csv', index=False)