In [1]:
import numpy as np
import lightgbm as lgb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the input CSV files.
path = "input/"

# `wine` is the training table, `wine_test` the table to predict on.
wine, wine_test = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

## 1. 前処理  

In [3]:
# Fill missing `density` / `pH` values with the training-set column mean; the
# same training means are applied to the test set.
#
# The filled frames are assigned back instead of calling
# `wine[col].fillna(..., inplace=True)`: that form is chained assignment on a
# column selection, which does not update the parent frame when pandas
# Copy-on-Write is active. Rebinding also keeps this cell idempotent on re-run.
impute_values = wine[['density', 'pH']].mean()

# Passing a Series fills only the columns named in its index.
wine = wine.fillna(impute_values)
wine_test = wine_test.fillna(impute_values)

In [4]:
# Per-column count of remaining missing values in the training table.
wine.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

## 2. モデル作成

In [5]:
# Feature matrix (every column except the target) and target vector, as NumPy arrays.
X = wine.drop(columns='quality').values
y = wine['quality'].values

# Feature array of the rows to predict on. It is deliberately NOT called
# `X_test`: the train/validation split in the next cell assigns `X_test`, which
# would silently overwrite this array.
X_submit = wine_test.values

In [6]:
# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable.
# Note: `X_test` / `y_test` here are the held-out part of the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Wrap both splits as LightGBM datasets; the evaluation set references the
# training set.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [8]:
# LightGBM settings passed to `lgb.train`: gradient-boosted trees with a
# regression objective, evaluated with MSE (shown as "l2" in the training log).
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'mse'},
    num_leaves=30,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
)

In [9]:
# Train for up to 100 boosting rounds, stopping once the validation metric has
# not improved for 10 consecutive rounds.
#
# Early stopping is requested through a callback because the
# `early_stopping_rounds=` keyword of `lgb.train()` was removed in LightGBM 4.0;
# the callback form is accepted by older releases as well.
train_callbacks = [lgb.early_stopping(stopping_rounds=10)]
if hasattr(lgb, 'log_evaluation'):
    # Releases that provide this callback only print the per-iteration metric
    # when it is requested explicitly.
    train_callbacks.append(lgb.log_evaluation(period=1))

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_eval],
                callbacks=train_callbacks)

[1]	valid_0's l2: 0.932362
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.897083
[3]	valid_0's l2: 0.868895
[4]	valid_0's l2: 0.845331
[5]	valid_0's l2: 0.826173
[6]	valid_0's l2: 0.818764
[7]	valid_0's l2: 0.801793
[8]	valid_0's l2: 0.794325
[9]	valid_0's l2: 0.783086
[10]	valid_0's l2: 0.772373
[11]	valid_0's l2: 0.769905
[12]	valid_0's l2: 0.760014
[13]	valid_0's l2: 0.759115
[14]	valid_0's l2: 0.756329
[15]	valid_0's l2: 0.754059
[16]	valid_0's l2: 0.749994
[17]	valid_0's l2: 0.749722
[18]	valid_0's l2: 0.750721
[19]	valid_0's l2: 0.749541
[20]	valid_0's l2: 0.748828
[21]	valid_0's l2: 0.744899
[22]	valid_0's l2: 0.747662
[23]	valid_0's l2: 0.746208
[24]	valid_0's l2: 0.745631
[25]	valid_0's l2: 0.743858
[26]	valid_0's l2: 0.743556
[27]	valid_0's l2: 0.743745
[28]	valid_0's l2: 0.745046
[29]	valid_0's l2: 0.74688
[30]	valid_0's l2: 0.747737
[31]	valid_0's l2: 0.747775
[32]	valid_0's l2: 0.749892
[33]	valid_0's l2: 0.752472
[34]	valid_0's l2: 0.7530

In [16]:
# Predict on the test table using the iteration selected by early stopping.
y_pred = gbm.predict(
    wine_test,
    num_iteration=gbm.best_iteration,
)

In [17]:
# Load the submission file and set its `quality` column to the predictions.
submission = pd.read_csv("submission.csv").assign(quality=y_pred)
submission

Unnamed: 0,quality
0,5.385026
1,5.304676
2,5.690298
3,6.282969
4,6.205937
5,5.372531
6,5.640007
7,5.856510
8,6.278181
9,5.204278


In [33]:
# Write the submission frame to disk without the DataFrame index.
# Note: this is the same file name the frame was read from above.
submission.to_csv(
    'submission.csv',
    index=False,
)