In [15]:
#Importing required packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import lightgbm as lgb

%matplotlib inline

In [16]:
# Directory that holds the competition CSV files.
path = "input/"

# Read the labelled training set and the unlabelled test set in one pass;
# the generator variable does not leak into the notebook namespace.
wine, wine_test = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [17]:
# Impute missing 'density' and 'pH' values with the TRAINING-set mean.
# The test frame deliberately reuses the training mean so that no statistic
# computed from the test data influences preprocessing.
#
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# emits a FutureWarning on pandas 2.x and leaves the frame untouched once
# Copy-on-Write is active.
wine['density'] = wine['density'].fillna(wine['density'].mean())
wine_test['density'] = wine_test['density'].fillna(wine['density'].mean())

wine['pH'] = wine['pH'].fillna(wine['pH'].mean())
wine_test['pH'] = wine_test['pH'].fillna(wine['pH'].mean())

In [18]:
# Separate the regression target from the feature matrix.
y = wine['quality']
X = wine.drop(columns=['quality'])

RandomForestRegressorのモデル構築

In [19]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit a 200-tree random forest on the training split.
# random_state is pinned (same seed as the train/validation split) so that
# bootstrap sampling and feature selection are reproducible; without it every
# Restart & Run All produces a different model and different predictions.
rfr = RandomForestRegressor(n_estimators=200, random_state=42)
rfr.fit(X_train, y_train)

# Predictions for the competition test frame.
pred_rfr = rfr.predict(wine_test)

LightGBMのモデル構築

In [21]:
# Wrap the training split in LightGBM's Dataset container.
lgb_train = lgb.Dataset(data=X_train, label=y_train)

# The validation Dataset references the training one so both share the same
# feature binning.
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [22]:
# LightGBM training configuration: gradient-boosted trees, regression
# objective, evaluated with mean squared error.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'mse'},
    num_leaves=30,
    learning_rate=0.1,
    feature_fraction=0.9,   # fraction of features sampled per tree
    bagging_fraction=0.8,   # fraction of rows sampled when bagging
    bagging_freq=5,         # re-sample the bag every 5 iterations
)

In [23]:
# Train for up to 100 boosting rounds, stopping once the validation metric has
# not improved for 10 consecutive rounds.
# Early stopping is requested through the `lgb.early_stopping` callback: the
# `early_stopping_rounds=` keyword of `lgb.train` was removed in LightGBM 4.0,
# whereas the callback is accepted by both older and current releases.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[1]	valid_0's l2: 0.966953
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.92518
[3]	valid_0's l2: 0.896719
[4]	valid_0's l2: 0.861284
[5]	valid_0's l2: 0.834183
[6]	valid_0's l2: 0.822787
[7]	valid_0's l2: 0.808808
[8]	valid_0's l2: 0.806466
[9]	valid_0's l2: 0.798962
[10]	valid_0's l2: 0.79474
[11]	valid_0's l2: 0.788926
[12]	valid_0's l2: 0.784403
[13]	valid_0's l2: 0.781443
[14]	valid_0's l2: 0.776763
[15]	valid_0's l2: 0.774434
[16]	valid_0's l2: 0.771743
[17]	valid_0's l2: 0.77005
[18]	valid_0's l2: 0.7676
[19]	valid_0's l2: 0.764695
[20]	valid_0's l2: 0.761532
[21]	valid_0's l2: 0.758802
[22]	valid_0's l2: 0.758114
[23]	valid_0's l2: 0.760792
[24]	valid_0's l2: 0.760669
[25]	valid_0's l2: 0.764335
[26]	valid_0's l2: 0.763762
[27]	valid_0's l2: 0.763058
[28]	valid_0's l2: 0.763226
[29]	valid_0's l2: 0.763104
[30]	valid_0's l2: 0.763817
[31]	valid_0's l2: 0.762302
[32]	valid_0's l2: 0.76339
Early stopping, best iteration is:
[22]	valid_0's l2: 0.75

モデルアンサンブル

In [24]:
# Predict the test frame with each fitted model.
rfr_pred = rfr.predict(wine_test)
lgb_pred = gbm.predict(wine_test)

# Simple ensemble: unweighted mean of the two regressors. Despite the name,
# `pred_proba` holds averaged regression outputs, not class probabilities.
pred_proba = (rfr_pred + lgb_pred) / 2

# Integer quality score per row, obtained by rounding the blended prediction.
# The previous `pred_proba.argmax()` returned only the *index* of the largest
# prediction (a single scalar), which is not a prediction at all.
# NOTE(review): rounding is assumed to be the intended discretisation — confirm.
pred = pred_proba.round().astype(int)

In [25]:
# Display the blended predictions (mean of the random-forest and LightGBM outputs).
pred_proba

array([5.31331939, 5.3292644 , 5.84719076, 6.27279541, 6.10706453,
       5.39204691, 5.20122551, 5.8376869 , 6.27584551, 5.3435113 ,
       6.56685631, 5.15383386, 6.31319314, 6.40112477, 5.88099983,
       5.09874084, 5.20083139, 6.13631802, 6.13438834, 5.91202455,
       5.69864947, 5.20653677, 5.84356396, 4.87680604, 5.49192423,
       5.8955545 , 4.90688304, 5.94708546, 5.38229752, 6.09086917,
       5.57089933, 5.52570369, 5.68476587, 5.39371193, 4.75771725,
       5.38321176, 4.99539609, 6.34475505, 5.35942125, 5.26428981,
       5.40808631, 6.56819294, 5.6446116 , 5.10808177, 5.35584884,
       5.62173372, 5.56298897, 5.47704483, 5.49606234, 6.28273435,
       5.46973566, 6.52832379, 5.58157596, 5.72176875, 5.40617003,
       5.04351453, 6.10605781, 6.07941101, 5.67321908, 5.89938456,
       6.60748232, 5.01608399, 5.91866469, 5.00584069, 5.04057946,
       6.44825061, 5.24377205, 5.25974289, 5.17363178, 5.28518253,
       5.45111489, 5.80188515, 5.81318383, 5.49254491, 5.46507

In [27]:
# Load the submission template and fill its 'quality' column with the blended
# predictions.
submission = pd.read_csv("model/submission.csv").assign(quality=pred_proba)

# Persist the ensemble submission without the DataFrame index.
submission.to_csv("model/model_rfr_gbm.csv", index=False)