In [20]:
#Importing required packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import lightgbm as lgb

%matplotlib inline

In [2]:
# Directory that holds the competition CSV files.
path = "input/"

# Read the training frame and the held-out test frame in one pass.
wine, wine_test = (pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Impute missing 'density' and 'pH' values with the TRAINING-set mean.
# The same training mean is applied to the test frame so no test statistics
# leak into preprocessing.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# emits a chained-assignment warning on recent pandas, and leaves the frame
# unchanged once copy-on-write is enabled.
for col in ('density', 'pH'):
    train_mean = wine[col].mean()  # NaNs are skipped by default
    wine[col] = wine[col].fillna(train_mean)
    wine_test[col] = wine_test[col].fillna(train_mean)

In [4]:
# Target column first, then every remaining column as the feature matrix.
y = wine['quality']
X = wine.drop(columns='quality')

RandomForestRegressorのモデル構築

In [5]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Fit a 200-tree random-forest regressor on the training split.
# `random_state` is pinned (same seed as the train/validation split) so the
# forest, and therefore the final submission, is reproducible on re-run.
rfr = RandomForestRegressor(n_estimators=200, random_state=42)
rfr.fit(X_train, y_train)

# Predict the unlabelled test frame.
# NOTE(review): assumes wine_test has exactly the feature columns of X — confirm.
pred_rfr = rfr.predict(wine_test)

LightGBMのモデル構築

In [30]:
# Wrap the splits in LightGBM datasets. The validation set references the
# training set so both share the same feature binning.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [31]:
# LightGBM training configuration: gradient-boosted trees fitted as a
# regression problem and evaluated with mean squared error.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'mse'},
    num_leaves=30,
    learning_rate=0.1,
    feature_fraction=0.9,   # fraction of features sampled per tree
    bagging_fraction=0.8,   # fraction of rows sampled when bagging
    bagging_freq=5,         # re-sample rows every 5 iterations
)

In [32]:
# Train for at most 100 boosting rounds, stopping once the validation metric
# has not improved for 10 consecutive rounds.
#
# Early stopping is requested through the `lgb.early_stopping` callback rather
# than the `early_stopping_rounds=` keyword: the keyword was removed from
# `lgb.train` in LightGBM 4, while the callback is accepted by both older and
# newer releases. `valid_sets` is passed as a list, the documented form.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[1]	valid_0's l2: 0.966953
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.92518
[3]	valid_0's l2: 0.896719
[4]	valid_0's l2: 0.861284
[5]	valid_0's l2: 0.834183
[6]	valid_0's l2: 0.822787
[7]	valid_0's l2: 0.808808
[8]	valid_0's l2: 0.806466
[9]	valid_0's l2: 0.798962
[10]	valid_0's l2: 0.79474
[11]	valid_0's l2: 0.788926
[12]	valid_0's l2: 0.784403
[13]	valid_0's l2: 0.781443
[14]	valid_0's l2: 0.776763
[15]	valid_0's l2: 0.774434
[16]	valid_0's l2: 0.771743
[17]	valid_0's l2: 0.77005
[18]	valid_0's l2: 0.7676
[19]	valid_0's l2: 0.764695
[20]	valid_0's l2: 0.761532
[21]	valid_0's l2: 0.758802
[22]	valid_0's l2: 0.758114
[23]	valid_0's l2: 0.760792
[24]	valid_0's l2: 0.760669
[25]	valid_0's l2: 0.764335
[26]	valid_0's l2: 0.763762
[27]	valid_0's l2: 0.763058
[28]	valid_0's l2: 0.763226
[29]	valid_0's l2: 0.763104
[30]	valid_0's l2: 0.763817
[31]	valid_0's l2: 0.762302
[32]	valid_0's l2: 0.76339
Early stopping, best iteration is:
[22]	valid_0's l2: 0.758114

モデルアンサンブル

In [41]:
# Ensemble: average the random-forest and LightGBM predictions on the test set.
rfr_pred = rfr.predict(wine_test)

# Predict with the best iteration found by early stopping, stated explicitly
# so the result does not depend on the library's default for `num_iteration`.
lgb_pred = gbm.predict(wine_test, num_iteration=gbm.best_iteration)

# NOTE: despite the name, these are continuous regression outputs (predicted
# quality scores), not class probabilities.
pred_proba = (rfr_pred + lgb_pred) / 2

# Nearest-integer quality label for every test row.
# The previous `pred_proba.argmax()` returned the *position* of the single
# largest prediction in the array, which is not a quality label.
pred = pred_proba.round().astype(int)

In [43]:
# Display the averaged ensemble predictions computed as (rfr_pred + lgb_pred) / 2.
pred_proba

array([5.32981939, 5.2760144 , 5.89694076, 6.22879541, 6.11406453,
       5.42279691, 5.17397551, 5.8559369 , 6.18734551, 5.3830113 ,
       6.63285631, 5.17733386, 6.40419314, 6.41762477, 5.82999983,
       5.12524084, 5.15258139, 6.13706802, 6.13113834, 5.90877455,
       5.69139947, 5.20303677, 5.75931396, 4.87105604, 5.47892423,
       5.8425545 , 5.01913304, 5.92508546, 5.38379752, 6.02511917,
       5.57614933, 5.57445369, 5.62901587, 5.36271193, 4.78621725,
       5.40871176, 4.92589609, 6.30450505, 5.39167125, 5.24478981,
       5.43108631, 6.59194294, 5.5993616 , 5.12233177, 5.29934884,
       5.66898372, 5.51398897, 5.51254483, 5.45856234, 6.25923435,
       5.47648566, 6.56182379, 5.58857596, 5.68626875, 5.39017003,
       5.03776453, 6.06080781, 6.09216101, 5.66821908, 5.80888456,
       6.66198232, 5.02633399, 5.87141469, 5.04409069, 5.00757946,
       6.45025061, 5.21927205, 5.30049289, 5.15438178, 5.25968253,
       5.44286489, 5.86688515, 5.77043383, 5.53579491, 5.40282

In [46]:
# Load the submission template, fill in the blended predictions as the
# 'quality' column, and write the result without the index column.
submission = pd.read_csv("submission.csv").assign(quality=pred_proba)
submission.to_csv("submission2.csv", index=False)