In [50]:
#https://www.kaggle.com/jhansia/regression-models-analysis-on-the-wine-quality
#https://qiita.com/koshian2/items/1c0f781d244a6046b83e

In [335]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import optuna

In [336]:
# Directory that holds the input CSV files.
path = "input/"

# wine: training rows (includes the 'quality' target); wine_test: rows to predict.
wine, wine_test = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [337]:
# Number of missing values in each training column.
wine.isnull().sum(axis=0)

fixed acidity             0
volatile acidity          0
citric acid               0
residual sugar            0
chlorides                 0
free sulfur dioxide       0
total sulfur dioxide      0
density                 122
pH                      137
sulphates                 0
alcohol                   0
quality                   0
dtype: int64

In [338]:
# Mean imputation for the two columns that contain missing values.
# Both frames are filled with the TRAINING-set means, so the test set is
# imputed only with statistics learned from the training data.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form operates on a temporary under
# pandas Copy-on-Write and would leave the frame unfilled.
train_means = wine[['density', 'pH']].mean()
wine = wine.fillna(train_means)
wine_test = wine_test.fillna(train_means)

# Verify that no missing values remain in the training frame.
wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [339]:
# Feature engineering (disabled): everything below is wrapped in a bare string
# literal, so none of it executes; the cell merely echoes the string.
# The disabled code sketches two features:
#   * 'mso2'          - free sulfur dioxide divided by (1 + 10 ** (pH - 1.8))
#   * 'alc_ave_upper' - 1 when alcohol >= the training-set mean alcohol, else 0
# NOTE(review): if this is re-enabled, the wine_test 'mso2' line builds its
# denominator from wine['pH'] (the training frame) rather than wine_test['pH'];
# that looks unintended - confirm and fix before use.
'''
#酸化防止剤（SO2）の算出
wine['mso2']= wine['free sulfur dioxide']/(1+10**(wine['pH']-1.8))
wine_test['mso2']= wine_test['free sulfur dioxide']/(1+10**(wine['pH']-1.8))


wine['alc_ave_upper'] = wine['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)
wine_test['alc_ave_upper'] = wine_test['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)
'''

"\n#酸化防止剤（SO2）の算出\nwine['mso2']= wine['free sulfur dioxide']/(1+10**(wine['pH']-1.8))\nwine_test['mso2']= wine_test['free sulfur dioxide']/(1+10**(wine['pH']-1.8))\n\n\nwine['alc_ave_upper'] = wine['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)\nwine_test['alc_ave_upper'] = wine_test['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)\n"

In [340]:
# Preview the first five rows of the test frame.
wine_test.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,8.1,0.39,0.49,2.3,0.1,47.6,134.4,0.997019,3.39,1.0,9.7
1,7.0,0.5,0.49,5.6,0.06,25.8,121.7,0.9891,3.3,0.77,10.7
2,8.1,0.5,0.39,2.5,0.082,12.0,46.0,1.0048,3.38,0.61,10.5
3,6.4,0.38,0.33,3.3,0.046,12.0,52.6,0.9832,3.38,0.63,12.3
4,8.9,0.31,0.46,1.4,0.059,10.9,24.7,0.9931,3.37,0.79,10.2


モデル構築

In [341]:
# Target: the quality score. Features: every remaining training column.
y = wine.quality
X = wine.drop(['quality'],axis = 1)
# Select the test features by the training column names rather than by
# position, so the columns handed to the scaler are guaranteed to match the
# training features in both name and order (a missing column raises KeyError).
X_test = wine_test[X.columns]

In [342]:
# Hold out 30% of the training rows for evaluation; the seed fixes the split.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [343]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to the training split and to the test set to be predicted.
scaler = preprocessing.StandardScaler()
scaler.fit(train_x)

train_x_scaled, X_test_scaled = (
    scaler.transform(features) for features in (train_x, X_test)
)

In [344]:
# Apply the scaler fitted on the training split to the hold-out split.
test_x_scaled = scaler.transform(test_x)

In [345]:
# Candidate regressors to compare, as (display name, unfitted estimator) pairs.
# The tree-based estimators are seeded so the comparison is reproducible from
# run to run; the seed matches the one used for the train/hold-out split.
models = [
    ('DecisionTree', DecisionTreeRegressor(random_state=42)),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ('GradienBoost', GradientBoostingRegressor(random_state=42)),
    ('SVR', SVR()),
]
# Collects the model names in the order they are evaluated.
names = []

In [346]:
# One unshuffled 5-fold splitter shared by all candidates, so every model is
# scored on identical folds. random_state is deliberately not passed: it has
# no effect when shuffle is False, and scikit-learn >= 0.24 raises ValueError
# for that combination.
kfold = model_selection.KFold(n_splits=5)

for name, model in models:
    cv_results = model_selection.cross_val_score(
        model,
        train_x_scaled,
        train_y,
        cv=kfold,
        scoring='neg_mean_squared_error',
    )
    names.append(name)
    # The scorer reports negated MSE (higher is better); flip the sign so the
    # printed value is the ordinary mean squared error averaged over the folds.
    msg = "%s: %f" % (name, -1 * cv_results.mean())
    print(msg)

DecisionTree: 1.420229
RandomForest: 0.798819
GradienBoost: 0.768838
SVR: 0.755632


In [347]:
# Baseline: an SVR with default hyperparameters, fitted on the standardized
# training split and used to predict the standardized hold-out split.
model = SVR().fit(train_x_scaled, train_y)
pred_y = model.predict(test_x_scaled)

In [348]:
# Hold-out MSE of the baseline SVR. Arguments follow the metric's documented
# order (y_true, y_pred); MSE is symmetric, so the value itself is unchanged.
mean_squared_error(test_y, pred_y)

0.7476197790125285

In [311]:
def objective(trial):
    """Return the hold-out MSE of an SVR built from hyperparameters sampled by ``trial``.

    ``C`` is sampled log-uniformly from [1, 100] and ``epsilon`` log-uniformly
    from [0.1, 10]. The SVR is fitted on the standardized training split and
    scored on the standardized hold-out split, i.e. on the same feature scale
    the final models in this notebook are trained on.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean squared error on the hold-out split; lower is better, so the
        value is suitable for a minimizing study.
    """
    # Regularization parameter C
    svr_c = trial.suggest_loguniform('svr_c', 1e0, 1e2)
    # epsilon of the epsilon-insensitive loss
    epsilon = trial.suggest_loguniform('epsilon', 1e-1, 1e1)
    # Fit on standardized features so the tuned values match the final SVR's inputs.
    svr = SVR(C=svr_c, epsilon=epsilon)
    svr.fit(train_x_scaled, train_y)
    # Predict the hold-out split
    y_pred = svr.predict(test_x_scaled)
    # The score is a single hold-out MSE (not a cross-validated one).
    # NOTE(review): the same hold-out split is used elsewhere in the notebook to
    # report model quality, so scores of parameters tuned here are optimistic.
    return mean_squared_error(test_y, y_pred)

In [312]:
# Hyperparameter search (disabled): the study below is wrapped in a bare string
# literal, so it is not executed; the cell merely echoes the string.
# When enabled it would create a study with default settings, run `objective`
# for 100 trials and print the best parameters, best value and best trial.
'''
# optuna
study = optuna.create_study()
study.optimize(objective, n_trials=100)

# 最適解
print(study.best_params)
print(study.best_value)
print(study.best_trial)
'''
# Recorded search result (presumably study.best_params from an earlier run):
#'svr_c': 1.0003295622167319, 'epsilon': 0.26741161895009796

'\n# optuna\nstudy = optuna.create_study()\nstudy.optimize(objective, n_trials=100)\n\n# 最適解\nprint(study.best_params)\nprint(study.best_value)\nprint(study.best_trial)\n'

In [349]:
# Final candidates, each fitted on the standardized training split.
# model1 uses the C / epsilon values recorded from the hyperparameter search;
# the two ensemble models are seeded for reproducibility.
model1 = SVR(C=1.0003295622167319, epsilon=0.26741161895009796).fit(train_x_scaled, train_y)
model2 = GradientBoostingRegressor(random_state=42).fit(train_x_scaled, train_y)
model3 = RandomForestRegressor(random_state=42).fit(train_x_scaled, train_y)

# Hold-out predictions of each model.
svr_pred = model1.predict(test_x_scaled)
gbr_pred = model2.predict(test_x_scaled)
rfr_pred = model3.predict(test_x_scaled)  # computed, but not part of the blend below

# Blend: unweighted average of the SVR and gradient-boosting predictions.
pred_y = (svr_pred + gbr_pred) / 2

# Hold-out MSE of the blend, with arguments in the metric's documented
# (y_true, y_pred) order; MSE is symmetric, so the value is unchanged.
mean_squared_error(test_y, pred_y)

0.7284565142218978

In [318]:
# Predict the test set with the same two-model blend (SVR + gradient boosting).
svr_test_pred = model1.predict(X_test_scaled)
gbr_test_pred = model2.predict(X_test_scaled)
sub = (svr_test_pred + gbr_test_pred) / 2

# Load the existing submission file, set its 'quality' column to the blended
# predictions and write the result to a new CSV without the index.
submission = pd.read_csv("model/submission.csv")
submission['quality'] = sub
submission.to_csv('model/SVM_gbm.csv', index=False)

In [None]:
# NOTE(review): bare float literal in a cell that was never executed; presumably
# a score noted by hand - confirm where it came from or remove the cell.
0.7269720822910998