In [50]:
#https://www.kaggle.com/jhansia/regression-models-analysis-on-the-wine-quality
#https://qiita.com/koshian2/items/1c0f781d244a6046b83e
#https://qiita.com/mizutaku17/items/6bd8e778b6e9ec7b162d

In [498]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import model_selection
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import optuna

In [553]:
path = "input/"

wine = pd.read_csv(path + 'train.csv')
wine_test = pd.read_csv(path + 'test.csv')

In [554]:
#平均値埋め
wine['density'].fillna(wine['density'].mean(), inplace=True)
wine_test['density'].fillna(wine['density'].mean(), inplace=True)

wine['pH'].fillna(wine['pH'].mean(), inplace=True)
wine_test['pH'].fillna(wine['pH'].mean(), inplace=True)

wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [555]:
#feature engineering
'''
#酸化防止剤（SO2）の算出
wine['mso2']= wine['free sulfur dioxide']/(1+10**(wine['pH']-1.8))
wine_test['mso2']= wine_test['free sulfur dioxide']/(1+10**(wine['pH']-1.8))

#alcが平均よりも高いサンプルに1，ほか0
wine['alc_ave_upper'] = wine['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)
wine_test['alc_ave_upper'] = wine_test['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)
'''

"\n#酸化防止剤（SO2）の算出\nwine['mso2']= wine['free sulfur dioxide']/(1+10**(wine['pH']-1.8))\nwine_test['mso2']= wine_test['free sulfur dioxide']/(1+10**(wine['pH']-1.8))\n\n#alcが平均よりも高いサンプルに1，ほか0\nwine['alc_ave_upper'] = wine['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)\nwine_test['alc_ave_upper'] = wine_test['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)\n"

In [556]:
def outlier_iqr(df):

    for i in range(len(df.columns)):

        # 列を抽出する
        col = df.iloc[:,i]

        # 四分位数
        q1 = col.describe()['25%']
        q3 = col.describe()['75%']
        iqr = q3 - q1 #四分位範囲

        # 外れ値の基準点
        outlier_min = q1 - (iqr) * 2.3
        outlier_max = q3 + (iqr) * 2.3

        # 範囲から外れている値を除く
        col[col < outlier_min] = None
        col[col > outlier_max] = None

    return df

In [557]:
wine = outlier_iqr(wine).dropna(how = 'any', axis = 0)

モデル構築

In [558]:
y = wine.quality
X = wine.drop(['quality'],axis = 1)
X_test = wine_test.iloc[:, 0:].values

#split test data
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size=0.3, random_state = 42)

In [559]:
#scaling

scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
X_test_scaled = scaler.transform(X_test)
test_x_scaled = scaler.transform(test_x)

In [560]:
models = []
models.append(('RandomForest', RandomForestRegressor()))
models.append(('GradienBoost', GradientBoostingRegressor()))
models.append(('SVR', SVR()))
models.append(('Linear', linear_model.LinearRegression()))
names = []

In [561]:
for name,model in models:
    kfold = model_selection.KFold(n_splits=5,random_state=42)
    cv_results = model_selection.cross_val_score(model,train_x_scaled,train_y, cv= kfold, scoring = 'neg_mean_squared_error')
    names.append(name)
    msg  = "%s: %f" % (name, -1*(cv_results).mean())
    print(msg)

RandomForest: 0.779109
GradienBoost: 0.776035
SVR: 0.756405
Linear: 0.696855


In [562]:
model = linear_model.LinearRegression()
model.fit(train_x_scaled,train_y)
pred_y = model.predict(test_x_scaled)

In [563]:
mean_squared_error(pred_y,test_y)

0.6691140824408425

In [311]:
def objective(trial):
    svr_c = trial.suggest_loguniform('svr_c', 1e0, 1e2)
    epsilon = trial.suggest_loguniform('epsilon', 1e-1, 1e1)
    
    svr = SVR(C=svr_c, epsilon=epsilon)
    svr.fit(train_x, train_y)
    
    y_pred = svr.predict(test_x)
    return mean_squared_error(test_y, y_pred)

In [312]:
'''
# optuna
study = optuna.create_study()
study.optimize(objective, n_trials=100)

# 最適解
print(study.best_params)
print(study.best_value)
print(study.best_trial)
'''
#'svr_c': 1.0003295622167319, 'epsilon': 0.26741161895009796

'\n# optuna\nstudy = optuna.create_study()\nstudy.optimize(objective, n_trials=100)\n\n# 最適解\nprint(study.best_params)\nprint(study.best_value)\nprint(study.best_trial)\n'

In [581]:
model1 = SVR(C=1.0003295622167319, epsilon=0.26741161895009796)
model2 = GradientBoostingRegressor(random_state=42)
model3 = RandomForestRegressor(random_state=42)
model4 = linear_model.LinearRegression()

model1.fit(train_x_scaled,train_y)
model2.fit(train_x_scaled,train_y)
model3.fit(train_x_scaled,train_y)
model4.fit(train_x_scaled,train_y)

svr_pred = model1.predict(test_x_scaled)
gbr_pred = model2.predict(test_x_scaled)
rfr_pred = model3.predict(test_x_scaled)
lin_pred = model4.predict(test_x_scaled)

pred_y = (svr_pred + gbr_pred + lin_pred + rfr_pred) / 4

mean_squared_error(pred_y,test_y)

0.6309534499813314

In [578]:
sub = (model1.predict(X_test_scaled) + model2.predict(X_test_scaled)+model3.predict(X_test_scaled)+model4.predict(X_test_scaled))/4
submission = pd.read_csv("model/submission.csv")
submission['quality'] = sub
submission.to_csv('model/4Model_removeOutlier.csv', index=False)