In [50]:
#https://www.kaggle.com/jhansia/regression-models-analysis-on-the-wine-quality
#https://qiita.com/koshian2/items/1c0f781d244a6046b83e
#https://qiita.com/mizutaku17/items/6bd8e778b6e9ec7b162d

In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna

In [57]:
path = "input/"

wine = pd.read_csv(path + 'train.csv')
wine_test = pd.read_csv(path + 'test.csv')

In [58]:
#平均値埋め
wine['density'].fillna(wine['density'].mean(), inplace=True)
wine_test['density'].fillna(wine['density'].mean(), inplace=True)

wine['pH'].fillna(wine['pH'].mean(), inplace=True)
wine_test['pH'].fillna(wine['pH'].mean(), inplace=True)

wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [59]:
wine = wine.drop("residual sugar", axis = 1)
wine_test = wine_test.drop("residual sugar", axis=1)

In [4]:
#feature engineering
'''
#酸化防止剤（SO2）の算出
wine['mso2']= wine['free sulfur dioxide']/(1+10**(wine['pH']-1.8))
wine_test['mso2']= wine_test['free sulfur dioxide']/(1+10**(wine['pH']-1.8))

#alcが平均よりも高いサンプルに1，ほか0
wine['alc_ave_upper'] = wine['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)
wine_test['alc_ave_upper'] = wine_test['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)
'''

"\n#酸化防止剤（SO2）の算出\nwine['mso2']= wine['free sulfur dioxide']/(1+10**(wine['pH']-1.8))\nwine_test['mso2']= wine_test['free sulfur dioxide']/(1+10**(wine['pH']-1.8))\n\n#alcが平均よりも高いサンプルに1，ほか0\nwine['alc_ave_upper'] = wine['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)\nwine_test['alc_ave_upper'] = wine_test['alcohol'].apply(lambda x : 1 if x >= wine.alcohol.mean() else 0)\n"

In [60]:
def outlier_iqr(df):
    for i in range(len(df.columns)):
        col = df.iloc[:,i]

        q1 = col.describe()['25%']
        q3 = col.describe()['75%']
        iqr = q3 - q1 #四分位範囲

        # 外れ値の基準点
        outlier_min = q1 - (iqr) * 2.3
        outlier_max = q3 + (iqr) * 2.3

        # 範囲から外れている値を除く
        col[col < outlier_min] = None
        col[col > outlier_max] = None

    return df

In [61]:
outlier_iqr(wine).isnull().sum()

fixed acidity            7
volatile acidity         4
citric acid              0
chlorides               58
free sulfur dioxide      8
total sulfur dioxide    10
density                  2
pH                       5
sulphates               26
alcohol                  1
quality                  1
dtype: int64

In [62]:
wine = outlier_iqr(wine).dropna(how = 'any', axis = 0)

モデル構築

In [63]:
y = wine.quality
X = wine.drop(['quality'],axis = 1)
X_test = wine_test.iloc[:, 0:].values

#split test data
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size=0.3, random_state = 42)

In [64]:
#scaling

scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled = scaler.transform(test_x)
X_test_scaled = scaler.transform(X_test)

In [65]:
models = []
models.append(('RandomForest', RandomForestRegressor()))
models.append(('GradienBoost', GradientBoostingRegressor()))
models.append(('SVR', SVR()))
models.append(('Linear', linear_model.LinearRegression()))
names = []

In [66]:
for name,model in models:
    kfold = model_selection.KFold(n_splits=5,random_state=42)
    cv_results = model_selection.cross_val_score(model,train_x_scaled,train_y, cv= kfold, scoring = 'neg_mean_squared_error')
    names.append(name)
    msg  = "%s: %f" % (name, -1*(cv_results).mean())
    print(msg)

RandomForest: 0.803759
GradienBoost: 0.744312
SVR: 0.734105
Linear: 0.725190


In [67]:
model = SVR()
model.fit(train_x_scaled,train_y)
pred_y = model.predict(test_x_scaled)

In [68]:
mean_squared_error(pred_y,test_y)

0.7039403845009782

In [13]:
def svr_optimaze(trial):
    svr_c = trial.suggest_loguniform('svr_c', 1e0, 1e2)
    epsilon = trial.suggest_loguniform('epsilon', 1e-1, 1e1)
    
    svr = SVR(C=svr_c, epsilon=epsilon)
    svr.fit(train_x, train_y)
    
    y_pred = svr.predict(test_x)
    return mean_squared_error(test_y, y_pred)

In [109]:
'''
# optuna
study = optuna.create_study()
study.optimize(svr_optimaze, n_trials=100)

# 最適解
print(study.best_params)
print(study.best_value)
print(study.best_trial)
'''
#'svr_c': 1.003458214866715, 'epsilon': 0.7458180594811398

[I 2020-01-30 12:06:54,461] Finished trial#0 resulted in value: 0.9253846153846153. Current best value is 0.9253846153846153 with parameters: {'svr_c': 8.358568062992761, 'epsilon': 7.534131026758865}.
[I 2020-01-30 12:06:54,657] Finished trial#1 resulted in value: 1.1457466630057767. Current best value is 0.9253846153846153 with parameters: {'svr_c': 8.358568062992761, 'epsilon': 7.534131026758865}.
[I 2020-01-30 12:06:54,915] Finished trial#2 resulted in value: 1.2588309928676586. Current best value is 0.9253846153846153 with parameters: {'svr_c': 8.358568062992761, 'epsilon': 7.534131026758865}.
[I 2020-01-30 12:06:54,989] Finished trial#3 resulted in value: 0.9253846153846153. Current best value is 0.9253846153846153 with parameters: {'svr_c': 8.358568062992761, 'epsilon': 7.534131026758865}.
[I 2020-01-30 12:06:55,056] Finished trial#4 resulted in value: 0.9253846153846154. Current best value is 0.9253846153846153 with parameters: {'svr_c': 8.358568062992761, 'epsilon': 7.53413102

{'svr_c': 1.000588153385211, 'epsilon': 0.6389110614945105}
0.8120606169227546
FrozenTrial(number=93, value=0.8120606169227546, datetime_start=datetime.datetime(2020, 1, 30, 12, 7, 5, 537537), datetime_complete=datetime.datetime(2020, 1, 30, 12, 7, 5, 656102), params={'svr_c': 1.000588153385211, 'epsilon': 0.6389110614945105}, distributions={'svr_c': LogUniformDistribution(high=100.0, low=1.0), 'epsilon': LogUniformDistribution(high=10.0, low=0.1)}, user_attrs={}, system_attrs={'_number': 93}, intermediate_values={}, trial_id=93, state=TrialState.COMPLETE)


In [110]:
model1 = SVR(C=1.000588153385211,
             epsilon=0.6389110614945105)
model2 = GradientBoostingRegressor(learning_rate=0.1,
                                   max_depth=4,
                                   max_features=1.0,
                                   min_samples_leaf=3,
                                   random_state=42)
model3 = RandomForestRegressor(random_state=42)
model4 = linear_model.LinearRegression()

model1.fit(train_x_scaled,train_y)
model2.fit(train_x_scaled,train_y)
model3.fit(train_x_scaled,train_y)
model4.fit(train_x_scaled,train_y)

svr_pred = model1.predict(test_x_scaled)
gbr_pred = model2.predict(test_x_scaled)
rfr_pred = model3.predict(test_x_scaled)
lin_pred = model4.predict(test_x_scaled)

pred_y = (svr_pred + gbr_pred + lin_pred) / 3

mean_squared_error(pred_y,test_y)

0.6583663376615313

In [108]:
sub = np.round((model1.predict(X_test_scaled) + model2.predict(X_test_scaled)+model4.predict(X_test_scaled))/3,1)
submission = pd.read_csv("model/submission.csv")
submission['quality'] = sub
submission.to_csv('model/svr_gbr_lin_removeSugar_round.csv', index=False)


In [None]:
0.6583663376615313