# 資料探勘：紅酒品質預測

dataset description: https://archive.ics.uci.edu/ml/datasets/wine+quality

### Polynomial Regression  

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

## 使用pandas讀取 winequality-red.csv
df = pd.read_csv('./dataset/winequality-red.csv')

## 定義特徵向量X以及label y
y = df['quality']
X = df.drop('quality', axis = 1)    # quality 以外的 column 叫 X

## 使用PolynomialFeatures將維度提高(假設degree=2)
poly = PolynomialFeatures(degree = 2).fit(X)
X = poly.transform(X)

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

## 標準化訓練資料
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

## 標準化測試資料
X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

# The coefficients
print('Coefficients:\n', model.coef_)
print('')
# The mean squared error
print('Mean squared error:', mean_squared_error(y_test, y_pred))

# Explained variance score: 1 is perfect prediction
print('R2 score:', r2_score(y_test, y_pred))


Coefficients:
 [-2.20439093e-11 -3.32036477e+01 -3.46762494e+01 -1.98569253e+01
 -1.40652656e+01 -6.85273381e+01 -7.74114997e+01  1.00727869e+02
 -2.50880009e+01 -7.93478035e+01  4.63945174e+01 -6.70038809e+00
 -1.07912073e+00 -4.28103408e-01 -1.80118094e-01 -5.27741073e-01
 -9.30638006e-01 -6.23191815e-01  5.34638897e-01  3.69394013e+01
 -1.63849368e+00  6.25798078e-01  6.86240625e-02 -1.07543978e-01
  3.25708843e-02 -6.49414077e-02  2.15989372e-01 -5.66509424e-02
  3.43575308e-01  3.51401508e+01 -1.05497188e+00 -2.55960941e-01
  8.98178254e-01 -3.09937110e-02  9.34787505e-02  8.98840857e-02
  1.48808583e-01 -3.20881226e-02  2.21970228e+01 -3.13608092e+00
 -3.12173913e-01  1.11046257e+00 -1.59627758e-01 -2.03532456e-01
  3.43111110e-02 -6.51049711e-04  1.79109779e+01 -2.69409190e+00
  1.29726359e-02 -3.65247752e-01  6.44755908e-02 -5.20131174e-02
 -1.18159029e-01  6.92374227e+01 -5.56852862e-01  1.09112468e-01
  5.56498217e-01 -1.63840021e-01 -3.70231215e-02  7.79292790e+01
 -5.470204

### Use Lasso to do feature selection first and then do polynomial regression

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


df = pd.read_csv('./dataset/winequality-red.csv')

y = df['quality']
X = df.drop(['quality'], axis=1)

print('X shape: {}'.format(X.shape))
print('y shape: {}\n'.format(y.shape))

## 把 X 做 normalization
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X)

## 建立Lasso模型(假設alpha=0.1)
model = linear_model.Lasso(alpha = 0.1)

# Train the model using the training sets
model.fit(X, y)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))

## 顯示出有多少個係數不為 0
print('number of model coef:{}'.format(np.sum(model.coef_ != 0 )))


X shape: (1599, 11)
y shape: (1599,)

Coefficients: [ 0.         -0.15459205  0.          0.         -0.         -0.
 -0.         -0.         -0.          0.03926141  0.24947033]

number of model coef:3


In [2]:
print('before feature extraction\nX shape: {}\n'.format(X.shape))
print(X.head())


before feature extraction
X shape: (1599, 11)

         0         1         2         3         4         5         6   \
0 -0.528360  0.961877 -1.391472 -0.453218 -0.243707 -0.466193 -0.379133   
1 -0.298547  1.967442 -1.391472  0.043416  0.223875  0.872638  0.624363   
2 -0.298547  1.297065 -1.186070 -0.169427  0.096353 -0.083669  0.229047   
3  1.654856 -1.384443  1.484154 -0.453218 -0.264960  0.107592  0.411500   
4 -0.528360  0.961877 -1.391472 -0.453218 -0.243707 -0.466193 -0.379133   

         7         8         9         10  
0  0.558274  1.288643 -0.579207 -0.960246  
1  0.028261 -0.719933  0.128950 -0.584777  
2  0.134264 -0.331177 -0.048089 -0.584777  
3  0.664277 -0.979104 -0.461180 -0.584777  
4  0.558274  1.288643 -0.579207 -0.960246  


In [3]:
mask = model.coef_ != 0
print(mask)
print('')

## 使用mask將係數為0對應的特徵剔除
X = X.iloc[:, mask]

print('after feature extraction\nX shape: {}\n'.format(X.shape))
print(X.head())


[False  True False False False False False False False  True  True]

after feature extraction
X shape: (1599, 3)

         1         9         10
0  0.961877 -0.579207 -0.960246
1  1.967442  0.128950 -0.584777
2  1.297065 -0.048089 -0.584777
3 -1.384443 -0.461180 -0.584777
4  0.961877 -0.579207 -0.960246


In [4]:
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

## 將抽取過後的特徵做PolynomialFeatures提升維度(假設degree=2)
poly = PolynomialFeatures(degree = 2).fit(X)
X_poly = poly.transform(X)

X_train,X_test,y_train,y_test=train_test_split(X_poly, y, test_size = 0.3, random_state = 1) #random_state 種子值

# 標準化訓練資料
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# 標準化測試資料
X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))

# The mean squared error
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)))

# Explained variance score: 1 is perfect prediction
print('R2 score: {}'.format(r2_score(y_test, y_pred)))


Coefficients: [ 0.         -0.1979168   0.22973068  0.33944012 -0.02937768 -0.02456795
 -0.0200283  -0.14557193  0.05641426 -0.04929108]

Mean squared error: 0.4059700497739832
R2 score: 0.3329898562520004
