In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("./train.csv")
features = train.drop(columns="quality")
print(features.keys())
keys = ["type", "fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype='object')


In [3]:
dim = len(keys)
X_label = np.ndarray(shape=(dim + 1,len(features)),dtype=float)
for i in range(len(features)):
    X_label[0][i] = 1.0
for d, key in enumerate(keys):
    X_label[d+1] = [x if x != "red" and x != "white" else (1.0 if x == "red" else 0.0) for x in features[key].values]

Y_label = train["quality"].values
Y_label = Y_label.reshape((1,len(Y_label)))

X_label = X_label.transpose() # train_test_splitするために転置を取る
Y_label = Y_label.transpose() # train_test_splitするために転置を取る

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X_label, Y_label,test_size=0.4655,random_state=212)

# print(X_train.shape, X_train)
# print(Y_train.shape, Y_train)
# print(X_test.shape, X_test)
# print(Y_test.shape, Y_test)

X_avg = [0] * dim
for x in X_train:
    for i in range(dim):
        if not np.isnan(x[i]):
            X_avg[i] += x[i]
for i in range(dim):
    X_avg[i] /= len(X_train)

for x in X_train:
    for i in range(dim):
        if np.isnan(x[i+1]):
            x[i+1] = X_avg[i]

X_train = X_train.transpose()
# X_test = X_test.transpose()
Y_train = Y_train.transpose()
# Y_test = Y_test.transpose()

In [5]:
B = X_train.dot(Y_train.T)
A = X_train.dot(X_train.T)

L = 0.00001 # 正則化の程度を決めるハイパーパラメータ
LI = np.identity(dim + 1) * L
LI[0][0] = 0

AT = np.linalg.inv(A + LI)
w = AT.dot(B)
print(w)

[[ 6.64473219e+01]
 [ 6.50740910e-02]
 [ 6.28857558e-02]
 [-6.13470179e-01]
 [ 9.44054603e-02]
 [ 4.37201402e-02]
 [-1.41208375e-01]
 [ 5.37504265e-03]
 [-1.45523370e-03]
 [-6.53666033e+01]
 [ 2.94521504e-01]
 [ 3.41684780e-01]
 [ 2.61455058e-01]]


In [6]:
print(len(X_avg), len(X_test[0]))
Y_test_pred = np.array([sum((x[i] if not np.isnan(x[i]) else X_avg[i-1])*w[i] for i in range(len(x)))[0] for x in X_test])
print('MSE test data: ', mean_squared_error(Y_test, Y_test_pred))

12 13
MSE test data:  0.6127319068778668


In [7]:
# output prediction for the test data
test = pd.read_csv("./test.csv")

In [8]:
submission = pd.read_csv("./sample_submission.csv", index_col="id")
submission["quality"] = [w[0][0] + sum(w[i+1][0] * ((test[key][n] if not np.isnan(test[key][n]) else X_avg[i]) if key != "type" else (1.0 if test[key][n] == "red" else 0.0)) for i, key in enumerate(keys)) for n in range(len(test))]
submission.to_csv("submission.csv")