In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline

train = pd.read_csv("./train.csv")
features = train.drop(columns="quality")
print(features.keys())
keys = ["type", "fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']

dim = len(keys)
X_label = np.ndarray(shape=(dim * 3 + 1,len(features)),dtype=float)
for i in range(len(features)):
    X_label[0][i] = 1.0
for d, key in enumerate(keys):
    X_label[d + 1] = [x if x != "red" and x != "white" else (1.0 if x == "red" else 0.0) for x in features[key].values]
for d, key in enumerate(keys):
    X_label[dim + d + 1] = [x * x if x != "red" and x != "white" else (1.0 if x == "red" else 0.0) for x in features[key].values]
for d, key in enumerate(keys):
    X_label[2 * dim + d + 1] = [x * x * x if x != "red" and x != "white" else (1.0 if x == "red" else 0.0) for x in features[key].values]

Y_label = train["quality"].values
Y_label = Y_label.reshape((1,len(Y_label)))

X_label = X_label.transpose() # train_test_splitするために転置を取る
Y_label = Y_label.transpose() # train_test_splitするために転置を取る

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype='object')


In [66]:
X_train, X_test, Y_train, Y_test = train_test_split(X_label, Y_label,test_size=0.4655,random_state=212)

# print(X_train.shape, X_train)
# print(Y_train.shape, Y_train)
# print(X_test.shape, X_test)
# print(Y_test.shape, Y_test)

X_avg = [0] * dim * 3
for x in X_train:
    for i in range(dim):
        if not np.isnan(x[i]):
            X_avg[i] += x[i]

for i in range(dim):
    X_avg[i] /= len(X_train)
for i in range(dim):
    X_avg[i + dim] = X_avg[i] * X_avg[i]
for i in range(dim):
    X_avg[i + dim * 2] = X_avg[i + dim] * X_avg[i]

for x in X_train:
    for i in range(dim * 3):
        if np.isnan(x[i + 1]):
            x[i + 1] = X_avg[i]

X_train = X_train.transpose()
# X_test = X_test.transpose()
Y_train = Y_train.transpose()
# Y_test = Y_test.transpose()

B = X_train.dot(Y_train.T)
A = X_train.dot(X_train.T)
# B = np.where(np.isnan(X_train), 0, X_train).dot(np.where(np.isnan(Y_train.T), 0, Y_train.T))
# A = np.where(np.isnan(X_train), 0, X_train).dot(np.where(np.isnan(X_train.T), 0, X_train.T))

L = 0.1 # 正則化の程度を決めるハイパーパラメータ
LI = np.identity(dim * 3 + 1) * L
LI[0][0] = 0

AT = np.linalg.inv(A + LI)
w = AT.dot(B)
print(w)

[[ 2.35760643e+01]
 [ 5.65599116e-02]
 [ 2.50644013e-02]
 [-2.80606512e+00]
 [ 6.85314853e-01]
 [ 1.66159393e-02]
 [-6.61511693e-01]
 [ 3.38291089e-02]
 [-1.24064945e-02]
 [-5.36357282e-01]
 [-2.37552913e+00]
 [ 8.85279316e-01]
 [-3.96275198e+00]
 [ 5.65598101e-02]
 [-5.53761580e-03]
 [ 1.52480810e+00]
 [-1.72394814e+00]
 [ 3.45139630e-03]
 [ 1.17610397e-02]
 [-4.47921643e-04]
 [ 7.96324559e-05]
 [-1.05937049e+00]
 [ 1.08677443e+00]
 [-2.58592222e-01]
 [ 3.63820854e-01]
 [ 5.65598612e-02]
 [ 3.90483493e-04]
 [-1.54279655e-01]
 [ 6.50210740e-01]
 [-2.06571563e-04]
 [ 1.84530741e-02]
 [ 1.34827675e-06]
 [-1.78942970e-07]
 [-1.56919377e+00]
 [-1.47053836e-01]
 [-6.90814791e-03]
 [-1.01330438e-02]]


In [67]:
Y_test_pred = np.array([sum((x[i] if not np.isnan(x[i]) else X_avg[i-1])*w[i] for i in range(len(x)))[0] for x in X_test])
print('MSE test data: ', mean_squared_error(Y_test, Y_test_pred))

MSE test data:  1.24933691137297


In [71]:
# output prediction for the test data
test = pd.read_csv("./test.csv")

In [69]:
submission = pd.read_csv("./sample_submission.csv", index_col="id")
submission["quality"] = [w[0][0] + sum(w[i+1][0] * ((test[key][n] if not np.isnan(test[key][n]) else X_avg[i]) if key != "type" else (1.0 if test[key][n] == "red" else 0.0)) for i, key in enumerate(keys)) for n in range(len(test))]
submission.to_csv("submission.csv")