In [27]:
import numpy as np

# 1. Data Load

In [249]:
data = open('data/winequality-red.csv', mode='rt')
dataset = []
labels = []

In [250]:
print(data.readline().strip().split(';'))
for i in range(1599):
    temp = data.readline().strip().split(';')
    dataset.append(temp[:11])
    labels.append(temp[-1])
print(np.shape(dataset))
print(np.shape(labels))
data.close()

['"fixed acidity"', '"volatile acidity"', '"citric acid"', '"residual sugar"', '"chlorides"', '"free sulfur dioxide"', '"total sulfur dioxide"', '"density"', '"pH"', '"sulphates"', '"alcohol"', '"quality"']


In [255]:
dataset = np.array(dataset, dtype=np.float32)
labels = np.array(labels, dtype=np.int32)

In [256]:
print(dataset.shape)
print(labels.shape)

(1599, 11)
(1599,)


In [257]:
train_x = dataset[:1232].copy()
train_y = labels[:1232].copy()
test_x = dataset[1232:].copy()
test_y = labels[1232:].copy()

In [258]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(1232, 11)
(1232,)
(367, 11)
(367,)


## 1.1 Standard Normalize 

In [259]:
print(train_x[0])

train_x_std = (train_x - np.mean(train_x, axis=0)) / np.std(train_x, axis=0)

print(train_x_std[0])

[ 7.4     0.7     0.      1.9     0.076  11.     34.      0.9978  3.51
  0.56    9.4   ]
[-0.68604493  1.0223137  -1.5007055  -0.5221455  -0.27282238 -0.42211187
 -0.37999126  0.40999055  1.3628328  -0.6086837  -0.9083458 ]


In [260]:
print(test_x[0])

test_x_std = (test_x - np.mean(train_x, axis=0)) / np.std(train_x, axis=0)

print(test_x_std[0])

[ 7.6      0.43     0.29     2.1      0.075   19.      66.       0.99718
  3.4      0.64     9.5    ]
[-0.57341063 -0.4893766  -0.02046837 -0.36273965 -0.29374978  0.3597589
  0.56779927  0.07982707  0.6549689  -0.15118532 -0.81657326]


## 1.2 MinMax Scaling

In [323]:
print(train_x[0])

train_x_mm = (train_x - np.min(train_x, axis=0)) / (np.max(train_x, axis=0) - np.min(train_x, axis=0))

print(train_x_mm[0])

[ 7.4     0.7     0.      1.9     0.076  11.     34.      0.9978  3.51
  0.56    9.4   ]
[0.24778764 0.47933882 0.         0.06849315 0.10684474 0.14925373
 0.09893993 0.58872557 0.663793   0.13772455 0.15384616]


In [324]:
print(test_x[0])

test_x_mm = (test_x - np.min(train_x, axis=0)) / (np.max(train_x, axis=0) - np.min(train_x, axis=0))

print(test_x_mm[0])

[ 7.6      0.43     0.29     2.1      0.075   19.      66.       0.99718
  3.4      0.64     9.5    ]
[0.26548675 0.25619835 0.29       0.08219177 0.1051753  0.26865673
 0.21201414 0.54150516 0.56896555 0.18562873 0.16923083]


# 2. Model Init

In [308]:
class Regression():
    def __init__(self):
        print('Logistic Regression Model Create')
    # weight와 bias를 초기화해주는 함수
    def init_model(self):
        global weight, bias

        weight = np.random.normal(0, 0.005, [11, 1])
        bias = np.zeros([1])
    # 학습을 하는 함수
    def train(self, epoch_count, mb_size, report):
        step_count = self.arrange_data(mb_size)
        for epoch in range(epoch_count):
            losses, mses = [], []
            for n in range(step_count):
                train_x, train_y = self.get_train_data(mb_size, n)
                loss, mse = self.run_train(train_x, train_y)
                losses.append(loss)
                mses.append(mse)
            if report > 0 and (epoch + 1) % report == 0:
                print('Epoch {}: loss={:5.3f}, mse={:5.3f}'.format(epoch + 1, np.mean(losses), np.mean(mses)))

    # 배치 사이즈를 통해 미니 배치 개수를 정하는 함수
    def arrange_data(self, mb_size):
        global data, lablel_data, shuffle_map

        shuffle_map = np.arange(data.shape[0])
        np.random.shuffle(shuffle_map)

        step_count = int(data.shape[0] ) // mb_size

        return step_count

    # Train 데이터셋에서 shuffle map을 인덱스로 활용하여 epoch마다 랜덤하게 학습할 수 있도록 미니 배치 데이터셋을 생성한다.
    def get_train_data(self, mb_size, nth):
        global data, label_data, shuffle_map

        if nth == 0:
            np.random.shuffle(shuffle_map[:])

        label_data = np.reshape(label_data, (-1,1))
        train_data = data[shuffle_map[mb_size * nth:mb_size * (nth + 1)]]
        train_label = label_data[shuffle_map[mb_size * nth:mb_size * (nth + 1)]]

        return train_data, train_label

    # 테스트 데이터를 설정해주는 함수
    def get_test_data():
        global data, shuffle_map, test_begin_idx, output_cnt

        test_data = data[shuffle_map[test_begin_idx:]]

        return test_data[:, :-output_cnt], test_data[:, -output_cnt:]

    # 순전파부터 손실함수, 정확도를 구하고 역전파를 통해 weight와 bias를 업데이트하는 함수
    def run_train(self, x, y):
        output, aux_nn = self.forward_neuralnet(x)
        loss, aux_pp = self.forward_postproc(output, y)
        mse = self.evaluate(output, y)

        G_loss = 1.0
        G_output = self.backprop_postproc(G_loss, aux_pp)
        self.backprop_neuralnet(G_output, aux_nn)

        return loss, mse

    # 테스트를 실행해주는 함수
    def run_test(self, x, y):
        output, _ = forward_neuralnet(x)
        accuracy = eval_accuracy(output, y)

        return accuracy

    # 순전파를 처리하는 함수로 y=wx + b
    def forward_neuralnet(self, x):
        global weight, bias
        
        output = np.matmul(x, weight) + bias

        return output, x

    # loss를 계산하는 함수
    def forward_postproc(self, output, y):
        diff = output - y
        square = np.square(diff)
        loss = np.mean(square)

        return loss, diff

    # weight와 bias를 업데이트
    def backprop_neuralnet(self, G_output, x):
        global weight, bias
        
        g_output_w = x.transpose()
        G_w = np.matmul(g_output_w, G_output)
        G_b = np.sum(G_output, axis=0)
        weight -= learningrate * G_w
        bias -= learningrate * G_b

    # Output의 손실기울기값을 구하는 함수
    def backprop_postproc(self, G_loss, diff):
        shape = diff.shape

        g_loss_square = np.ones(shape) / np.prod(shape)
        g_square_diff = 2 * diff
        g_diff_output = 1

        G_square = g_loss_square * G_loss
        G_diff = g_square_diff * G_square
        G_output = g_diff_output * G_diff

        return G_output
    
    # MSE를 활용한 평가 
    def evaluate(self, output, y):
        mse = np.sum((y - output)**2) / len(output)

        return mse

    # 학습을 시작하는 함수
    def start(self, epoch_count=50, mb_size=150, report=1, dataset=[], label=[]):
        global data, label_data

        data = dataset
        label_data = label

        self.init_model()
        self.train(epoch_count, mb_size, report)

    def return_param(self):
        return weight, bias

# 3. Training

In [309]:
np.random.seed(1234)
learningrate = 0.0005
epoch = 5000
batch_size = 1000
report = 100

In [310]:
LR = Regression()
LR.start(epoch, batch_size, report, train_x_std, train_y)

Logistic Regression Model Create
Epoch 100: loss=26.966, mse=26.966
Epoch 200: loss=22.056, mse=22.056
Epoch 300: loss=18.200, mse=18.200
Epoch 400: loss=15.039, mse=15.039
Epoch 500: loss=12.392, mse=12.392
Epoch 600: loss=10.145, mse=10.145
Epoch 700: loss=8.327, mse=8.327
Epoch 800: loss=6.991, mse=6.991
Epoch 900: loss=5.747, mse=5.747
Epoch 1000: loss=4.780, mse=4.780
Epoch 1100: loss=4.071, mse=4.071
Epoch 1200: loss=3.401, mse=3.401
Epoch 1300: loss=2.802, mse=2.802
Epoch 1400: loss=2.368, mse=2.368
Epoch 1500: loss=2.081, mse=2.081
Epoch 1600: loss=1.686, mse=1.686
Epoch 1700: loss=1.451, mse=1.451
Epoch 1800: loss=1.281, mse=1.281
Epoch 1900: loss=1.128, mse=1.128
Epoch 2000: loss=1.027, mse=1.027
Epoch 2100: loss=0.871, mse=0.871
Epoch 2200: loss=0.819, mse=0.819
Epoch 2300: loss=0.722, mse=0.722
Epoch 2400: loss=0.677, mse=0.677
Epoch 2500: loss=0.620, mse=0.620
Epoch 2600: loss=0.582, mse=0.582
Epoch 2700: loss=0.560, mse=0.560
Epoch 2800: loss=0.532, mse=0.532
Epoch 2900: 

# 4. Testset Evaluate

In [311]:
trained_weight, trained_bias = LR.return_param()
print(trained_weight, trained_bias)

[[ 0.06474432]
 [-0.17244627]
 [-0.00340037]
 [ 0.04602996]
 [-0.08642884]
 [ 0.02651111]
 [-0.12867671]
 [-0.09104238]
 [-0.02458762]
 [ 0.13889926]
 [ 0.27652147]] [5.63630294]


In [313]:
predictions = []
for idx in range(len(test_x_std)):
    predict = np.matmul(test_x_std[idx], trained_weight) + trained_bias
    predictions.append(predict[0])
predictions = np.array(predictions)

In [314]:
mse = np.sum((test_y - predictions)**2) / len(predictions)
rmse = np.sqrt(mse)
mae = np.sum(np.abs(test_y - predictions)) / len(predictions)
mape = mae * 100
print(mse, rmse, mae, mape)

0.4619600607212303 0.6796764382566386 0.5100464586475405 51.00464586475405


MinMax

In [330]:
learningrate = 0.001
epoch = 50000
batch_size = 1000
report = 1000

In [331]:
LR = Regression()
LR.start(epoch, batch_size, report, train_x_mm, train_y)

Logistic Regression Model Create
Epoch 1000: loss=0.776, mse=0.776
Epoch 2000: loss=0.734, mse=0.734
Epoch 3000: loss=0.667, mse=0.667
Epoch 4000: loss=0.662, mse=0.662
Epoch 5000: loss=0.656, mse=0.656
Epoch 6000: loss=0.592, mse=0.592
Epoch 7000: loss=0.604, mse=0.604
Epoch 8000: loss=0.570, mse=0.570
Epoch 9000: loss=0.581, mse=0.581
Epoch 10000: loss=0.541, mse=0.541
Epoch 11000: loss=0.540, mse=0.540
Epoch 12000: loss=0.553, mse=0.553
Epoch 13000: loss=0.511, mse=0.511
Epoch 14000: loss=0.515, mse=0.515
Epoch 15000: loss=0.519, mse=0.519
Epoch 16000: loss=0.493, mse=0.493
Epoch 17000: loss=0.524, mse=0.524
Epoch 18000: loss=0.513, mse=0.513
Epoch 19000: loss=0.484, mse=0.484
Epoch 20000: loss=0.474, mse=0.474
Epoch 21000: loss=0.480, mse=0.480
Epoch 22000: loss=0.496, mse=0.496
Epoch 23000: loss=0.491, mse=0.491
Epoch 24000: loss=0.474, mse=0.474
Epoch 25000: loss=0.484, mse=0.484
Epoch 26000: loss=0.464, mse=0.464
Epoch 27000: loss=0.483, mse=0.483
Epoch 28000: loss=0.464, mse=0.

In [332]:
trained_weight, trained_bias = LR.return_param()
print(trained_weight, trained_bias)

[[ 0.76197825]
 [-0.66820886]
 [ 0.39257592]
 [ 0.05230968]
 [-0.07431078]
 [ 0.09000612]
 [-0.53036151]
 [ 0.00585235]
 [ 1.03607366]
 [ 1.02441618]
 [ 2.16745029]] [4.18753271]


In [333]:
predictions = []
for idx in range(len(test_x_mm)):
    predict = np.matmul(test_x_mm[idx], trained_weight) + trained_bias
    predictions.append(predict[0])
predictions = np.array(predictions)

In [334]:
mse = np.sum((test_y - predictions)**2) / len(predictions)
rmse = np.sqrt(mse)
mae = np.sum(np.abs(test_y - predictions)) / len(predictions)
mape = mae * 100
print(mse, rmse, mae, mape)

0.4964977568487878 0.7046259694680489 0.5237623574220684 52.376235742206845


Result  

Best Custom Regression Model MSE : 0.4620  
Best Preprocessing : Standard Scaling