<a href="https://colab.research.google.com/github/nogood1014/2020-2-AI/blob/main/code/09W_2_reg_mpg_tuto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as pt  # original alias, kept so any existing `pt.` usage keeps working
import pandas as pd
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Download the Auto MPG data file through Keras and print the local path it was saved to.
dataset_path = keras.utils.get_file(
    fname="auto-mpg.data",
    origin="http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
)
print(dataset_path)

/root/.keras/datasets/auto-mpg.data


In [None]:
# Read the data file; raw_data stays untouched and `dataset` is the working copy.
col_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'weight', 'Acceleration', 'Model Year', 'Origin',
]
raw_data = pd.read_csv(
    dataset_path,
    names=col_names,
    na_values="?",          # "?" marks a missing value in this file
    comment='\t',           # ignore the remainder of each line after a tab
    sep=" ",
    skipinitialspace=True,
)
dataset = raw_data.copy()
dataset.tail(10)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,weight,Acceleration,Model Year,Origin
388,26.0,4,156.0,92.0,2585.0,14.5,82,1
389,22.0,6,232.0,112.0,2835.0,14.7,82,1
390,32.0,4,144.0,96.0,2665.0,13.9,82,3
391,36.0,4,135.0,84.0,2370.0,13.0,82,1
392,27.0,4,151.0,90.0,2950.0,17.3,82,1
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


In [None]:
# Show the (rows, columns) shape of the freshly loaded table.
dataset.shape

(398, 8)

In [None]:
# Data cleaning: count the missing values in each column.
dataset.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [None]:
# Drop every row that has at least one missing value, then re-check the shape.
dataset = dataset.dropna()
dataset.shape

(392, 8)

In [None]:
# Take the 'Origin' column out of the feature table and keep it in `origin`.
# pop() is avoided on purpose: it mutates `dataset` in place, so running this cell a
# second time raises KeyError. Reading the codes from raw_data (restricted to the rows
# that survived dropna) and dropping with errors='ignore' keeps the cell re-runnable.
origin = raw_data.loc[dataset.index, 'Origin']
dataset = dataset.drop(columns='Origin', errors='ignore')
origin

KeyError: ignored

In [None]:
# Display the table to confirm that the 'Origin' column is no longer part of it.
dataset

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,weight,Acceleration,Model Year
0,18.0,8,307.0,130.0,3504.0,12.0,70
1,15.0,8,350.0,165.0,3693.0,11.5,70
2,18.0,8,318.0,150.0,3436.0,11.0,70
3,16.0,8,304.0,150.0,3433.0,12.0,70
4,17.0,8,302.0,140.0,3449.0,10.5,70
...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82
394,44.0,4,97.0,52.0,2130.0,24.6,82
395,32.0,4,135.0,84.0,2295.0,11.6,82
396,28.0,4,120.0,79.0,2625.0,18.6,82


In [None]:
# 'Origin' is a categorical code rather than a numeric quantity,
# so it is expanded into one 0.0/1.0 indicator column per region (one-hot encoding).
for code, country in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[country] = (origin == code) * 1.0
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,weight,Acceleration,Model Year,USA,Europe,Japan
393,27.0,4,140.0,86.0,2790.0,15.6,82,1.0,0.0,0.0
394,44.0,4,97.0,52.0,2130.0,24.6,82,0.0,1.0,0.0
395,32.0,4,135.0,84.0,2295.0,11.6,82,1.0,0.0,0.0
396,28.0,4,120.0,79.0,2625.0,18.6,82,1.0,0.0,0.0
397,31.0,4,119.0,82.0,2720.0,19.4,82,1.0,0.0,0.0


In [None]:
# Split the data set into a training set and a test set.
# 80% of the rows, sampled with a fixed seed, are used for training.
train_dataset = dataset.sample(frac=0.8, random_state=0)
# The remaining 20% of the rows are used for testing.
test_dataset = dataset.drop(train_dataset.index)

# Preview the first rows of each split instead of printing both frames in full.
display(train_dataset.head(), test_dataset.head())

      MPG  Cylinders  Displacement  ...  weight  Acceleration  Model Year
146  28.0          4          90.0  ...  2125.0          14.5          74
282  22.3          4         140.0  ...  2890.0          17.3          79
69   12.0          8         350.0  ...  4456.0          13.5          72
378  38.0          4         105.0  ...  2125.0          14.7          82
331  33.8          4          97.0  ...  2145.0          18.0          80
..    ...        ...           ...  ...     ...           ...         ...
281  19.8          6         200.0  ...  2990.0          18.2          79
229  16.0          8         400.0  ...  4220.0          11.1          77
150  26.0          4         108.0  ...  2391.0          15.5          74
145  32.0          4          83.0  ...  2003.0          19.0          74
182  28.0          4         107.0  ...  2464.0          15.5          76

[314 rows x 7 columns]
      MPG  Cylinders  Displacement  ...  weight  Acceleration  Model Year
9    15.0     

In [None]:
# 80/20 split with a fixed seed; the test set is every row that was not sampled for training.
train_dataset = dataset.sample(random_state=0, frac=0.8)
test_dataset = dataset.drop(index=train_dataset.index)

# Report (rows, columns) of each split.
print(train_dataset.shape, test_dataset.shape)

(314, 10) (78, 10)


In [None]:
# Check the overall summary statistics of the training set.
# describe() returns one row per statistic (count, mean, std, ...) and one column per feature.
train_stats = train_dataset.describe()
print(train_stats)

              MPG   Cylinders  Displacement  ...         USA      Europe       Japan
count  314.000000  314.000000    314.000000  ...  314.000000  314.000000  314.000000
mean    23.310510    5.477707    195.318471  ...    0.624204    0.178344    0.197452
std      7.728652    1.699788    104.331589  ...    0.485101    0.383413    0.398712
min     10.000000    3.000000     68.000000  ...    0.000000    0.000000    0.000000
25%     17.000000    4.000000    105.500000  ...    0.000000    0.000000    0.000000
50%     22.000000    4.000000    151.000000  ...    1.000000    0.000000    0.000000
75%     28.950000    8.000000    265.750000  ...    1.000000    0.000000    0.000000
max     46.600000    8.000000    455.000000  ...    1.000000    1.000000    1.000000

[8 rows x 10 columns]


In [None]:
# Separate the regression target ('MPG') from the input features.
# pop() is avoided on purpose: it mutates the frames in place, so running this cell a
# second time raises KeyError. The labels are read from `dataset` (which keeps its 'MPG'
# column) and the column is dropped with errors='ignore', so the cell can be re-run safely.
train_labels = dataset.loc[train_dataset.index, 'MPG']
test_labels = dataset.loc[test_dataset.index, 'MPG']
train_dataset = train_dataset.drop(columns='MPG', errors='ignore')
test_dataset = test_dataset.drop(columns='MPG', errors='ignore')

KeyError: ignored

In [None]:
def norm(x, stats=None):
    """Standardize each column of ``x`` as ``(x - mean) / std``.

    Args:
        x: DataFrame of features to normalize.
        stats: Output of ``DataFrame.describe()`` (statistics as rows, features as
            columns) supplying the ``mean`` and ``std`` rows. Defaults to the
            notebook-level ``train_stats``.

    Returns:
        DataFrame with the same index and columns as ``x``.

    Raises:
        KeyError: if ``stats`` has no statistics for one of the columns of ``x``.
    """
    if stats is None:
        stats = train_stats
    # describe() keeps 'mean' and 'std' in the index, so they must be selected with .loc
    # (stats['mean'] would be a column lookup and raise KeyError).
    # Restricting to x.columns keeps statistics of columns that x does not have
    # (e.g. the label) from being aligned in as an all-NaN column.
    mean = stats.loc['mean', x.columns]
    std = stats.loc['std', x.columns]
    return (x - mean) / std

# Normalize both splits with the training-set statistics so the test set is scaled the same way.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)
normed_train_data.tail()

KeyError: ignored

In [None]:
def build_model(n_features=None):
    """Build and compile a small fully connected regression network.

    Two 64-unit ReLU hidden layers feed a single linear output unit. The model is
    compiled with MSE loss and an RMSprop optimizer (learning rate 0.001), and
    reports MAE and MSE as metrics.

    Args:
        n_features: Number of input features. Defaults to the number of columns
            of the notebook-level ``train_dataset``.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    if n_features is None:
        n_features = len(train_dataset.keys())
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[n_features]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),
    ])
    optimizer = tf.keras.optimizers.RMSprop(0.001)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])
    return model

# Build the model and print its layer-by-layer summary.
model = build_model()
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 64)                640       
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Sanity check: feed the first 10 normalized training rows through the model
# and print the raw predictions.
example_batch = normed_train_data.head(10)
example_result = model.predict(example_batch)
print(example_result)

NameError: ignored

In [None]:
# Show training progress by printing a dot (.) as the epochs go by.
class PrintDot(keras.callbacks.Callback):
    """Keras callback that prints one dot per finished epoch as a progress indicator.

    A line break is printed before the dot of every 100th epoch (including epoch 0).
    """

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None to match the base Callback.on_epoch_end signature;
        # it is not used here.
        if epoch % 100 == 0:
            print('')
        print('.', end='')

EPOCHS = 1000

# Train silently (verbose=0), holding out 20% of the training rows for validation;
# the PrintDot callback supplies the progress output.
history = model.fit(
    normed_train_data,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

NameError: ignored

In [None]:
# Predict MPG for the test rows; flatten() turns the single-output predictions into a 1-D array.
test_predictions = model.predict(normed_test_data).flatten()

# Scatter predictions against true values; points on the grey diagonal are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(test_labels, test_predictions)
ax.set_xlabel('True Values [MPG]')
ax.set_ylabel('Predictions [MPG]')
ax.axis('equal')   # use the same scale on both axes
ax.axis('square')  # make both axes span the same range (xmax - xmin == ymax - ymin)
# Start both axes at 0 while keeping the automatically chosen upper limits.
ax.set_xlim(0, ax.get_xlim()[1])
ax.set_ylim(0, ax.get_ylim()[1])
ax.plot([-100, 100], [-100, 100], c='.7');

NameError: ignored