# 5차시: 피트니스 데이터 선형회귀 분석

## 2023. 06. 21. 14:10 ~ 16:00 (50분×2)
1. 선형회귀 기초 이론 (Linear regression)
1. 당뇨병 데이터셋 (Diabetes)
1. 자동차 연비 데이터셋 (Auto MPG)

### 참고자료
- [파이썬 3 표준 문서](https://docs.python.org/3/index.html)
- [Scikit learn Linear regression](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html)
- [Diabetes dataset](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html)
- [TensorFlow Linear regression](https://www.tensorflow.org/tutorials/keras/regression)

### 당뇨병 (Diabetes) 데이터셋

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# Load the diabetes dataset through sklearn.datasets and print its DESCR
# attribute (the dataset's own description text).
diabetes = datasets.load_diabetes()
print(diabetes.DESCR)

In [None]:
# Wrap the feature matrix in a DataFrame, labelling the columns with the
# dataset's feature names.
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
# Trailing bare expression: the notebook displays the frame.
df

In [None]:
# Append the regression target as column 'y' so features and target share
# one frame (later cells read df['y']).
df['y'] = diabetes.target
df

In [None]:
# Scatter-plot matrix over every column of df; marker colour encodes the
# target column 'y'.
scatter_matrix(
    df,
    c=df['y'],
    alpha=0.5,
    figsize=(7, 7),
)
print('')  # Silent: end the cell with a statement so the returned axes are not echoed

In [None]:
# Choose the predictor column(s) and the target.
# Only 'age' is active here; the commented-out line below is the
# all-features alternative kept as a toggle.
features = ['age']
#features = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
# Indexing with a list keeps X two-dimensional (a DataFrame).
X = df[features]
y = df['y']

In [None]:
# Split features and target into training and test parts; `test_size` is the
# fraction held out for testing.
# NOTE: no random_state is passed, so the split is not fixed between runs.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
# Print the shapes of the training and test feature sets.
print(f'학습에 사용할 피처 크기: {X_train.shape}')
print(f'예측에 사용할 피처 크기: {X_test.shape}')

In [None]:
# Create the linear-regression estimator and fit it on the training split in
# one expression; fit() returns the estimator itself.
regr = linear_model.LinearRegression().fit(X_train, y_train)
# Trailing bare expression: the notebook displays the fitted estimator.
regr

In [None]:
# Predict on the held-out features, then report the fitted coefficients and
# two metrics computed against the true test targets.
y_pred = regr.predict(X_test)

print('Coefficients: \n', regr.coef_)
# Mean squared error between true and predicted targets, two decimals.
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
# R^2 score (coefficient of determination), two decimals.
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

In [None]:
# Plot the held-out samples (black dots) together with the model's
# predictions for them (blue line).
# NOTE: X_test is passed directly as the x-coordinate, so this plot is only
# meaningful while `features` contains a single column.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()
ax.scatter(X_test, y_test, color='black')
ax.plot(X_test, y_pred, color='blue', linewidth=3)

# Join the feature names so the labels read "age" instead of the Python list
# repr "['age']" that formatting the list directly would produce.
feature_label = ', '.join(features)
ax.set_title(f'Diabetes progression by {feature_label}', fontsize='x-large')
ax.set_xlabel(feature_label, fontsize='large')
ax.set_ylabel('Diabetes progression', fontsize='large')

In [None]:
# Display the fitted coefficient array of the regression model.
regr.coef_

In [None]:
# Display the fitted intercept (bias term) of the regression model.
regr.intercept_

### Auto MPG
#### Multiple regression

In [None]:
# Download the Auto MPG data. The file has no header row, so column names are
# supplied explicitly.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
    'Acceleration', 'Model Year', 'Origin',
]
# '?' is read as a missing value, anything after a tab character is ignored
# (comment='\t'), and fields are separated by runs of spaces.
raw_dataset = pd.read_csv(
    url,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)
# Build the working frame without touching raw_dataset: drop rows that contain
# missing values and renumber the index from zero.
df = raw_dataset.dropna().reset_index(drop=True)
df

In [None]:
# Scatter-plot matrix over every column of df; marker colour encodes the
# 'MPG' column.
scatter_matrix(
    df,
    c=df['MPG'],
    alpha=0.5,
    figsize=(7, 7),
)
print('')  # Silent: end the cell with a statement so the returned axes are not echoed

In [None]:
# Replace the numeric Origin codes with region names.
# NOTE: running this cell a second time maps the already-converted strings to
# NaN, because they are no longer keys of the mapping.
df = df.assign(
    Origin=df['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'}),
)
df

In [None]:
# One-hot encode Origin into one indicator column per region; the empty
# prefix/separator makes the new columns carry just the region name.
# dtype=float is required for the later steps: recent pandas versions emit
# boolean indicator columns by default, and mixing bool with numeric columns
# turns np.array(df[...]) into an object array that the Keras normalization
# layer cannot adapt to.
df = pd.get_dummies(df, columns=['Origin'], prefix='', prefix_sep='', dtype=float)
df

In [None]:
# Every column except the target 'MPG' is used as a predictor.
features = [column for column in df.columns if column != 'MPG']
X = df[features]
y = df['MPG']

In [None]:
# Split features and target into training and test parts; `test_size` is the
# fraction held out for testing.
# NOTE: no random_state is passed, so the split is not fixed between runs.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
# Print the shapes of the training and test feature sets.
print(f'학습에 사용할 피처 크기: {X_train.shape}')
print(f'예측에 사용할 피처 크기: {X_test.shape}')

In [None]:
# Summary statistics of the training features, transposed so that each
# feature is a row and each statistic a column.
X_train.describe().T

In [None]:
# Compute the normalization statistics from the training split only, so the
# held-out rows do not influence preprocessing. X_train holds exactly the
# `features` columns. The explicit float32 cast keeps the array numeric even
# when the one-hot indicator columns are boolean.
feature_value = np.array(X_train, dtype='float32')

# axis=-1 keeps a separate mean/variance for each feature column. axis=None
# would pool all columns into one scalar mean/variance, which does not put
# differently scaled columns (e.g. Weight vs. Cylinders) on a common scale.
feature_normalizer = tf.keras.layers.Normalization(input_shape=[len(features),], axis=-1)
feature_normalizer.adapt(feature_value)

In [None]:
# Linear model: the adapted normalization layer followed by a single-unit
# dense layer, stacked one layer at a time.
model = tf.keras.Sequential()
model.add(feature_normalizer)
model.add(tf.keras.layers.Dense(units=1))

# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Configure training: mean absolute error as the loss, minimized with Adam at
# a learning rate of 0.1.
model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)