In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import sklearn
from tqdm import tqdm

# 0. 데이터셋 재구성 (train/test set)

In [2]:
# raw data load & convert to numpy
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
data = pd.DataFrame(data, columns= ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','RTRATIO','B','LSTAT'])
target = raw_df.values[1::2, 2]
target = pd.DataFrame(target, columns=['MEDV'])

In [3]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,RTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
target.head()

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [7]:
print(data.shape)
print(target.shape)

(506, 13)
(506, 1)


In [5]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the train and test sets.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(404, 13)
(404, 1)
(102, 13)
(102, 1)


In [11]:
from sklearn.preprocessing import StandardScaler

# scaling data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 1. 최소제곱법을 이용한 선형 회귀 모델 구현

### Using sklearn Linear Regression Model

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [20]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression(fit_intercept=True)
lr_model.fit(X_train, y_train)
print(lr_model.intercept_)
print(lr_model.coef_)

[30.24675099]
[[-1.13055924e-01  3.01104641e-02  4.03807204e-02  2.78443820e+00
  -1.72026334e+01  4.43883520e+00 -6.29636221e-03 -1.44786537e+00
   2.62429736e-01 -1.06467863e-02 -9.15456240e-01  1.23513347e-02
  -5.08571424e-01]]


In [13]:
# 모델 해석 관련
import statsmodels.api as sm

# Add a constant term to the training data.
X_train_sm = sm.add_constant(X_train)

# Create a linear regression model using statsmodels.
lr_model_sm = sm.OLS(y_train, X_train_sm).fit()

# Print the model summary.
print(lr_model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.743
Method:                 Least Squares   F-statistic:                     90.43
Date:                Sun, 07 Apr 2024   Prob (F-statistic):          6.21e-109
Time:                        13:53:29   Log-Likelihood:                -1194.3
No. Observations:                 404   AIC:                             2417.
Df Residuals:                     390   BIC:                             2473.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         22.7965      0.236     96.774      0.0

# 2. torch를 활용하여 Linear Layer(FC Layer)를 이용한 회귀 모델 구현

### Vanila Training Version

In [7]:
# model define
class MyRegressionModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.hidden_size = input_size // 2
        self.fc1 = nn.Linear(in_features=input_size, out_features=self.hidden_size) # input_size = 13, hidden_size=6
        self.fc2 = nn.Linear(in_features=self.hidden_size, out_features=1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

model = MyRegressionModel(13) # data.shape[1] = 13

In [8]:
# Define the loss function and optimizer.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [9]:
# Convert data and target to tensors.
data_tensor = torch.tensor(np.array(data), dtype=torch.float32)
target_tensor = torch.tensor(np.array(target), dtype=torch.float32)

# Create a dataset from the tensors.
dataset = torch.utils.data.TensorDataset(data_tensor, target_tensor)

# Split the dataset into train and test sets.
test_size = 0.2
test_size = int(np.round(test_size*len(dataset)))
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [len(dataset)-test_size, test_size])

In [17]:
# Train the model.
for epoch in tqdm(range(100)):
    for (x, y) in train_dataset:
        # Forward pass.
        y_pred = model(x)

        # Compute the loss. (feed forward)
        loss = loss_fn(y_pred, y)

        # Backward pass.
        optimizer.zero_grad()
        loss.backward()

        # Update the parameters.
        optimizer.step()

100%|██████████| 100/100 [00:55<00:00,  1.80it/s]


### Batch Training Version

In [18]:
from torch.utils.data import DataLoader

# Create a data loader for the train set.
batch_size = 10
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Train the model.
for epoch in tqdm(range(100)):
    for (x, y) in train_loader:
        # Forward pass.
        y_pred = model(x)

        # Compute the loss.
        loss = loss_fn(y_pred, y)

        # Backward pass.
        optimizer.zero_grad()
        loss.backward()

        # Update the parameters.
        optimizer.step()

100%|██████████| 100/100 [00:08<00:00, 11.16it/s]


### Using GPU

In [18]:
# Move the model to the GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Move the dataset to the GPU.
train_dataset.tensors = [tensor.to(device) for tensor in train_dataset.dataset.tensors]
test_dataset.tensors = [tensor.to(device) for tensor in test_dataset.dataset.tensors]

# Create a data loader for the train set.
batch_size = 10
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Train the model.
for epoch in tqdm(range(100)):
    for (x, y) in train_loader:
        # Forward pass.
        y_pred = model(x.to(device))

        # Compute the loss.
        loss = loss_fn(y_pred, y.to(device))

        # Backward pass.
        optimizer.zero_grad()
        loss.backward()

        # Update the parameters.
        optimizer.step()

100%|██████████| 100/100 [00:09<00:00, 10.17it/s]


# 3. 모델 평가

# Linear Regression model

In [21]:
from sklearn.metrics import mean_squared_error

lr_y_pred = lr_model.predict(X_test)
np.sqrt(mean_squared_error(lr_y_pred, y_test)) # RMSE

4.928602182665323

### FC Layer Regression Model

In [25]:
# Evaluate the model.
model.eval() # == with torch.no_grad():
y_pred = []
nn_y_test = []
for (x, y) in test_dataset:
    temp_y = model(x.to(device)).detach().cpu().numpy()
    y_pred.append(temp_y)
    nn_y_test.append(y)

np.sqrt(mean_squared_error(y_pred, nn_y_test))

7.663999