# Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Coding_study/data/penguins.csv',index_col=0)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,,,,,,2007
5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [3]:
# Shape of data
df.shape

(344, 8)

In [4]:
# 결측치 확인
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [5]:
# 결측치 제거
df = df.dropna().reset_index(drop=True)

In [6]:
# 결측치 확인
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

연도 변수는 독립 변수에서 제외

In [7]:
df = df.iloc[:,:-1]
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male


In [8]:
df.shape

(333, 7)

In [9]:
# Convert categorical variable into dummy

island_dummies = pd.get_dummies(df.island)
sex_dummies = pd.get_dummies(df.sex)

In [10]:
island_dummies.head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


## Concatenate dataframe

In [11]:
df = pd.concat([df,island_dummies,sex_dummies],axis=1)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Biscoe,Dream,Torgersen,female,male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,0,0,1,0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,0,0,1,1,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,0,0,1,1,0
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,0,0,1,1,0
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,0,0,1,0,1


In [12]:
df = df.drop(['island','sex'],axis=1)
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,female,male
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,Adelie,36.7,19.3,193.0,3450.0,0,0,1,1,0
4,Adelie,39.3,20.6,190.0,3650.0,0,0,1,0,1


In [13]:
df.shape

(333, 10)

## Labels

In [14]:
df.species.value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

In [15]:
# 문자열 형태를 정수형으로 변환
def make_int(s):
  if s == 'Adelie':
    return 0
  elif s == 'Gentoo':
    return 1
  else:
    return 2

In [16]:
df['species'] = df.species.apply(make_int)

In [17]:
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,female,male
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,0,36.7,19.3,193.0,3450.0,0,0,1,1,0
4,0,39.3,20.6,190.0,3650.0,0,0,1,0,1


나중에 one-hot encoding을 위해서는 정수형으로 바꿔주어야 함

# Holdout

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf

In [19]:
np.set_printoptions(suppress=True)

In [20]:
x = df.iloc[:,1:].values
y = df.iloc[:,0]

In [21]:
x[0]

array([  39.1,   18.7,  181. , 3750. ,    0. ,    0. ,    1. ,    0. ,
          1. ])

In [22]:
y[0]

0

Pytorch는 별도의 one-hot encoding이 필요 없음

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1,shuffle=True)
x_train, x_valid, y_train, y_valid = train_test_split(x_train,y_train,test_size=0.1,shuffle=True)

In [24]:
print('Shape of x train : ',x_train.shape)
print('Shape of x test : ',x_test.shape)
print('Shape of x valid : ',x_valid.shape)

print('Shape of y train : ',y_train.shape)
print('Shape of y valid : ',y_valid.shape)
print('Shape of y test : ',y_test.shape)

Shape of x train :  (269, 9)
Shape of x test :  (34, 9)
Shape of x valid :  (30, 9)
Shape of y train :  (269,)
Shape of y valid :  (30,)
Shape of y test :  (34,)


In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
scaler = StandardScaler()

In [27]:
x_train[:,:-5]

array([[  43.5,   15.2,  213. , 4650. ],
       [  49.6,   15. ,  216. , 4750. ],
       [  47.4,   14.6,  212. , 4725. ],
       ...,
       [  37.9,   18.6,  172. , 3150. ],
       [  40.9,   16.6,  187. , 3200. ],
       [  40.6,   19. ,  199. , 4000. ]])

In [28]:
scaler.fit(x_train[:,:-5])

In [29]:
x_train_std = scaler.transform(x_train[:,:-5])
x_valid_std = scaler.transform(x_valid[:,:-5])
x_test_std = scaler.transform(x_test[:,:-5])

In [30]:
x_train_std[0]

array([-0.07909521, -0.99582395,  0.86179859,  0.58379985])

In [31]:
x_train_std = np.concatenate([x_train_std,x_train[:,-5:]],axis=1)
x_valid_std = np.concatenate([x_valid_std,x_valid[:,-5:]],axis=1)
x_test_std = np.concatenate([x_test_std,x_test[:,-5:]],axis=1)

In [32]:
x_train_std[0]

array([-0.07909521, -0.99582395,  0.86179859,  0.58379985,  1.        ,
        0.        ,  0.        ,  1.        ,  0.        ])

In [33]:
x_valid_std[0]

array([-1.77479831, -0.01515801, -1.13729811, -0.97632492,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ])

In [34]:
x_test_std[0]

array([ 0.02800183,  1.3268059 , -0.35193869,  0.27177489,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ])

# Torch dataset

In [35]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset

## Tensor dataset

Pytorch에서 가중치 학습은 실수형으로 진행됨. 그래서 모든 변수를 실수형으로 바꿔주어야 함.

그리고 ndarray와 호환이 안되기 때문에 torch tensor type으로 변환 필요

In [36]:
x_train_std.dtype

dtype('float64')

In [37]:
x_train_std = torch.tensor(x_train_std,dtype=torch.float32)
x_valid_std = torch.tensor(x_valid_std,dtype=torch.float32)
x_test_std = torch.tensor(x_test_std,dtype=torch.float32)

In [38]:
x_train_std[0]

tensor([-0.0791, -0.9958,  0.8618,  0.5838,  1.0000,  0.0000,  0.0000,  1.0000,
         0.0000])

In [39]:
y_train

255    1
188    1
226    1
270    2
234    1
      ..
143    0
185    1
23     0
295    2
119    0
Name: species, Length: 269, dtype: int64

In [40]:
y_train = torch.tensor(y_train.values).long()
y_valid = torch.tensor(y_valid.values).long()
y_test = torch.tensor(y_test.values).long()

In [41]:
y_train[0]

tensor(1)

종속변수는 long type으로 변환해주면 됨

In [42]:
# Make dataset
train_dataset = TensorDataset(x_train_std,y_train)
valid_dataset = TensorDataset(x_valid_std,y_valid)
test_dataset = TensorDataset(x_test_std,y_test)

In [43]:
# Make dataloader
trainloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
validloader = DataLoader(valid_dataset,batch_size=4)
testloader = DataLoader(test_dataset,batch_size=4)

In [44]:
# Check data
for x,y in trainloader:
  print(x.shape)
  print()
  print(y.shape)
  break

torch.Size([32, 9])

torch.Size([32])


## Custom dataset

In [45]:
class CustomDataset(Dataset):

  def __init__(self,x,y):
    self.x = x
    self.y = y.values

  def __len__(self):
    return len(self.x)

  def __getitem__(self,idx):
    x_value = torch.tensor(self.x[idx],dtype=torch.float32)
    y_value = torch.tensor(self.y[idx]).long()
    return x_value,y_value

In [None]:
y_train

141    0
16     0
98     0
166    1
309    2
      ..
142    0
286    2
118    0
58     0
322    2
Name: species, Length: 269, dtype: int64

In [None]:
train_dataset = CustomDataset(x_train_std, y_train)
valid_dataset = CustomDataset(x_valid_std, y_valid)
test_dataset = CustomDataset(x_test_std, y_test)

In [None]:
# Make dataloader
trainloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
validloader = DataLoader(valid_dataset,batch_size=4)
testloader = DataLoader(test_dataset,batch_size=4)

In [None]:
# Check data
for x,y in trainloader:
  print(x.shape)
  print(y.shape)
  break

torch.Size([32, 9])
torch.Size([32])


# 모형

In [46]:
from torch import nn
from torch import optim

In [None]:
class MyModel(nn.Module):

  def __init__(self):
    #super(MyModel,self).__init__()
    super().__init__()
    self.linear1 = nn.Linear(9,256)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(256,128)
    self.dropout = nn.Dropout(0.2)
    self.out = nn.Linear(128,3)

  def forward(self,x):
    x = self.linear1(x)
    x = self.relu(x)
    x = self.linear2(x)
    x = self.relu(x)
    x = self.dropout(x)
    output = self.out(x)
    return output

In [None]:
# GPU 사용 확인
torch.cuda.is_available()

False

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
model = MyModel().to(device)
model

MyModel(
  (linear1): Linear(in_features=9, out_features=256, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=256, out_features=128, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (out): Linear(in_features=128, out_features=3, bias=True)
)

In [None]:
# 모형 요약
!pip install torchsummary



In [None]:
from torchsummary import summary

In [None]:
summary(model,input_size=(9,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 256]           2,560
              ReLU-2                  [-1, 256]               0
            Linear-3                  [-1, 128]          32,896
              ReLU-4                  [-1, 128]               0
           Dropout-5                  [-1, 128]               0
            Linear-6                    [-1, 3]             387
Total params: 35,843
Trainable params: 35,843
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.14
Estimated Total Size (MB): 0.14
----------------------------------------------------------------


In [None]:
# loss function & opimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)

# 학습

In [None]:
len(trainloader.dataset)

269

In [None]:
for x, y in trainloader:
  sx = x
  sy = y
  break

In [None]:
sx.shape, y.shape

(torch.Size([32, 9]), torch.Size([32]))

In [None]:
sx.shape

torch.Size([32, 9])

In [None]:
sx = sx.to(device)
sy = sy.to(device)

In [None]:
output = model(sx)
output.shape

torch.Size([32, 3])

In [None]:
output[0]

tensor([-0.0523,  0.1433, -0.0633], grad_fn=<SelectBackward0>)

In [None]:
criterion(output,sy)

tensor(1.1165, grad_fn=<NllLossBackward0>)

In [None]:
def train(model, dataloader, loss_fn,optimizer, epoch):
  # model.train()
  print(f'Epoch : {epoch}')
  size = len(dataloader.dataset)
  total_batch = len(dataloader)
  running_loss = 0
  for batch, (X, y) in enumerate(dataloader):
    X = X.to(device)
    y = y.to(device)

    output = model(X)
    loss = loss_fn(output, y)

    optimizer.zero_grad() # 이전 경사 값들의 정보를 날림
    loss.backward()
    optimizer.step()

    print(f'Train loss : {loss.item():>7f} [{batch*len(X)}/{size}]')
    running_loss += loss.item()
  print(f'Average Train loss : {running_loss/total_batch}\n')
  return running_loss

In [None]:
for epoch in range(100):
  train(model,trainloader,criterion,optimizer,epoch)

Epoch : 0
Train loss : 1.104169 [0/269]
Train loss : 1.026564 [32/269]
Train loss : 0.956983 [64/269]
Train loss : 0.897887 [96/269]
Train loss : 0.808815 [128/269]
Train loss : 0.749360 [160/269]
Train loss : 0.659072 [192/269]
Train loss : 0.669572 [224/269]
Train loss : 0.591198 [104/269]
Average Train loss : 0.8292910522884793

Epoch : 1
Train loss : 0.562474 [0/269]
Train loss : 0.482137 [32/269]
Train loss : 0.429296 [64/269]
Train loss : 0.374879 [96/269]
Train loss : 0.368435 [128/269]
Train loss : 0.394218 [160/269]
Train loss : 0.298432 [192/269]
Train loss : 0.266939 [224/269]
Train loss : 0.252489 [104/269]
Average Train loss : 0.3810332218805949

Epoch : 2
Train loss : 0.218660 [0/269]
Train loss : 0.202345 [32/269]
Train loss : 0.150400 [64/269]
Train loss : 0.145759 [96/269]
Train loss : 0.163441 [128/269]
Train loss : 0.124786 [160/269]
Train loss : 0.144712 [192/269]
Train loss : 0.069242 [224/269]
Train loss : 0.086912 [104/269]
Average Train loss : 0.1451395650704701

In [None]:
output = model(sx)

In [None]:
output[0]

tensor([11.2951, -4.7405, -9.1677], grad_fn=<SelectBackward0>)

In [None]:
torch.argmax(output[0])

tensor(0)

In [None]:
y[0]

tensor(0)

In [None]:
output.argmax(1)

tensor([0, 2, 0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 2, 1, 2, 0, 0, 2, 1, 1, 0, 2, 2, 0,
        1, 0, 0, 2, 0, 1, 1, 1])

In [None]:
output.argmax(1) == sy

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])

In [None]:
(output.argmax(1) == sy).sum().item()

32

In [None]:
def test(model, dataloader, loss_fn,epoch):
  total_batch = len(dataloader)
  size = len(dataloader.dataset)
  model.eval()
  test_loss, correct = 0, 0
  # 경사 계산 X
  with torch.no_grad():

    for X,y in dataloader:
      X = X.to(device)
      y = y.to(device)

      preds = model(X)
      loss = loss_fn(preds,y)
      correct += (preds.argmax(1) == y).sum().item()
      test_loss += loss.item()

    test_loss /= total_batch
    correct /= size

  print(f'Epoch : {epoch} Test Loss : {test_loss}, Test Accuracy : {correct}')

  return test_loss


In [None]:
test(model, validloader, criterion,1)

Epoch : 1 Test Loss : 1.331741295373945e-05, Test Accuracy : 1.0


1.331741295373945e-05

In [None]:
import copy

In [None]:
best_loss = 99999
counter = 0
best_model_weight = copy.deepcopy(model.state_dict())
for epoch in range(1,101):
  train(model,trainloader,criterion,optimizer,epoch)
  current_loss = test(model, validloader, criterion,epoch)
  if best_loss >= current_loss:
    best_loss = current_loss
    counter += 1
    best_model_wts = copy.deepcopy(model.state_dict())
    print(f"Saving model {epoch}")
    if counter == 3:
      break
  print('\n')

Epoch : 1
Train loss : 0.000216 [0/269]
Train loss : 0.000006 [32/269]
Train loss : 0.000156 [64/269]
Train loss : 0.000477 [96/269]
Train loss : 0.000179 [128/269]
Train loss : 0.000051 [160/269]
Train loss : 0.000023 [192/269]
Train loss : 0.000038 [224/269]
Train loss : 0.000022 [104/269]
Average Train loss : 0.00012966366239197669

Epoch : 1 Test Loss : 1.3060390479324724e-05, Test Accuracy : 1.0
Saving model 1


Epoch : 2
Train loss : 0.000590 [0/269]
Train loss : 0.000196 [32/269]
Train loss : 0.000028 [64/269]
Train loss : 0.000122 [96/269]
Train loss : 0.000026 [128/269]
Train loss : 0.000017 [160/269]
Train loss : 0.000101 [192/269]
Train loss : 0.000002 [224/269]
Train loss : 0.000013 [104/269]
Average Train loss : 0.0001215591659931508

Epoch : 2 Test Loss : 1.277356530238194e-05, Test Accuracy : 1.0
Saving model 2


Epoch : 3
Train loss : 0.000348 [0/269]
Train loss : 0.000353 [32/269]
Train loss : 0.000037 [64/269]
Train loss : 0.000004 [96/269]
Train loss : 0.000011 [128/

In [None]:
model.load_state_dict(best_model_weight)

<All keys matched successfully>

In [None]:
torch.save(model, 'pytorch_sample.ckpt')

In [None]:
model = torch.load('pytorch_sample.ckpt')

# 추론

In [None]:
def test(model, dataloader, loss_fn):
  total_batch = len(dataloader)
  size = len(dataloader.dataset)
  model.eval()
  test_loss, correct = 0, 0
  # 경사 계산 X
  with torch.no_grad():

    for X,y in dataloader:
      X = X.to(device)
      y = y.to(device)

      preds = model(X)
      loss = loss_fn(preds,y)
      correct += (preds.argmax(1) == y).sum().item()
      test_loss += loss.item()

    test_loss /= total_batch
    correct /= size

  print(f'Test Loss : {test_loss}, Test Accuracy : {correct}')

  return test_loss,correct


In [None]:
test(model,testloader,criterion)

Test Loss : 1.6001442882027403e-05, Test Accuracy : 1.0


(1.6001442882027403e-05, 1.0)

In [None]:
for x,y in testloader:
  tx = x
  ty = y
  break

In [None]:
model(tx).argmax(1)

tensor([1, 1, 1, 2])

# Pytorch ligtning

In [47]:
!pip install pytorch_lightning



In [48]:
import pytorch_lightning as pl

In [49]:
device = 'cpu'

In [50]:
class MyModel(pl.LightningModule):

  def __init__(self):
    #super(MyModel,self).__init__()
    super().__init__()
    self.linear1 = nn.Linear(9,256)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(256,128)
    self.dropout = nn.Dropout(0.2)
    self.out = nn.Linear(128,3)

  def forward(self,x):
    x = self.linear1(x)
    x = self.relu(x)
    x = self.linear2(x)
    x = self.relu(x)
    x = self.dropout(x)
    output = self.out(x)
    return output

  def training_step(self,batch,batch_idx):
    x,y = batch
    x = x.to(device)
    y = y.to(device)
    z = self.linear1(x)
    z = self.relu(z)
    z = self.linear2(z)
    z = self.relu(z)
    z = self.dropout(z)
    out = self.out(z)
    loss = nn.CrossEntropyLoss()(out,y)
    self.log('train_loss',loss)
    return loss

  def configure_optimizers(self):
    optimizer = optim.Adam(self.parameters(),lr=1e-3)
    return optimizer

In [53]:
model = MyModel().to(device)

In [54]:
trainer = pl.Trainer(max_epochs=100)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [55]:
trainer.fit(model,trainloader)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type    | Params
------------------------------------
0 | linear1 | Linear  | 2.6 K 
1 | relu    | ReLU    | 0     
2 | linear2 | Linear  | 32.9 K
3 | dropout | Dropout | 0     
4 | out     | Linear  | 387   
------------------------------------
35.8 K    Trainable params
0         Non-trainable params
35.8 K    Total params
0.143     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=100` reached.
