# Dataset

In [1]:
import pandas as pd

In [2]:
# Load the penguins CSV, using the file's first column as the row index.
# NOTE(review): absolute Google Drive path - presumably only resolves inside a
# Colab session with Drive mounted; confirm before running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/Coding_study/data/penguins.csv',index_col=0)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,,,,,,2007
5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [3]:
# Shape of data
df.shape

(344, 8)

In [4]:
# Count the missing values in each column
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [5]:
# Drop every row that contains a missing value, then renumber the index from 0
df = df.dropna().reset_index(drop=True)

In [6]:
# Re-count the missing values per column after dropping incomplete rows
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

연도 변수는 독립 변수에서 제외

In [7]:
# Exclude `year` from the features. Dropping by name instead of the positional
# slice `iloc[:, :-1]` keeps this cell idempotent: re-running it no longer
# strips whichever column happens to be last.
df = df.drop(columns=['year'], errors='ignore')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male


In [8]:
df.shape

(333, 7)

In [9]:
# Convert the categorical columns into dummy (indicator) columns.
# The dtype is pinned to an integer type: pandas >= 2.0 returns bool dummies by
# default, and bool columns mixed with float columns make `df.values` an object
# array, which torch.tensor() cannot convert later on.
island_dummies = pd.get_dummies(df.island, dtype='uint8')
sex_dummies = pd.get_dummies(df.sex, dtype='uint8')

In [10]:
island_dummies.head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


## Concatenate dataframe

In [11]:
# Append the island and sex dummy columns to the right of the existing columns
# (axis=1 concatenates column-wise, aligning on the row index).
df = pd.concat([df,island_dummies,sex_dummies],axis=1)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Biscoe,Dream,Torgersen,female,male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,0,0,1,0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,0,0,1,1,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,0,0,1,1,0
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,0,0,1,1,0
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,0,0,1,0,1


In [12]:
# The raw categorical columns are now represented by their dummy columns,
# so remove them from the frame.
df = df.drop(columns=['island', 'sex'])
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,female,male
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,Adelie,36.7,19.3,193.0,3450.0,0,0,1,1,0
4,Adelie,39.3,20.6,190.0,3650.0,0,0,1,0,1


In [13]:
df.shape

(333, 10)

## Labels

In [14]:
df.species.value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

In [15]:
# Convert the species name (string) into an integer class code
def make_int(s):
  """Return the integer class code for a species name.

  'Adelie' -> 0, 'Gentoo' -> 1; any other value (including 'Chinstrap')
  falls through to 2.
  """
  # The position in this tuple is the class code.
  for code, species_name in enumerate(('Adelie', 'Gentoo')):
    if s == species_name:
      return code
  return 2

In [16]:
# Replace the species names with their integer class codes.
# NOTE: not idempotent - on a second run the values are already integers, none
# of them equals a species name, and make_int's fallback maps every row to 2.
df['species'] = df.species.apply(make_int)

In [17]:
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,female,male
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,0,36.7,19.3,193.0,3450.0,0,0,1,1,0
4,0,39.3,20.6,190.0,3650.0,0,0,1,0,1


나중에 one-hot encoding을 하거나 클래스 인덱스로 사용하려면 species를 먼저 정수형으로 바꿔주어야 함

# Holdout

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf

In [19]:
np.set_printoptions(suppress=True)

In [20]:
# Features: every column after the first one, as a NumPy array
x = df.iloc[:,1:].values
# Labels: the first column (the integer species code), still a pandas Series here
y = df.iloc[:,0]

In [21]:
x[0]

array([  39.1,   18.7,  181. , 3750. ,    0. ,    0. ,    1. ,    0. ,
          1. ])

In [41]:
# Convert the labels to a NumPy array. np.asarray accepts both a Series and an
# ndarray, so re-running this cell is safe, whereas `y.values` raises
# AttributeError once `y` is already an ndarray.
y = np.asarray(y)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

PyTorch의 `nn.CrossEntropyLoss`는 정수형 클래스 인덱스를 레이블로 받기 때문에 별도의 one-hot encoding이 필요 없음

In [42]:
# Hold out 10% of the data for testing, then 10% of the remainder for validation.
# A fixed random_state makes both shuffled splits reproducible across kernel
# restarts; without it every run trains and evaluates on different rows.
SEED = 42
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1,shuffle=True,random_state=SEED)
x_train, x_valid, y_train, y_valid = train_test_split(x_train,y_train,test_size=0.1,shuffle=True,random_state=SEED)

In [43]:
# Report the shape of every split: the three feature arrays first, then the labels.
for _caption, _split in (
    ('Shape of x train : ', x_train),
    ('Shape of x test : ', x_test),
    ('Shape of x valid : ', x_valid),
    ('Shape of y train : ', y_train),
    ('Shape of y valid : ', y_valid),
    ('Shape of y test : ', y_test),
):
  print(_caption, _split.shape)

Shape of x train :  (269, 9)
Shape of x test :  (34, 9)
Shape of x valid :  (30, 9)
Shape of y train :  (269,)
Shape of y valid :  (30,)
Shape of y test :  (34,)


In [44]:
from sklearn.preprocessing import StandardScaler

In [45]:
scaler = StandardScaler()

In [46]:
x_train[:,:-5]

array([[  38.1,   16.5,  198. , 3825. ],
       [  40.8,   18.9,  208. , 4300. ],
       [  44. ,   13.6,  208. , 4350. ],
       ...,
       [  39.6,   17.2,  196. , 3550. ],
       [  37.7,   19.8,  198. , 3500. ],
       [  49.2,   18.2,  195. , 4400. ]])

In [47]:
# Fit the scaler on the training split only, using every column except the
# last 5 (the trailing 5 columns are the island/sex dummy indicators).
scaler.fit(x_train[:,:-5])

In [48]:
# Standardise the same leading columns of each split with the statistics
# learned from the training split; the last 5 columns are excluded here.
x_train_std = scaler.transform(x_train[:,:-5])
x_valid_std = scaler.transform(x_valid[:,:-5])
x_test_std = scaler.transform(x_test[:,:-5])

In [49]:
x_train_std[0]

array([-1.02115486, -0.33984644, -0.1825516 , -0.45379295])

In [50]:
# Re-attach the unscaled last 5 columns to the right of the standardised ones,
# restoring the original column count for each split.
x_train_std = np.concatenate([x_train_std,x_train[:,-5:]],axis=1)
x_valid_std = np.concatenate([x_valid_std,x_valid[:,-5:]],axis=1)
x_test_std = np.concatenate([x_test_std,x_test[:,-5:]],axis=1)

In [51]:
x_train_std[0]

array([-1.02115486, -0.33984644, -0.1825516 , -0.45379295,  1.        ,
        0.        ,  0.        ,  1.        ,  0.        ])

In [52]:
x_valid_std[0]

array([-0.74611582,  1.81238736, -0.68218047, -0.35974803,  1.        ,
        0.        ,  0.        ,  1.        ,  0.        ])

In [53]:
x_test_std[0]

array([-1.69958449,  0.63378313, -1.18180934, -1.08075906,  0.        ,
        0.        ,  1.        ,  1.        ,  0.        ])

In [54]:
y_train[0]

0

# Torch dataset

In [55]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader

## Tensor dataset

PyTorch에서 가중치 학습은 실수형으로 진행됨. 그래서 입력 변수(x)는 모두 실수형(float32)으로 바꿔주어야 함. 단, 레이블(y)은 실수형이 아니라 정수형(long)으로 변환함.

그리고 PyTorch 모형은 NumPy ndarray를 직접 입력으로 받지 못하기 때문에 torch tensor type으로 변환 필요

In [56]:
# Convert each feature array into a float32 torch tensor
x_train_tensor = torch.tensor(x_train_std,dtype=torch.float32)
x_valid_tensor = torch.tensor(x_valid_std,dtype=torch.float32)
x_test_tensor = torch.tensor(x_test_std,dtype=torch.float32)
# Show the first training row as a tensor
x_train_tensor[0]

tensor([-1.0212, -0.3398, -0.1826, -0.4538,  1.0000,  0.0000,  0.0000,  1.0000,
         0.0000])

In [57]:
# Convert each label array into an int64 (long) tensor of class indices.
# torch.tensor with an explicit dtype replaces the legacy torch.LongTensor
# constructor and matches how the feature tensors are built.
y_train_label = torch.tensor(y_train,dtype=torch.long)
y_valid_label = torch.tensor(y_valid,dtype=torch.long)
y_test_label = torch.tensor(y_test,dtype=torch.long)

In [58]:
y_train_label[0]

tensor(0)

In [59]:
trainset1 = TensorDataset(x_train_tensor,y_train_label)

In [60]:
trainloader1 = DataLoader(trainset1,batch_size=32,shuffle=True)

In [61]:
next(iter(trainloader1))

[tensor([[ 1.5275,  1.8124,  0.6740,  0.7688,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000],
         [-0.3977,  0.5825, -0.3967,  0.1418,  0.0000,  0.0000,  1.0000,  0.0000,
           1.0000],
         [-1.4612,  0.4288,  0.1030, -0.7986,  0.0000,  1.0000,  0.0000,  1.0000,
           0.0000],
         [-1.4612, -0.0836, -0.8249, -1.0494,  0.0000,  0.0000,  1.0000,  1.0000,
           0.0000],
         [ 0.4274,  0.8900, -0.3967, -0.0463,  0.0000,  1.0000,  0.0000,  1.0000,
           0.0000],
         [ 1.4725,  1.6074, -0.4681, -0.5165,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000],
         [-0.0677, -1.9284,  0.6026,  0.2672,  1.0000,  0.0000,  0.0000,  1.0000,
           0.0000],
         [ 1.0325,  1.4024,  0.1743, -0.1717,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000],
         [-0.7278,  0.6338, -0.7536, -0.3597,  0.0000,  0.0000,  1.0000,  0.0000,
           1.0000],
         [-0.1410,  0.2238, -0.3253,  0.6434,  0.0000,  0.0000,  1.0000,  0.0000,


## Custom dataset

In [79]:
class MyDataset(Dataset):
  """Dataset pairing feature rows with class labels.

  The containers are stored as given; each sample is converted to tensors
  only when it is accessed by index.
  """

  def __init__(self,x,y):
    # Keep references to the raw containers; no conversion happens here.
    self.x, self.y = x, y

  def __len__(self):
    # The sample count is taken from the feature container alone.
    return len(self.x)

  def __getitem__(self,idx):
    """Return (features as float32 tensor, label as int64 tensor) for `idx`."""
    features = torch.tensor(self.x[idx],dtype=torch.float32)
    label = torch.tensor(self.y[idx]).to(torch.int64)
    return features,label

In [80]:
# Wrap each split (standardised features + integer labels) in the custom dataset
trainset = MyDataset(x_train_std,y_train)
validset = MyDataset(x_valid_std,y_valid)
testset = MyDataset(x_test_std,y_test)

In [81]:
# Batch each dataset in groups of 32; only the training loader shuffles.
trainloader = DataLoader(trainset,batch_size=32,shuffle=True)
validloader = DataLoader(validset,batch_size=32)
testloader = DataLoader(testset,batch_size=32)

In [82]:
# Peek at a single batch to confirm the tensor shapes.
# Dedicated names are used so the module-level `x` / `y` arrays (features and
# labels) are not overwritten by batch tensors.
xb, yb = next(iter(trainloader))
print(xb.shape)
print(yb.shape)

torch.Size([32, 9])
torch.Size([32])


# 모형

In [83]:
from torch import nn
from torch import optim

In [95]:
class MyModel(nn.Module):
  """Fully connected classifier with two hidden ReLU layers; returns raw logits.

  Args:
    in_features: number of input features per sample (default 9).
    hidden1: width of the first hidden layer (default 256).
    hidden2: width of the second hidden layer (default 128).
    n_classes: number of output logits, one per class (default 3).
    dropout: dropout probability applied after the second hidden layer
      (default 0.4).

  The defaults reproduce the original fixed architecture, so `MyModel()`
  builds the same network as before.
  """

  def __init__(self,in_features=9,hidden1=256,hidden2=128,n_classes=3,dropout=0.4):
    super().__init__()
    self.linear1 = nn.Linear(in_features,hidden1)
    self.linear2 = nn.Linear(hidden1,hidden2)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)
    self.output = nn.Linear(hidden2,n_classes)


  def forward(self,x):
    """Map a (batch, in_features) tensor to (batch, n_classes) logits."""
    x = self.relu(self.linear1(x))
    x = self.relu(self.linear2(x))
    # Dropout is applied once, just before the output layer.
    x = self.dropout(x)
    return self.output(x)

In [96]:
torch.cuda.is_available()

False

In [97]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [98]:
model = MyModel().to(device)

In [99]:
model

MyModel(
  (linear1): Linear(in_features=9, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=128, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.4, inplace=False)
  (output): Linear(in_features=128, out_features=3, bias=True)
)

# 학습

In [92]:
# Cross-entropy loss between the model outputs and the integer class labels
loss_fn = nn.CrossEntropyLoss()
# The optimizer is handed model.parameters() at construction time, so this cell
# must be re-run whenever `model` is re-created.
# NOTE(review): the saved execution counts show this cell (In [92]) ran before
# the model cell (In [98]) - presumably the optimizer behind the recorded
# outputs was tracking an earlier model instance; re-run top to bottom to confirm.
optimizer = optim.Adam(model.parameters(),lr=1e-4)

In [93]:
# Grab one training batch for the sanity checks that follow.
# Unpacking straight into sx / sy avoids rebinding the module-level `x` / `y`
# arrays to batch tensors, which the loop-and-break form did as a side effect.
sx, sy = next(iter(trainloader))

In [101]:
model(sx).shape

torch.Size([32, 3])

In [102]:
sy

tensor([1, 0, 2, 2, 0, 1, 0, 0, 0, 1, 2, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 0, 2, 2, 0, 2, 1, 1])

In [104]:
loss_fn(model(sx),sy).item()

1.1145153045654297

In [105]:
len(trainloader.dataset)

269

In [106]:
len(trainloader)

9

In [107]:
def train(model,dataloader,loss_fn,optimizer):
  """Run one training epoch over `dataloader`, printing per-batch and mean loss.

  Args:
    model: the module to optimise; it is switched to training mode.
    dataloader: loader yielding (inputs, targets) batches; must expose
      `.dataset` so the total sample count can be reported.
    loss_fn: callable returning the loss for (predictions, targets).
    optimizer: optimizer that updates the model's parameters.

  Returns:
    None. Progress and the mean of the per-batch losses are printed.
  """

  model.train()
  # Sizes come from the loader that was passed in, not from a notebook global,
  # so the function reports correctly for any loader.
  size = len(dataloader.dataset)

  # Run on whatever device the model already lives on (CPU if it has no parameters).
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')

  running_loss = 0
  seen = 0        # samples processed before the current batch
  n_batches = 0

  for x,y in dataloader:
    x = x.to(target_device)
    y = y.to(target_device)
    preds = model(x)
    loss = loss_fn(preds,y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    batch_loss = loss.item()
    running_loss += batch_loss

    print(f'Loss : {batch_loss}\t [{seen} / {size}]')

    # Count actual samples so the progress is right for any batch size,
    # including a shorter final batch.
    seen += len(x)
    n_batches += 1

  # An empty loader yields NaN instead of raising ZeroDivisionError.
  total_loss = running_loss / n_batches if n_batches else float('nan')
  print('Final Loss : ',total_loss)

In [108]:
train(model,trainloader,loss_fn,optimizer)

Loss : 1.1233972311019897	 [0 / 269]
Loss : 1.1078494787216187	 [32 / 269]
Loss : 1.1323810815811157	 [64 / 269]
Loss : 1.1102967262268066	 [96 / 269]
Loss : 1.0950262546539307	 [128 / 269]
Loss : 1.1150360107421875	 [160 / 269]
Loss : 1.114436149597168	 [192 / 269]
Loss : 1.1449178457260132	 [224 / 269]
Loss : 1.1397725343704224	 [256 / 269]
Final Loss :  1.1203459236356947


In [109]:
sx

tensor([[ 1.4359, -0.4423,  2.1015,  1.6465,  1.0000,  0.0000,  0.0000,  0.0000,
          1.0000],
        [-0.5444, -0.0836, -0.7536, -0.5792,  0.0000,  1.0000,  0.0000,  0.0000,
          1.0000],
        [ 1.0691,  0.9412, -0.0398, -0.4851,  0.0000,  1.0000,  0.0000,  0.0000,
          1.0000],
        [ 1.5275,  0.4801,  0.0316, -0.1717,  0.0000,  1.0000,  0.0000,  0.0000,
          1.0000],
        [-1.1128,  1.0950, -1.3959, -1.1121,  0.0000,  1.0000,  0.0000,  1.0000,
          0.0000],
        [ 0.3724, -1.6722,  0.9595,  0.2672,  1.0000,  0.0000,  0.0000,  1.0000,
          0.0000],
        [-0.5261,  0.8900,  0.5312,  0.1418,  0.0000,  1.0000,  0.0000,  0.0000,
          1.0000],
        [-0.8195,  0.7363, -0.7536,  0.0791,  0.0000,  1.0000,  0.0000,  0.0000,
          1.0000],
        [-1.2229, -0.1349, -1.1104, -1.4883,  0.0000,  1.0000,  0.0000,  1.0000,
          0.0000],
        [-0.1777, -1.7747,  0.5312, -0.2971,  1.0000,  0.0000,  0.0000,  1.0000,
          0.0000],


In [110]:
sy

tensor([1, 0, 2, 2, 0, 1, 0, 0, 0, 1, 2, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 0, 2, 2, 0, 2, 1, 1])

In [117]:
# Accuracy on the sample batch.
# train() leaves the model in training mode, so switch to eval mode to turn
# dropout off, and skip gradient tracking for inference. The denominator is the
# real batch length rather than a hard-coded 32. (train() switches the model
# back to training mode on its next call.)
model.eval()
with torch.no_grad():
  sample_preds = model(sx.to(device)).argmax(1)
(sample_preds == sy.to(device)).sum().item() / len(sy)

0.34375