# Dataset

In [1]:
import pandas as pd

In [2]:
# Load the penguin measurements; the first CSV column is used as the row index.
df = pd.read_csv('penguins.csv',index_col=0)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,,,,,,2007
5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [3]:
# Shape of the freshly loaded data as (rows, columns)
df.shape

(344, 8)

In [4]:
# Count missing values per column
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [5]:
# Remove every row that contains at least one missing value,
# then renumber the remaining rows 0..n-1 (the old index is discarded).
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)

In [6]:
# Re-check missing values per column after dropping incomplete rows
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

연도 변수는 독립 변수에서 제외

In [7]:
# Exclude the survey year from the predictors.
# Dropping by name (instead of "everything but the last column") keeps this
# cell idempotent: re-running it no longer strips `sex` and further columns.
df = df.drop(columns=['year'], errors='ignore')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male


In [8]:
# Dimensions after removing incomplete rows and the year column
df.shape

(333, 7)

In [9]:
# Convert the categorical variables into 0/1 dummy (indicator) columns.
# dtype=int is pinned because pandas >= 2.0 returns boolean dummies by default;
# mixing bool and float columns would turn `df.values` into an object array.

island_dummies = pd.get_dummies(df['island'], dtype=int)
sex_dummies = pd.get_dummies(df['sex'], dtype=int)

In [10]:
# Preview the island indicator columns (one column per island)
island_dummies.head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


## Concatenate dataframe

In [11]:
# Append the island and sex indicator columns to the right of the frame;
# rows are aligned on the shared index.
frames_to_join = [df, island_dummies, sex_dummies]
df = pd.concat(frames_to_join, axis='columns')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Biscoe,Dream,Torgersen,female,male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,0,0,1,0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,0,0,1,1,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,0,0,1,1,0
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,0,0,1,1,0
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,0,0,1,0,1


In [12]:
# The string columns are now represented by their dummy columns, so remove them.
df = df.drop(columns=['island', 'sex'])
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,female,male
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,Adelie,36.7,19.3,193.0,3450.0,0,0,1,1,0
4,Adelie,39.3,20.6,190.0,3650.0,0,0,1,0,1


In [13]:
# Dimensions after swapping the two string columns for five dummy columns
df.shape

(333, 10)

## Labels

In [14]:
# Number of rows per species (the classification target)
df.species.value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

In [15]:
# Convert the species name (string) into an integer class label
def make_int(s):
  """Return the integer class label for a penguin species name.

  Args:
    s: Species name; one of 'Adelie', 'Gentoo' or 'Chinstrap'.

  Returns:
    0 for 'Adelie', 1 for 'Gentoo', 2 for 'Chinstrap'.

  Raises:
    ValueError: If ``s`` is not one of the three known species names.
  """
  if s == 'Adelie':
    return 0
  elif s == 'Gentoo':
    return 1
  elif s == 'Chinstrap':
    return 2
  # A catch-all branch would silently label typos, NaN, or already-encoded
  # integers as class 2; fail loudly instead.
  raise ValueError(f'Unknown species: {s!r}')

In [16]:
# Replace each species name with its integer class label.
encoded_species = df['species'].apply(make_int)
df['species'] = encoded_species

In [17]:
# Preview the frame with the species column now holding integer labels
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,female,male
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,0,36.7,19.3,193.0,3450.0,0,0,1,1,0
4,0,39.3,20.6,190.0,3650.0,0,0,1,0,1


문자열 레이블은 정수형으로 바꿔주어야 함. one-hot encoding을 하려면 정수형 레이블이 필요하고, 아래에서 보듯 PyTorch에서는 이 정수형 레이블을 그대로 사용함

# Holdout

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf

In [19]:
# Display floats in fixed-point notation rather than scientific notation
np.set_printoptions(suppress=True)

In [20]:
# The first column holds the integer species label; every other column is a feature.
# x becomes a NumPy array, while y stays a pandas Series.
label_column, *feature_columns = df.columns
x = df[feature_columns].values
y = df[label_column]

In [21]:
# Feature vector of the first row: four measurements followed by five 0/1 dummies
x[0]

array([  39.1,   18.7,  181. , 3750. ,    0. ,    0. ,    1. ,    0. ,
          1. ])

In [22]:
# Label of the row whose index label is 0 (y is a pandas Series, so this is a label lookup)
y[0]

0

PyTorch의 분류 손실 함수(예: `nn.CrossEntropyLoss`)는 정수형 클래스 인덱스를 레이블로 바로 받기 때문에 별도의 one-hot encoding이 필요 없음

In [23]:
# Hold out 10% of the rows for testing, then 10% of the remainder for validation.
# A fixed seed makes the split (and every result derived from it) reproducible
# under Restart & Run All; stratifying keeps the species proportions similar in
# the small validation/test subsets.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, shuffle=True, random_state=SPLIT_SEED, stratify=y)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.1, shuffle=True, random_state=SPLIT_SEED, stratify=y_train)

In [24]:
# Report the dimensions of every split: features first, then labels.
shape_report = [
    ('Shape of x train : ', x_train),
    ('Shape of x test : ', x_test),
    ('Shape of x valid : ', x_valid),
    ('Shape of y train : ', y_train),
    ('Shape of y valid : ', y_valid),
    ('Shape of y test : ', y_test),
]
for caption, split in shape_report:
    print(caption, split.shape)

Shape of x train :  (269, 9)
Shape of x test :  (34, 9)
Shape of x valid :  (30, 9)
Shape of y train :  (269,)
Shape of y valid :  (30,)
Shape of y test :  (34,)


In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
# Scaler used to standardize the continuous measurement columns
scaler = StandardScaler()

In [27]:
# Everything except the last five columns, i.e. the four continuous measurements
# (the last five columns are the 0/1 island and sex dummies)
x_train[:,:-5]

array([[  39.5,   16.7,  178. , 3250. ],
       [  34. ,   17.1,  185. , 3400. ],
       [  35.6,   17.5,  191. , 3175. ],
       ...,
       [  36.4,   17. ,  195. , 3325. ],
       [  43.3,   14. ,  208. , 4575. ],
       [  50. ,   16.3,  230. , 5700. ]])

In [28]:
# Fit the scaler on the training measurements only; validation and test rows
# are not passed to fit, so they do not influence the scaling parameters.
scaler.fit(x_train[:,:-5])

In [29]:
# Standardize the measurement columns (all but the last five) of every split
# with the scaler fitted on the training data.
x_train_std, x_valid_std, x_test_std = (
    scaler.transform(split[:, :-5]) for split in (x_train, x_valid, x_test)
)

In [30]:
# First training row after standardization (measurement columns only)
x_train_std[0]

array([-0.81456392, -0.24321791, -1.61695002, -1.19338417])

In [31]:
# Build the final feature matrices: standardized measurements followed by the
# five 0/1 dummy columns, which are left unscaled.
# Each matrix is rebuilt from the untouched x_* split rather than from the
# previous x_*_std, so re-running this cell no longer appends the dummy
# columns a second time.
x_train_std, x_valid_std, x_test_std = (
    np.concatenate([scaler.transform(split[:, :-5]), split[:, -5:]], axis=1)
    for split in (x_train, x_valid, x_test)
)

In [32]:
# First training row: standardized measurements followed by the dummy columns
x_train_std[0]

array([-0.81456392, -0.24321791, -1.61695002, -1.19338417,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ])

In [33]:
# First validation row in the same layout as the training matrix
x_valid_std[0]

array([ 0.83197466, -1.10474814,  1.29034404,  0.80091809,  1.        ,
        0.        ,  0.        ,  1.        ,  0.        ])

In [34]:
# First test row in the same layout as the training matrix
x_test_std[0]

array([-1.21705113, -0.19253967, -0.62421546, -1.5049939 ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ])

# Torch dataset

## Tensor dataset

PyTorch에서 가중치 학습은 실수형으로 진행됨. 그래서 모든 입력 변수(feature)를 실수형으로 바꿔주어야 함.

그리고 PyTorch 모델은 numpy ndarray를 직접 입력으로 받지 못하기 때문에 torch tensor type으로 변환 필요

## Custom dataset

# 모형

# 학습