## 회귀 분석: 수치예측
- species: 펭귄 종류
- island: 섬(서식지)
- gender: 성별
- bill_length_mm: 부리의 길이
- bill_depth_mm: 부리의 높이
- flipper_length_mm: 날개(플리퍼) 길이(mm)
- body_mass_g: 체중(g) 

In [29]:
import pandas as pd
import numpy as np

In [30]:
# Load the pre-split penguin data: training features, training target and
# test features, in that order.
x_train, y_train, x_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/penguin_x_train.csv',
        './data/penguin_y_train.csv',
        './data/penguin_x_test.csv',
    )
)
# Print column names, non-null counts and dtypes of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            240 non-null    object 
 1   island             240 non-null    object 
 2   gender             232 non-null    object 
 3   bill_length_mm     238 non-null    float64
 4   bill_depth_mm      238 non-null    float64
 5   flipper_length_mm  238 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.4+ KB


In [31]:
# The training features and the target arrive as separate frames; join them
# column-wise so that dropping a row removes its features and target together.
train = pd.concat(objs=[x_train, y_train], axis='columns')
# Number of missing values per column.
train.isna().sum()

species              0
island               0
gender               8
bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
body_mass_g          2
dtype: int64

In [32]:
# Boolean mask: True where the gender value is missing.
train['gender'].isna()

0       True
1      False
2      False
3      False
4      False
       ...  
235    False
236    False
237    False
238    False
239    False
Name: gender, Length: 240, dtype: bool

In [33]:
# Show only the observations that have a missing value in any of these columns.
cols_to_check = ['gender', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
cdt = train[cols_to_check].isna().any(axis=1)
train[cdt]

Unnamed: 0,species,island,gender,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,,42.0,20.2,190.0,4250.0
6,Gentoo,Biscoe,,44.5,14.3,216.0,4100.0
43,Gentoo,Biscoe,,,,,
66,Adelie,Torgersen,,37.8,17.3,180.0,3700.0
88,Gentoo,Biscoe,,47.3,13.8,216.0,4725.0
89,Adelie,Torgersen,,37.8,17.1,186.0,3300.0
110,Gentoo,Biscoe,,44.5,15.7,217.0,4875.0
229,Adelie,Torgersen,,,,,


In [34]:
# Gender is hard to impute, so every row containing a missing value is
# dropped and the index is renumbered from 0.
train = train.dropna().reset_index(drop=True)
# Confirm that no missing values remain.
train.isna().sum()

species              0
island               0
gender               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
dtype: int64

In [35]:
# Features: every column except the last one.
x_train = train.iloc[:, :-1]
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            232 non-null    object 
 1   island             232 non-null    object 
 2   gender             232 non-null    object 
 3   bill_length_mm     232 non-null    float64
 4   bill_depth_mm      232 non-null    float64
 5   flipper_length_mm  232 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.0+ KB


In [36]:
# Target: the last column, kept as a one-column DataFrame
# (the same column that train[['body_mass_g']] selects).
y_train = train.iloc[:, [-1]]
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   body_mass_g  232 non-null    float64
dtypes: float64(1)
memory usage: 1.9 KB


In [37]:
# Summary statistics of the training features.
x_train.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm
count,232.0,232.0,232.0
mean,43.990948,17.226293,200.681034
std,5.50976,1.964677,14.064231
min,32.1,13.2,172.0
25%,39.2,15.7,190.0
50%,44.95,17.35,197.0
75%,48.775,18.725,212.25
max,58.0,21.5,231.0


In [38]:
# Column groups used by the preprocessing steps.
col_del = []  # columns to drop (none)
col_num = [  # numeric features
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
]
col_cat = [  # categorical features
    'species',
    'island',
    'gender',
]
col_y = ['body_mass_g']  # target: body mass

In [39]:
# 참고
from sklearn.preprocessing import OneHotEncoder

# Reference: encode all three categorical columns with a single encoder.
# Stack the train and test rows so the encoder sees the categories of both.
x = pd.concat([x_train, x_test])

# String (nominal) categories -> one-hot indicator columns.
# Dense output is requested so the result is a plain 2-D array. The keyword is
# `sparse_output` from scikit-learn 1.2 on and `sparse` before that (removed
# in 1.4), so try the new name first and fall back to the old one.
# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros instead of raising an error.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(x[col_cat])  # learn the categories of each column

x_train_ohe = ohe.transform(x_train[col_cat])
x_test_ohe = ohe.transform(x_test[col_cat])
print(x_train_ohe)  # indicator columns of the three variables, side by side
print(x_train_ohe[0])

[[0. 0. 1. ... 0. 1. 0.]
 [1. 0. 0. ... 1. 0. 1.]
 [0. 1. 0. ... 0. 0. 1.]
 ...
 [1. 0. 0. ... 1. 1. 0.]
 [0. 1. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 0.]]
[0. 0. 1. 1. 0. 0. 1. 0.]


### 훈련 데이터의 one-hot-encoding

In [40]:
from sklearn.preprocessing import OneHotEncoder

# String (nominal) categories -> one-hot indicator columns.
# Dense output is requested so the result is a plain 2-D array. The keyword is
# `sparse_output` from scikit-learn 1.2 on and `sparse` before that (removed
# in 1.4), so try the new name first and fall back to the old one.
# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros instead of raising an error.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [41]:
# Learn the species categories from the training data and encode them in one step.
species_ohe = ohe.fit_transform(x_train[['species']])
print(species_ohe[:3])
# Wrap the indicator array in a DataFrame with one named column per category.
species_ohe_df = pd.DataFrame(data=species_ohe, columns=['species0', 'species1', 'species2'])
print(species_ohe_df)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]
     species0  species1  species2
0         0.0       0.0       1.0
1         1.0       0.0       0.0
2         0.0       1.0       0.0
3         0.0       0.0       1.0
4         1.0       0.0       0.0
..        ...       ...       ...
227       0.0       1.0       0.0
228       0.0       0.0       1.0
229       1.0       0.0       0.0
230       0.0       1.0       0.0
231       0.0       0.0       1.0

[232 rows x 3 columns]


In [42]:
# Learn the island categories from the training data and encode them in one step.
island_ohe = ohe.fit_transform(x_train[['island']])
print(island_ohe[:3])
# Wrap the indicator array in a DataFrame with one named column per category.
island_ohe_df = pd.DataFrame(data=island_ohe, columns=['island0', 'island1', 'island2'])
print(island_ohe_df)

[[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
     island0  island1  island2
0        1.0      0.0      0.0
1        0.0      0.0      1.0
2        0.0      1.0      0.0
3        1.0      0.0      0.0
4        0.0      1.0      0.0
..       ...      ...      ...
227      0.0      1.0      0.0
228      1.0      0.0      0.0
229      0.0      0.0      1.0
230      0.0      1.0      0.0
231      1.0      0.0      0.0

[232 rows x 3 columns]


In [43]:
# Learn the gender categories from the training data and encode them in one step.
gender_ohe = ohe.fit_transform(x_train[['gender']])
print(gender_ohe[:3])
# Wrap the indicator array in a DataFrame with one named column per category.
gender_ohe_df = pd.DataFrame(data=gender_ohe, columns=['gender0', 'gender1'])
print(gender_ohe_df)

[[1. 0.]
 [0. 1.]
 [0. 1.]]
     gender0  gender1
0        1.0      0.0
1        0.0      1.0
2        0.0      1.0
3        0.0      1.0
4        1.0      0.0
..       ...      ...
227      1.0      0.0
228      0.0      1.0
229      1.0      0.0
230      1.0      0.0
231      1.0      0.0

[232 rows x 2 columns]


In [44]:
# Keep the columns from position 3 onward; the first three columns were
# one-hot encoded separately and are merged back in afterwards.
x_train_sub = x_train.iloc[:, 3:]
x_train_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bill_length_mm     232 non-null    float64
 1   bill_depth_mm      232 non-null    float64
 2   flipper_length_mm  232 non-null    float64
dtypes: float64(3)
memory usage: 5.6 KB


In [45]:
# Numeric features followed by the species, island and gender indicator columns.
train_parts = [x_train_sub, species_ohe_df, island_ohe_df, gender_ohe_df]
x_train_total = pd.concat(train_parts, axis='columns')
x_train_total.head(n=3)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,species0,species1,species2,island0,island1,island2,gender0,gender1
0,43.5,15.2,213.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,42.8,18.5,195.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,53.5,19.9,205.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


### 테스트 데이터의 one-hot-encoding

In [46]:
# Fit on the TRAINING species so the test indicator columns have the same
# categories, in the same order, as the columns the model is trained on.
# Fitting on the test data instead would derive the columns from whatever
# categories happen to occur in the test set.
ohe.fit(x_train[['species']])
species_ohe_df = pd.DataFrame(ohe.transform(x_test[['species']]), columns=['species0','species1','species2'])
print(species_ohe_df[0:3])

   species0  species1  species2
0       1.0       0.0       0.0
1       0.0       0.0       1.0
2       1.0       0.0       0.0


In [47]:
# Fit on the TRAINING islands so the test indicator columns have the same
# categories, in the same order, as the columns the model is trained on.
# Fitting on the test data instead would derive the columns from whatever
# categories happen to occur in the test set.
ohe.fit(x_train[['island']])
island_ohe_df = pd.DataFrame(ohe.transform(x_test[['island']]), columns=['island0','island1','island2'])
print(island_ohe_df[0:3])

   island0  island1  island2
0      0.0      0.0      1.0
1      1.0      0.0      0.0
2      1.0      0.0      0.0


In [48]:
# Fit on the TRAINING genders so the test indicator columns have the same
# categories, in the same order, as the columns the model is trained on.
# Fitting on the test data instead would derive the columns from whatever
# categories happen to occur in the test set.
ohe.fit(x_train[['gender']])
gender_ohe_df = pd.DataFrame(ohe.transform(x_test[['gender']]), columns=['gender0','gender1'])
print(gender_ohe_df[0:3])

   gender0  gender1
0      0.0      1.0
1      0.0      1.0
2      0.0      1.0


In [49]:
# Keep the columns from position 3 onward; the first three columns were
# one-hot encoded separately and are merged back in afterwards.
x_test_sub = x_test.iloc[:, 3:]
x_test_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bill_length_mm     101 non-null    float64
 1   bill_depth_mm      101 non-null    float64
 2   flipper_length_mm  101 non-null    float64
dtypes: float64(3)
memory usage: 2.5 KB


In [50]:
# Numeric features followed by the species, island and gender indicator columns.
test_parts = [x_test_sub, species_ohe_df, island_ohe_df, gender_ohe_df]
x_test_total = pd.concat(test_parts, axis='columns')
x_test_total.head(n=3)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,species0,species1,species2,island0,island1,island2,gender0,gender1
0,42.1,19.1,195.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,45.5,15.0,220.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,40.6,18.8,193.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [51]:
# Summary statistics of the assembled training features.
x_train_total.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,species0,species1,species2,island0,island1,island2,gender0,gender1
count,232.0,232.0,232.0,232.0,232.0,232.0,232.0,232.0,232.0,232.0,232.0
mean,43.990948,17.226293,200.681034,0.431034,0.224138,0.344828,0.491379,0.37931,0.12931,0.478448,0.521552
std,5.50976,1.964677,14.064231,0.496292,0.417915,0.47634,0.501007,0.486265,0.336269,0.500615,0.500615
min,32.1,13.2,172.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.2,15.7,190.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,44.95,17.35,197.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,48.775,18.725,212.25,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
max,58.0,21.5,231.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [52]:
# Summary statistics of the assembled test features.
x_test_total.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,species0,species1,species2,island0,island1,island2,gender0,gender1
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,43.99703,17.023762,201.623762,0.455446,0.158416,0.386139,0.485149,0.346535,0.168317,0.534653,0.465347
std,5.400305,1.982228,13.951238,0.500495,0.366952,0.489291,0.502272,0.478239,0.376013,0.501285,0.501285
min,34.0,13.1,176.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.6,15.3,190.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,44.0,17.2,198.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,47.7,18.6,214.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
max,59.6,21.2,230.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
# Prepare the model inputs: train/validation split, then feature scaling.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 30% of the training rows for validation. A fixed random_state
# makes the split, and therefore the fitted coefficients and the scores,
# reproducible from run to run.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train_total, y_train, test_size=0.3, random_state=42
)
# Work on explicit copies so the column assignments below modify independent
# frames and do not trigger pandas' SettingWithCopyWarning.
x_tr = x_tr.copy()
x_val = x_val.copy()

# Min-max scale the numeric features. The scaler is fitted on the training
# split only and then applied unchanged to the validation and test features.
scaler = MinMaxScaler()
scaler.fit(x_tr[col_num])
x_tr[col_num] = scaler.transform(x_tr[col_num])
x_val[col_num] = scaler.transform(x_val[col_num])
# Scale from the untouched numeric test frame (x_test_sub) rather than from
# x_test_total itself, so re-running this cell does not scale the test
# features a second time.
x_test_total[col_num] = scaler.transform(x_test_sub[col_num])

In [54]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit returns the estimator itself).
modelLR = LinearRegression().fit(x_tr, y_tr)

# Predict on the validation split to check the model.
y_val_pred = modelLR.predict(x_val)
print(y_val_pred[:3])

[[4745.47759271]
 [3498.84622458]
 [4136.81645986]]


In [55]:
# Intercept of the fitted model.
print(modelLR.intercept_)

# Label the first row of the coefficient array with the feature names and
# print the coefficients in ascending order.
coef = pd.Series(modelLR.coef_[0], x_train_total.columns)
print(coef.sort_values(ascending=True))

[3206.99019149]
species1            -447.653176
species0            -198.902345
gender0             -182.772596
island1              -79.243185
island2              -10.839662
island0               90.082848
gender1              182.772596
bill_depth_mm        438.236474
bill_length_mm       616.886875
species2             646.555521
flipper_length_mm    971.640228
dtype: float64


In [56]:
# Simple linear regression:   f(x) = a*x + b
# Multiple linear regression: f(x) = a*x1 + b*x2 + c
# The fitted model written out term by term. The numbers are from one
# particular run; they depend on how the rows were split into training and
# validation sets.
#   f(x) = -445.915509 * species1 +
#          -230.700045 * species0 +
#          -213.211690 * gender0 +
#          -48.685922 * island1 +
#          0.210371 * island2 +
#          48.475551 * island0 +
#          213.211690 * gender1 +
#          428.730143 * bill_depth_mm +
#          676.615554 * species2 +
#          704.606124 * bill_length_mm +
#          787.978665 * flipper_length_mm + 3284.52867118

In [57]:
# Evaluate the regression model on the validation split.
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_val, y_val_pred)
# RMSE is the square root of the MSE. Computing it directly avoids the
# `squared=False` argument, which scikit-learn deprecated in 1.4 and removed
# in 1.6, so this works on old and new versions alike.
rmse = mse ** 0.5

print('MSE : {0:.3f} , RMSE : {1:.3F} '.format(mse , rmse))

MSE : 96564.075 , RMSE : 310.748 


In [58]:
# Run the model on the test features.
y_pred = modelLR.predict(X=x_test_total)
print(y_pred[:3])

[[4108.49353846]
 [5331.08890535]
 [4124.91217979]]


In [60]:
# Print the first predictions rounded to one decimal place.
np.set_printoptions(precision=1)
print(y_pred.round(decimals=1)[:3])

[[4108.5]
 [5331.1]
 [4124.9]]


In [61]:
# Print the first predictions with the fractional part dropped (rounded down).
np.set_printoptions(precision=1)
print(np.floor(y_pred[:3]))

[[4108.]
 [5331.]
 [4124.]]


In [62]:
# Write the first column of the predictions as a one-column CSV, without the index.
submission = pd.DataFrame(data={'body_mass_g': y_pred[:, 0]})
submission.to_csv('./send/002.csv', index=False)