In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

import warnings

# SettingWithCopyWarning has lived in different modules across pandas releases
# (pandas.errors in newer versions, pandas.core.common in older ones), so try
# both locations and skip the filter entirely if the class is not available.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

# Silence the chained-assignment warning for the column assignments made below.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

In [3]:
# Load the training and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# 1. 데이터 확인

In [4]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [5]:
# Preview the first rows of the test set (same columns, without Target).
test.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,F,0.595,0.47,0.155,1.121,0.4515,0.178,0.155
1,2,M,0.58,0.45,0.15,0.927,0.276,0.1815,0.36
2,3,I,0.26,0.205,0.07,0.097,0.0415,0.019,0.0305
3,4,M,0.59,0.46,0.13,1.102,0.455,0.2055,0.33
4,5,F,0.595,0.465,0.14,1.113,0.5175,0.244,0.305


- id : 샘플 아이디
- Gender : 전복 성별
- Lenght : 전복 길이
- Diameter : 전복 지름
- Height : 전복 키 
- Whole Weight : 전복 전체 무게
- Shucked Weight : 껍질을 제외한 무게
- Viscra Weight : 내장 무게
- Shell Weight : 껍질 무게
- Target : 전복 나이

In [6]:
# The raw files spell the column 'Lenght'; correct the name in both splits.
for frame in (train, test):
    frame.rename(columns={'Lenght': 'Length'}, inplace=True)
train.head()

Unnamed: 0,id,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [7]:
# Report (rows, columns) for each split.
for split_name, frame in (('train', train), ('test', test)):
    print(f'{split_name} shape:', frame.shape)

train shape: (1253, 10)
test shape: (2924, 9)


In [8]:
# Summarise column dtypes and non-null counts of the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1253 non-null   int64  
 1   Gender          1253 non-null   object 
 2   Length          1253 non-null   float64
 3   Diameter        1253 non-null   float64
 4   Height          1253 non-null   float64
 5   Whole Weight    1253 non-null   float64
 6   Shucked Weight  1253 non-null   float64
 7   Viscra Weight   1253 non-null   float64
 8   Shell Weight    1253 non-null   float64
 9   Target          1253 non-null   int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 98.0+ KB


In [9]:
# Distinct values present in the Gender column
train['Gender'].unique()

array(['M', 'I', 'F'], dtype=object)

- M: 수컷
- I: 어린 개체 (Infant)
- F: 암컷

# 2. 데이터 전처리
## 2-1. 이상치 제거

In [10]:
# Show rows where the whole weight is smaller than shucked weight + shell weight
parts_weight = train['Shucked Weight'] + train['Shell Weight']
weight_mismatch = train['Whole Weight'].lt(parts_weight)
train.loc[weight_mismatch]

Unnamed: 0,id,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
47,48,I,0.38,0.275,0.095,0.1375,0.086,0.0585,0.0605,7
382,383,I,0.455,0.33,0.1,0.372,0.358,0.0775,0.11,8
435,436,I,0.475,0.365,0.1,0.1315,0.2025,0.0875,0.123,7
847,848,I,0.23,0.165,0.06,0.0515,0.019,0.0145,0.036,4
1078,1079,I,0.275,0.205,0.07,0.1055,0.495,0.019,0.0315,5


In [11]:
# Remove the rows whose whole weight is below shucked weight + shell weight.
# The condition is recomputed here instead of dropping hard-coded row labels,
# so the cell can be re-run safely (dropping labels that are already gone raises
# KeyError) and keeps working if the input data changes.
implausible_whole_weight = (
    train['Whole Weight'] < train['Shucked Weight'] + train['Shell Weight']
)
# .copy() yields an independent frame for the column assignments made later.
train = train.loc[~implausible_whole_weight].copy()

In [12]:
# Show rows where the shucked weight is smaller than the viscera weight
shucked_below_viscera = train['Shucked Weight'].lt(train['Viscra Weight'])
train.loc[shucked_below_viscera]

Unnamed: 0,id,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
465,466,M,0.415,0.315,0.125,0.388,0.068,0.09,0.125,12


In [13]:
# Remove the rows whose shucked weight is below the viscera weight.
# The condition is recomputed here instead of dropping a hard-coded row label,
# so the cell can be re-run safely (dropping a label that is already gone raises
# KeyError) and keeps working if the input data changes.
implausible_shucked_weight = train['Shucked Weight'] < train['Viscra Weight']
# .copy() yields an independent frame for the column assignments made later.
train = train.loc[~implausible_shucked_weight].copy()

In [14]:
# Show rows where the diameter exceeds the length
length_below_diameter = train['Length'].lt(train['Diameter'])
train.loc[length_below_diameter]

Unnamed: 0,id,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target


## 2-2. 새로운 features

In [15]:
# 'Water' = whole weight - shucked weight - shell weight; this notebook treats
# the remainder as the amount of water held by the abalone.
train['Water'] = (
    train['Whole Weight']
    .sub(train['Shucked Weight'])
    .sub(train['Shell Weight'])
)

In [16]:
# Subtract the 'Water' column from the whole weight so that the (randomly
# varying) amount of retained water no longer influences the feature.
train['Whole Weight'] = train['Whole Weight'].sub(train['Water'])

In [17]:
# The helper 'Water' column is no longer needed; remove it.
train = train.drop(columns='Water')

In [18]:
# Apply the same water adjustment to the test set: compute the helper column,
# subtract it from the whole weight, then discard it.
test['Water'] = (
    test['Whole Weight']
    .sub(test['Shucked Weight'])
    .sub(test['Shell Weight'])
)
test['Whole Weight'] = test['Whole Weight'].sub(test['Water'])
test = test.drop(columns='Water')

# 3. 모델링
이 부분은 전혀 감이 안 와서 '지우개'님의 코드를 필사했습니다.

In [19]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as a column.
train = train.reset_index(drop=True)

In [20]:
# Display the training frame in its current state.
train

Unnamed: 0,id,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.470,0.115,0.7025,0.3925,0.2910,0.3100,15
1,2,I,0.430,0.315,0.095,0.2795,0.1750,0.0800,0.1045,8
2,3,I,0.580,0.490,0.195,0.9405,0.5305,0.2540,0.4100,18
3,4,M,0.535,0.405,0.175,0.8850,0.5480,0.3265,0.3370,13
4,5,I,0.310,0.235,0.090,0.0880,0.0480,0.0310,0.0400,6
...,...,...,...,...,...,...,...,...,...,...
1242,1249,I,0.190,0.145,0.040,0.0315,0.0165,0.0065,0.0150,4
1243,1250,I,0.395,0.310,0.085,0.2465,0.1530,0.0505,0.0935,7
1244,1251,F,0.525,0.410,0.115,0.5960,0.4160,0.1630,0.1800,7
1245,1252,F,0.445,0.335,0.110,0.3220,0.2025,0.1095,0.1195,6


In [21]:
from sklearn.preprocessing import LabelEncoder

# Learn the Gender -> integer mapping on the training set, then apply the same
# fitted mapping to both splits so the codes agree.
encoder = LabelEncoder().fit(train['Gender'])
train['Gender'] = encoder.transform(train['Gender'])
test['Gender'] = encoder.transform(test['Gender'])

# X: every column except the label; y: the label column.
X = train.drop(columns='Target')
y = train['Target']

In [22]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import lightgbm

In [None]:
# 5-fold cross-validated LightGBM regression.
# For every fold: fit on the training part, early-stop on the validation part,
# record the validation MAE, and store the predictions for the test set.

# Columns actually fed to the model. 'id' is only a row identifier, so it is
# excluded. Keeping it in X also made the former `categorical_feature=[0]`
# point at 'id' (column 0 of X) instead of at 'Gender'.
feature_cols = [col for col in X.columns if col != 'id']
X_model = X[feature_cols]
test_model = test[feature_cols]

# Folds are stratified on the Gender column (not on the target).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

pred_list = []  # test-set predictions, one array per fold
mae_list = []   # validation MAE, one value per fold

for fold, (train_index, val_index) in enumerate(skf.split(X_model, X_model['Gender'])):

    print(f'**********{fold+1}th fold start**********')
    # split() yields positional indices, so rows are selected with .iloc;
    # .loc would only be correct while the index happens to be 0..n-1.
    x_train, x_val = X_model.iloc[train_index], X_model.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    lgb = lightgbm.LGBMRegressor(boosting_type='gbdt', n_estimators=1000,
                                 random_state=2022, learning_rate=0.05)

    # 'Gender' is declared categorical by name so the setting cannot drift when
    # the column order changes. Early stopping and silent logging are passed as
    # callbacks; the `early_stopping_rounds=` / `verbose=` arguments of fit()
    # are not accepted by current LightGBM releases.
    lgb.fit(x_train, y_train, eval_set=[(x_val, y_val)], eval_metric='l1',
            categorical_feature=['Gender'],
            callbacks=[lightgbm.early_stopping(stopping_rounds=100, verbose=False),
                       lightgbm.log_evaluation(period=0)])

    pred = lgb.predict(x_val)
    # scikit-learn's signature is (y_true, y_pred)
    result = mean_absolute_error(y_val, pred)
    mae_list.append(result)

    pred_test = lgb.predict(test_model)
    pred_list.append(pred_test)

    print(f'mae: {result:.4f}', end='\n\n')

print(f'mean mae {np.mean(mae_list):.4f}')