In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor

In [2]:
test_df = pd.read_csv('./data/test.csv')
train_df = pd.read_csv('./data/train.csv')
submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
train_df = train_df.drop(columns='id') # id값 제거
test_df = test_df.drop(columns='id')
train_df.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


## Label Encoding

In [4]:
def label_encode(df):
    for i in df.columns:
        if df[i].dtypes == "object":
            label_map = {}
            for j,k in enumerate(df[i].unique()):
                label_map[k] = j+1
            df[i] = df[i].replace(to_replace=label_map)

In [5]:
train_df_label = train_df.copy() # label 인코딩
label_encode(train_df_label)
test_df_label = test_df.copy()
label_encode(test_df_label)

train_df_label.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [6]:
train_label_x = train_df_label.drop(columns='Target') # Target 분리
train_label_y = train_df_label['Target']
train_label_x.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31
1,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045
2,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41
3,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337
4,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04


In [7]:
model_label = RandomForestRegressor() # 모델 학습 및 적용
model_label.fit(train_label_x,train_label_y)

pred_label = model_label.predict(test_df_label)
pred_label

array([ 8.83, 12.78,  5.27, ...,  9.48, 11.15, 11.17])

In [8]:
submission['Target'] = pred_label # 제출 파일 저장
submission.to_csv('submit_label.csv', index=False)
submission.head()

Unnamed: 0,id,Target
0,1,8.83
1,2,12.78
2,3,5.27
3,4,11.87
4,5,10.22


## OneHot Encoding

In [9]:
train_df_onehot = pd.get_dummies(train_df, columns = ['Gender']) # onehot 인코딩
test_df_onehot = pd.get_dummies(test_df, columns = ['Gender'])
train_df_onehot.head()

Unnamed: 0,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target,Gender_F,Gender_I,Gender_M
0,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15,0,0,1
1,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8,0,1,0
2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18,0,1,0
3,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13,0,0,1
4,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6,0,1,0


In [10]:
train_onehot_x = train_df_onehot.drop(columns='Target') # Target 분리
train_onehot_y = train_df_onehot['Target']
train_onehot_x.head()

Unnamed: 0,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Gender_F,Gender_I,Gender_M
0,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,0,0,1
1,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,0,1,0
2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,0,1,0
3,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,0,0,1
4,0.31,0.235,0.09,0.127,0.048,0.031,0.04,0,1,0


In [11]:
model_onehot = RandomForestRegressor() # 모델 학습 및 적용
model_onehot.fit(train_onehot_x,train_onehot_y)

pred_onehot = model_onehot.predict(test_df_onehot)
pred_onehot

array([ 8.47, 12.87,  5.22, ...,  9.58, 10.07, 11.08])

In [12]:
submission['Target'] = pred_onehot # 제출 파일 저장
submission.to_csv('submit_onehot.csv', index=False)
submission.head()

Unnamed: 0,id,Target
0,1,8.47
1,2,12.87
2,3,5.22
3,4,11.79
4,5,10.42
