In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor

In [2]:
test_df = pd.read_csv('./data/test.csv')
train_df = pd.read_csv('./data/train.csv')
submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
def check_null(df):  # 결측치 확인
    total_cnt = 0 
    for i in df.columns:
        cnt = sum(df[i].isnull())
        total_cnt += cnt
        if cnt:
            print(f'{i}열에 결측치{cnt}개')
        else:
            print(f'{i}열에 결측치 없음')
    if total_cnt == 0:
        print('모든열에 결측치 없음')

In [4]:
check_null(test_df)

id열에 결측치 없음
Gender열에 결측치 없음
Lenght열에 결측치 없음
Diameter열에 결측치 없음
Height열에 결측치 없음
Whole Weight열에 결측치 없음
Shucked Weight열에 결측치 없음
Viscra Weight열에 결측치 없음
Shell Weight열에 결측치 없음
모든열에 결측치 없음


In [5]:
check_null(train_df)

id열에 결측치 없음
Gender열에 결측치 없음
Lenght열에 결측치 없음
Diameter열에 결측치 없음
Height열에 결측치 없음
Whole Weight열에 결측치 없음
Shucked Weight열에 결측치 없음
Viscra Weight열에 결측치 없음
Shell Weight열에 결측치 없음
Target열에 결측치 없음
모든열에 결측치 없음


In [6]:
train_df = train_df.drop(columns='id') # id값 제거
test_df = test_df.drop(columns='id')
train_df.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


## Label Encoding

In [7]:
def label_encode(df):
    for i in df.columns:
        if df[i].dtypes == "object":
            label_map = {}
            for j,k in enumerate(df[i].unique()):
                label_map[k] = j+1
            df[i] = df[i].replace(to_replace=label_map)

In [8]:
train_df_label = train_df.copy() # label 인코딩
label_encode(train_df_label)
test_df_label = test_df.copy()
label_encode(test_df_label)

train_df_label.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [9]:
train_label_x = train_df_label.drop(columns='Target') # Target 분리
train_label_y = train_df_label['Target']
train_label_x.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31
1,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045
2,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41
3,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337
4,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04


In [10]:
model_label = RandomForestRegressor() # 모델 학습 및 적용
model_label.fit(train_label_x,train_label_y)

pred_label = model_label.predict(test_df_label)
pred_label

array([ 8.54, 13.13,  5.13, ...,  9.62, 10.68, 10.76])

In [11]:
submission['Target'] = pred_label # 제출 파일 저장
submission.to_csv('submit_label.csv', index=False)
submission.head()

Unnamed: 0,id,Target
0,1,8.54
1,2,13.13
2,3,5.13
3,4,11.09
4,5,10.07


## OneHot Encoding

## Label Encoding

In [7]:
def label_encode(df):
    for i in df.columns:
        if df[i].dtypes == "object":
            label_map = {}
            for j,k in enumerate(df[i].unique()):
                label_map[k] = j+1
            df[i] = df[i].replace(to_replace=label_map)

In [8]:
train_df_label = train_df.copy() # label 인코딩
label_encode(train_df_label)
test_df_label = test_df.copy()
label_encode(test_df_label)

train_df_label.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [9]:
train_label_x = train_df_label.drop(columns='Target') # Target 분리
train_label_y = train_df_label['Target']
train_label_x.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight
0,1,0.605,0.47,0.115,1.114,0.3925,0.291,0.31
1,2,0.43,0.315,0.095,0.378,0.175,0.08,0.1045
2,2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41
3,1,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337
4,2,0.31,0.235,0.09,0.127,0.048,0.031,0.04


In [10]:
model_label = RandomForestRegressor() # 모델 학습 및 적용
model_label.fit(train_label_x,train_label_y)

pred_label = model_label.predict(test_df_label)
pred_label

array([ 8.54, 13.13,  5.13, ...,  9.62, 10.68, 10.76])

In [11]:
submission['Target'] = pred_label # 제출 파일 저장
submission.to_csv('submit_label.csv', index=False)
submission.head()

Unnamed: 0,id,Target
0,1,8.54
1,2,13.13
2,3,5.13
3,4,11.09
4,5,10.07


## OneHot Encoding

In [12]:
train_df_onehot = pd.get_dummies(train_df, columns = ['Gender']) # onehot 인코딩
test_df_onehot = pd.get_dummies(test_df, columns = ['Gender'])
train_df_onehot.head()

Unnamed: 0,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target,Gender_F,Gender_I,Gender_M
0,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15,0,0,1
1,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8,0,1,0
2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18,0,1,0
3,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13,0,0,1
4,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6,0,1,0


In [13]:
train_onehot_x = train_df_onehot.drop(columns='Target') # Target 분리
train_onehot_y = train_df_onehot['Target']
train_onehot_x.head()

Unnamed: 0,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Gender_F,Gender_I,Gender_M
0,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,0,0,1
1,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,0,1,0
2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,0,1,0
3,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,0,0,1
4,0.31,0.235,0.09,0.127,0.048,0.031,0.04,0,1,0


In [14]:
model_onehot = RandomForestRegressor() # 모델 학습 및 적용
model_onehot.fit(train_onehot_x,train_onehot_y)

pred_onehot = model_onehot.predict(test_df_onehot)
pred_onehot

array([ 8.42, 13.04,  5.24, ...,  9.64,  9.86, 10.74])

In [15]:
submission['Target'] = pred_onehot # 제출 파일 저장
submission.to_csv('submit_onehot.csv', index=False)
submission.head()

Unnamed: 0,id,Target
0,1,8.42
1,2,13.04
2,3,5.24
3,4,11.88
4,5,10.23


## Train 데이터로  nmae 분석하기

In [16]:
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [36]:
# Label Data

# Train Data로 train, test 분리
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(train_label_x, train_label_y, test_size=0.2, random_state=123)

new_model_label = RandomForestRegressor() # 모델 학습 및 적용
new_model_label.fit(new_x_train,new_y_train)

pred_label = new_model_label.predict(new_x_test)

print(NMAE(new_y_test,pred_label))
print('pred값 반올림:',NMAE(new_y_test,np.rint(pred_label)))

0.16244624746450304
pred값 반올림: 0.1594320486815416


In [37]:
# Onehot Data

# Onehot Train Data로 train, test 분리
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(train_onehot_x, train_onehot_y, test_size=0.2, random_state=123)

new_model_onehot = RandomForestRegressor() # 모델 학습 및 적용
new_model_onehot.fit(new_x_train,new_y_train)

pred_onehot = new_model_onehot.predict(new_x_test)

print(NMAE(new_y_test,pred_onehot))
print('pred값 반올림:',NMAE(new_y_test,np.rint(pred_onehot)))

0.16150507099391487
pred값 반올림: 0.15578093306288032


In [12]:
train_df_onehot = pd.get_dummies(train_df, columns = ['Gender']) # onehot 인코딩
test_df_onehot = pd.get_dummies(test_df, columns = ['Gender'])
train_df_onehot.head()

Unnamed: 0,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target,Gender_F,Gender_I,Gender_M
0,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15,0,0,1
1,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8,0,1,0
2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18,0,1,0
3,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13,0,0,1
4,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6,0,1,0


In [13]:
train_onehot_x = train_df_onehot.drop(columns='Target') # Target 분리
train_onehot_y = train_df_onehot['Target']
train_onehot_x.head()

Unnamed: 0,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Gender_F,Gender_I,Gender_M
0,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,0,0,1
1,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,0,1,0
2,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,0,1,0
3,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,0,0,1
4,0.31,0.235,0.09,0.127,0.048,0.031,0.04,0,1,0


In [14]:
model_onehot = RandomForestRegressor() # 모델 학습 및 적용
model_onehot.fit(train_onehot_x,train_onehot_y)

pred_onehot = model_onehot.predict(test_df_onehot)
pred_onehot

array([ 8.42, 13.04,  5.24, ...,  9.64,  9.86, 10.74])

In [15]:
submission['Target'] = pred_onehot # 제출 파일 저장
submission.to_csv('submit_onehot.csv', index=False)
submission.head()

Unnamed: 0,id,Target
0,1,8.42
1,2,13.04
2,3,5.24
3,4,11.88
4,5,10.23


## Train 데이터로  nmae 분석하기

In [16]:
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [36]:
# Label Data

# Train Data로 train, test 분리
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(train_label_x, train_label_y, test_size=0.2, random_state=123)

new_model_label = RandomForestRegressor() # 모델 학습 및 적용
new_model_label.fit(new_x_train,new_y_train)

pred_label = new_model_label.predict(new_x_test)

print(NMAE(new_y_test,pred_label))
print('pred값 반올림:',NMAE(new_y_test,np.rint(pred_label)))

0.16244624746450304
pred값 반올림: 0.1594320486815416


In [37]:
# Onehot Data

# Onehot Train Data로 train, test 분리
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(train_onehot_x, train_onehot_y, test_size=0.2, random_state=123)

new_model_onehot = RandomForestRegressor() # 모델 학습 및 적용
new_model_onehot.fit(new_x_train,new_y_train)

pred_onehot = new_model_onehot.predict(new_x_test)

print(NMAE(new_y_test,pred_onehot))
print('pred값 반올림:',NMAE(new_y_test,np.rint(pred_onehot)))

0.16150507099391487
pred값 반올림: 0.15578093306288032


In [None]:
# from sklearn.model_selection import train_test_split

# new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(train_X_resampled, train_y_resampled, test_size=0.2, random_state=123)
# new_model_label = RandomForestRegressor()
# new_model_label.fit(new_x_train, new_y_train)

# pred_label = new_model_label.predict(new_x_test)

# print(NMAE(new_y_test, pred_label))