# 1. Import

In [1]:
# 연산 처리를 위한 패키지
import numpy as np
import pandas as pd
from pandas import DataFrame

# 데이터 분석을 위한 패키지
import statsmodels.api as sm

# 시각화를 위한 패키지
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# 필요모듈 import
import os
import openpyxl
from datetime import datetime
from tqdm import tqdm

# 그래프를 실제로 그리기 위한 설정
%matplotlib inline

# 머신러닝 패키지
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error
import random
import optuna
from optuna.samplers import TPESampler

# 폰트 처리
# plt.rc('font', family='NanumGothic')        # for windows
plt.rc('font', family='AppleGothic') # For MacOS

import warnings
warnings.filterwarnings('ignore')

# 2.1. 원본 데이터 로딩

In [2]:
'''
path = "data"
test_path = "/test.csv"
train_path = "/train.csv"
sample_submission_path = "/sample_submission.csv"

test = pd.read_csv(path + test_path)
train = pd.read_csv(path + train_path)

# 격자공간고유정보 다운로드 https://www.bigdata-region.kr/#/dataset/0ad3c882-f7ee-4faf-970d-00c53cb65a84

# 격자공간고유번호 파일 병합 과정

import pandas as pd
import glob
import os

input_file = r'TC_NU_SPG_50_METER'
output_file = r'TC_NU_SPG_50_METER/geo_data.csv'

allFile_list = glob.glob(os.path.join(input_file, 'TC_*')) 

print(allFile_list)

allData = []

for file in allFile_list:
    df = pd.read_csv(file) # for구문으로 csv파일들을 읽어 들인다
    allData.append(df) # 빈 리스트에 읽어 들인 내용을 추가한다

dataCombine = pd.concat(allData, axis=0, ignore_index=True) # concat함수를 이용해서 리스트의 내용을 병합
# axis=0은 수직으로 병합함. axis=1은 수평. ignore_index=True는 인데스 값이 기존 순서를 무시하고 순서대로 정렬되도록 한다.
dataCombine.to_csv(output_file, index=False) # to_csv함수로 저장한다. 인덱스를 빼려면 False로 설정

geo_path = r"TC_NU_SPG_50_METER/geo_data.csv"

geo = pd.read_csv(geo_path)
'''

['TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_29.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_28.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_11.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_48.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_43.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_42.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_41.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_45.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_50.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_44.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_46.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_47.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_36.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_26.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_27.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_31.csv', 'TC_NU_SPG_50_METER/TC_NU_SPG_50_METER_30.csv']


# 2.2. 데이터정보 추가, 파일 생성
#### final_test.csv와 final_train.csv 파일을 만드는 과정입니다. 시간이 오래걸리기 때문에 이미 파일이 있다면 생략하는게 좋습니다.

In [3]:
'''
# 송하인_격자공간고유번호, 수하인_격자공간고유번호를 geo와 매칭해서 격자공간명, 시군구코드, 시군구명 추가

test = pd.merge(test, geo, left_on=['송하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')
test.rename(columns= {'격자공간명':'송하인_격자공간명', '시군구코드':'송하인_시군구코드', '시군구명':'송하인_시군구명'}, inplace=True)
test = pd.merge(test, geo, left_on=['수하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')
test.rename(columns= {'격자공간명':'수하인_격자공간명', '시군구코드':'수하인_시군구코드', '시군구명':'수하인_시군구명'}, inplace=True)

train = pd.merge(train, geo, left_on=['송하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')
train.rename(columns= {'격자공간명':'송하인_격자공간명', '시군구코드':'송하인_시군구코드', '시군구명':'송하인_시군구명'}, inplace=True)
train = pd.merge(train, geo, left_on=['수하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')
train.rename(columns= {'격자공간명':'수하인_격자공간명', '시군구코드':'수하인_시군구코드', '시군구명':'수하인_시군구명'}, inplace=True)


# 이상하게 들어간 내용들 제거

test.drop(['격자공간고유번호_x', '격자공간고유번호_y'], axis = 1, inplace = True)
test = test[['index', '송하인_격자공간고유번호', '송하인_격자공간명',
       '송하인_시군구코드', '송하인_시군구명', '수하인_격자공간고유번호', '수하인_격자공간명', '수하인_시군구코드', '수하인_시군구명', '물품_카테고리']]

train.drop(['격자공간고유번호_x', '격자공간고유번호_y'], axis = 1, inplace = True)
train = train[['index', '송하인_격자공간고유번호', '송하인_격자공간명',
       '송하인_시군구코드', '송하인_시군구명', '수하인_격자공간고유번호', '수하인_격자공간명', '수하인_시군구코드', '수하인_시군구명', '물품_카테고리', '운송장_건수']]

test['송하인_시군구코드'] = test['송하인_시군구코드'].apply(str)
test['수하인_시군구코드'] = test['수하인_시군구코드'].apply(str)
train['송하인_시군구코드'] = train['송하인_시군구코드'].apply(str)
train['수하인_시군구코드'] = train['수하인_시군구코드'].apply(str)

test['송하인_시도코드'] = test['송하인_시군구코드'].str.slice(0,2)
test['수하인_시도코드'] = test['수하인_시군구코드'].str.slice(0,2)
train['송하인_시도코드'] = train['송하인_시군구코드'].str.slice(0,2)
train['수하인_시도코드'] = train['수하인_시군구코드'].str.slice(0,2)

do_dictionary = {
  '11' : '서울특별시',
  '26' : '부산광역시',
  '27' : '대구광역시',
  '28' : '인천광역시',
  '29' : '광주광역시',
  '30' : '대전광역시',
  '31' : '울산광역시',
  '36' : '세종특별자치시',
  '41' : '경기도',
  '42' : '강원도',
  '43' : '충청북도',
  '44' : '충청남도', 
  '45' : '전라북도',
  '46' : '전라남도',
  '47' : '경상북도', 
  '48' : '경상남도',
  '50' : '제주특별자치도'
  }                               # 행정표준코드관리시스템 https://www.code.go.kr/stdcode/regCodeL.do

train['송하인_시도명'] = train['송하인_시도코드'].map(do_dictionary)
train['수하인_시도명'] = train['수하인_시도코드'].map(do_dictionary)
test['송하인_시도명'] = test['송하인_시도코드'].map(do_dictionary)
test['수하인_시도명'] = test['수하인_시도코드'].map(do_dictionary)

test = test[['index', '송하인_격자공간고유번호', '송하인_격자공간명',
       '송하인_시도코드', '송하인_시도명','송하인_시군구코드', '송하인_시군구명', '수하인_격자공간고유번호', '수하인_격자공간명', '수하인_시도코드', '수하인_시도명', '수하인_시군구코드', '수하인_시군구명', '물품_카테고리']]

train = train[['index', '송하인_격자공간고유번호', '송하인_격자공간명',
       '송하인_시도코드', '송하인_시도명','송하인_시군구코드', '송하인_시군구명', '수하인_격자공간고유번호', '수하인_격자공간명', '수하인_시도코드', '수하인_시도명', '수하인_시군구코드', '수하인_시군구명', '물품_카테고리', '운송장_건수']]

test.to_csv('data/final_test.csv')          # 최종본 저장
train.to_csv('data/final_train.csv')        
'''

"\n# 송하인_격자공간고유번호, 수하인_격자공간고유번호를 geo와 매칭해서 격자공간명, 시군구코드, 시군구명 추가\n\ntest = pd.merge(test, geo, left_on=['송하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')\ntest.rename(columns= {'격자공간명':'송하인_격자공간명', '시군구코드':'송하인_시군구코드', '시군구명':'송하인_시군구명'}, inplace=True)\ntest = pd.merge(test, geo, left_on=['수하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')\ntest.rename(columns= {'격자공간명':'수하인_격자공간명', '시군구코드':'수하인_시군구코드', '시군구명':'수하인_시군구명'}, inplace=True)\n\ntrain = pd.merge(train, geo, left_on=['송하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')\ntrain.rename(columns= {'격자공간명':'송하인_격자공간명', '시군구코드':'송하인_시군구코드', '시군구명':'송하인_시군구명'}, inplace=True)\ntrain = pd.merge(train, geo, left_on=['수하인_격자공간고유번호'], right_on = ['격자공간고유번호'], how='left')\ntrain.rename(columns= {'격자공간명':'수하인_격자공간명', '시군구코드':'수하인_시군구코드', '시군구명':'수하인_시군구명'}, inplace=True)\n\n\n# 이상하게 들어간 내용들 제거\n\ntest.drop(['격자공간고유번호_x', '격자공간고유번호_y'], axis = 1, inplace = True)\ntest = test[['index', '송하인_격자공간고유번호', '송하인_격자공간명',\n       '송하인_시군구코드', 

# 3. EDA

In [4]:
# 뉴 데이터 로딩
test = pd.read_csv('data/final_test.csv', index_col = 0)             # unnamed_0 이라는 index가 추가되어 나오지 않게 'index_col = 0' 을 추가했습니다.
train = pd.read_csv('data/final_train.csv', index_col = 0)

submission = pd.read_csv('data/sample_submission.csv')

train2 = train.copy()
train.head()

Unnamed: 0,index,송하인_격자공간고유번호,송하인_격자공간명,송하인_시도코드,송하인_시도명,송하인_시군구코드,송하인_시군구명,수하인_격자공간고유번호,수하인_격자공간명,수하인_시도코드,수하인_시도명,수하인_시군구코드,수하인_시군구명,물품_카테고리,운송장_건수
0,0,5011000595017300,다나1395,50,제주특별자치도,50110,제주시,2871000192069300,다사1072,28,인천광역시,28710,강화군,음반,3
1,1,4148000690043300,다사2868,41,경기도,41480,파주시,5011000264024400,다다0901,50,제주특별자치도,50110,제주시,문화컨텐츠,3
2,2,5011000078068400,다다3007,50,제주특별자치도,50110,제주시,1120000007005400,다사5950,11,서울특별시,11200,성동구,농산물,3
3,3,4127100048006400,다사4521,41,경기도,41271,안산시상록구,5011000587019400,다나0595,50,제주특별자치도,50110,제주시,기타식품,7
4,4,5011000078068400,다다3007,50,제주특별자치도,50110,제주시,2823700010076300,다사3145,28,인천광역시,28237,부평구,농산물,3


In [5]:
def setting_data(train, test, one, two, three, four, five, six):
  test['송하인_격자공간고유번호'] = test['송하인_격자공간고유번호'].astype(str)
  test['수하인_격자공간고유번호'] = test['수하인_격자공간고유번호'].astype(str)
  train['송하인_격자공간고유번호'] = train['송하인_격자공간고유번호'].astype(str)
  train['수하인_격자공간고유번호'] = train['수하인_격자공간고유번호'].astype(str)

  train['송하인_코드1'] = train['송하인_격자공간고유번호'].str.slice(int(one),int(two))
  train['송하인_코드2'] = train['송하인_격자공간고유번호'].str.slice(int(two),int(three))
  train['송하인_코드3'] = train['송하인_격자공간고유번호'].str.slice(int(three),int(four))
  train['송하인_코드4'] = train['송하인_격자공간고유번호'].str.slice(int(four),int(five))
  train['송하인_코드5'] = train['송하인_격자공간고유번호'].str.slice(int(five),int(six))

  train['수하인_코드1'] = train['수하인_격자공간고유번호'].str.slice(int(one),int(two))
  train['수하인_코드2'] = train['수하인_격자공간고유번호'].str.slice(int(two),int(three))
  train['수하인_코드3'] = train['수하인_격자공간고유번호'].str.slice(int(three),int(four))
  train['수하인_코드4'] = train['수하인_격자공간고유번호'].str.slice(int(four),int(five))
  train['수하인_코드5'] = train['수하인_격자공간고유번호'].str.slice(int(five),int(six))

  test['송하인_코드1'] = test['송하인_격자공간고유번호'].str.slice(int(one),int(two))
  test['송하인_코드2'] = test['송하인_격자공간고유번호'].str.slice(int(two),int(three))
  test['송하인_코드3'] = test['송하인_격자공간고유번호'].str.slice(int(three),int(four))
  test['송하인_코드4'] = test['송하인_격자공간고유번호'].str.slice(int(four),int(five))
  test['송하인_코드5'] = test['송하인_격자공간고유번호'].str.slice(int(five),int(six))

  test['수하인_코드1'] = test['수하인_격자공간고유번호'].str.slice(int(one),int(two))
  test['수하인_코드2'] = test['수하인_격자공간고유번호'].str.slice(int(two),int(three))
  test['수하인_코드3'] = test['수하인_격자공간고유번호'].str.slice(int(three),int(four))
  test['수하인_코드4'] = test['수하인_격자공간고유번호'].str.slice(int(four),int(five))
  test['수하인_코드5'] = test['수하인_격자공간고유번호'].str.slice(int(five),int(six))

  train = train[['index', 
                  '송하인_코드1','송하인_시도명',
                  '송하인_코드2', '송하인_시군구명',
                  '송하인_코드3', '송하인_코드4', '송하인_코드5',
                  
                  '수하인_코드1', '수하인_시도명',
                  '수하인_코드2', '수하인_시군구명',
                  '수하인_코드3', '수하인_코드4', '수하인_코드5',
                  '물품_카테고리', 
                  '운송장_건수']]

  test = test[['index',
                '송하인_코드1','송하인_시도명',
                '송하인_코드2', '송하인_시군구명', 
                '송하인_코드3', '송하인_코드4', '송하인_코드5',

                '수하인_코드1', '수하인_시도명',
                '수하인_코드2', '수하인_시군구명',
                '수하인_코드3', '수하인_코드4', '수하인_코드5',
                '물품_카테고리',
                ]]
                
  return train, test


In [6]:
def confirm_nun(train):
  print('송하인 코드 1, 2, 3, 4, 5는 각각 ~ 종류의 코드가 존재합니다.')
  print('코드_1 :', train['송하인_코드1'].nunique())
  print('코드_2 :', train['송하인_코드2'].nunique())
  print('코드_3 :', train['송하인_코드3'].nunique())
  print('코드_4 :', train['송하인_코드4'].nunique())
  print('코드_5 :', train['송하인_코드5'].nunique())

  print("*"*50)

  print('수하인 코드 1, 2, 3, 4, 5는 각각 ~ 종류의 코드가 존재합니다.')
  print('코드_1 :', train['수하인_코드1'].nunique())
  print('코드_2 :', train['수하인_코드2'].nunique())
  print('코드_3 :', train['수하인_코드3'].nunique())
  print('코드_4 :', train['수하인_코드4'].nunique())
  print('코드_5 :', train['수하인_코드5'].nunique())

  print('*'*50)
  print('수하인_코드4의 고유값은')
  print(train['수하인_코드4'].unique())


In [7]:
train1, test1 = setting_data(train, test, '0', '2', '5', '8', '10', '16')       # 0, 2, 5, 9, 10, 16번째 위치에서 코드를 잘라서 저장합니다. 이걸로 결정
train2, test2 = setting_data(train, test, '0', '2', '5', '10', '11', '16')      # 0, 2, 5, 10, 11, 16번째 위치에서 코드를 잘라서 저장합니다.

### train3은 16자리중 15,16번째 숫자들을 쳐내는 과정입니다.
### 전부 00 이라는 값을 가지기 때문에 학습할 때 제거해도 상관없다고 생각합니다.

In [8]:
train1

Unnamed: 0,index,송하인_코드1,송하인_시도명,송하인_코드2,송하인_시군구명,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_시도명,수하인_코드2,수하인_시군구명,수하인_코드3,수하인_코드4,수하인_코드5,물품_카테고리,운송장_건수
0,0,50,제주특별자치도,110,제주시,005,95,017300,28,인천광역시,710,강화군,001,92,069300,음반,3
1,1,41,경기도,480,파주시,006,90,043300,50,제주특별자치도,110,제주시,002,64,024400,문화컨텐츠,3
2,2,50,제주특별자치도,110,제주시,000,78,068400,11,서울특별시,200,성동구,000,07,005400,농산물,3
3,3,41,경기도,271,안산시상록구,000,48,006400,50,제주특별자치도,110,제주시,005,87,019400,기타식품,7
4,4,50,제주특별자치도,110,제주시,000,78,068400,28,인천광역시,237,부평구,000,10,076300,농산물,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31679,31679,44,충청남도,710,금산군,002,90,087200,50,제주특별자치도,110,제주시,002,13,073200,스포츠잡화,3
31680,31680,11,서울특별시,290,성북구,000,14,045300,50,제주특별자치도,110,제주시,003,19,087100,스마트디바이스,4
31681,31681,11,서울특별시,290,성북구,000,14,045300,50,제주특별자치도,110,제주시,002,63,065200,스마트디바이스,6
31682,31682,41,경기도,273,안산시단원구,000,65,073100,50,제주특별자치도,110,제주시,002,64,061200,지갑,7


In [9]:
confirm_nun(train1)

송하인 코드 1, 2, 3, 4, 5는 각각 ~ 종류의 코드가 존재합니다.
코드_1 : 17
코드_2 : 99
코드_3 : 14
코드_4 : 100
코드_5 : 400
**************************************************
수하인 코드 1, 2, 3, 4, 5는 각각 ~ 종류의 코드가 존재합니다.
코드_1 : 17
코드_2 : 101
코드_3 : 18
코드_4 : 100
코드_5 : 400
**************************************************
수하인_코드4의 고유값은
['92' '64' '07' '87' '10' '14' '69' '17' '23' '26' '16' '20' '63' '02'
 '27' '21' '96' '09' '48' '31' '03' '18' '06' '39' '24' '55' '04' '33'
 '35' '72' '56' '79' '30' '12' '59' '99' '41' '15' '62' '73' '32' '65'
 '70' '82' '74' '94' '46' '19' '78' '29' '52' '25' '85' '42' '13' '01'
 '08' '28' '05' '90' '53' '60' '66' '11' '77' '88' '00' '67' '98' '54'
 '44' '37' '36' '38' '81' '84' '58' '34' '68' '40' '57' '71' '83' '45'
 '49' '76' '47' '51' '43' '93' '61' '22' '86' '95' '50' '75' '97' '89'
 '91' '80']


In [10]:
print('*'*50)
print('수하인_코드2의 고유값은')
print(test1['수하인_코드2'].unique())

**************************************************
수하인_코드2의 고유값은
['110' '260' '130' '710' '480' '350' '465' '200' '210' '185' '173' '170'
 '570' '560' '440' '810' '470' '113' '237' '410' '390' '650' '590' '800'
 '500' '380' '900' '790' '360' '140' '430' '320' '680' '463' '111' '545'
 '770' '290' '780' '610' '150' '190' '285' '820' '330' '305' '230' '530'
 '135' '197' '310' '287' '270' '370' '131' '281' '450' '133' '750' '280'
 '740' '117' '180' '880' '220' '620' '199' '125' '830' '245' '250' '760'
 '129' '273' '155' '121' '115' '271' '850' '840' '550' '215' '195' '127'
 '123' '730' '461' '670' '630' '171' '870' '720' '890' '910' '240' '825'
 '860' '930' '745' '940' '920']


In [11]:
'''
종로구 : 110
중구 : 140
용산구 : 170
성동구 : 200
광진구 : 215
동대문구 : 230
중랑구 : 260
성북구 : 290
'''
train1[(train1['송하인_시군구명']=='강남구')&(train1['송하인_시도명']=='서울특별시')]

Unnamed: 0,index,송하인_코드1,송하인_시도명,송하인_코드2,송하인_시군구명,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_시도명,수하인_코드2,수하인_시군구명,수하인_코드3,수하인_코드4,수하인_코드5,물품_카테고리,운송장_건수
210,210,11,서울특별시,680,강남구,000,07,002200,50,제주특별자치도,110,제주시,003,73,011400,선글라스/안경테,3
213,213,11,서울특별시,680,강남구,000,15,064300,50,제주특별자치도,110,제주시,002,66,003200,기타화장품/미용,3
505,505,11,서울특별시,680,강남구,000,12,065400,50,제주특별자치도,110,제주시,003,19,002300,상의,3
750,750,11,서울특별시,680,강남구,000,10,070300,50,제주특별자치도,110,제주시,002,13,022200,상의,4
881,881,11,서울특별시,680,강남구,000,12,071300,50,제주특별자치도,130,서귀포시,006,22,057100,농산물,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30659,30659,11,서울특별시,680,강남구,000,16,020100,50,제주특별자치도,130,서귀포시,005,79,077400,기타패션의류,3
30806,30806,11,서울특별시,680,강남구,000,11,040100,50,제주특별자치도,110,제주시,002,66,031300,다이어트식품,5
31514,31514,11,서울특별시,680,강남구,000,32,011100,50,제주특별자치도,110,제주시,003,74,015300,기타패션의류,4
31599,31599,11,서울특별시,680,강남구,000,11,068300,50,제주특별자치도,110,제주시,002,66,060300,주얼리,4


In [12]:
train1[train1['송하인_시도명']=='서울특별시']['송하인_코드2'].sort_values().unique()

array(['110', '140', '170', '200', '215', '230', '260', '290', '305',
       '320', '350', '380', '410', '440', '470', '500', '530', '545',
       '560', '590', '620', '650', '680', '710', '740'], dtype=object)

In [13]:
train1[(train1['송하인_시군구명']=='용산구')&(train1['물품_카테고리']=='기타디지털/가전')]

Unnamed: 0,index,송하인_코드1,송하인_시도명,송하인_코드2,송하인_시군구명,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_시도명,수하인_코드2,수하인_시군구명,수하인_코드3,수하인_코드4,수하인_코드5,물품_카테고리,운송장_건수
152,152,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,002,64,100200,기타디지털/가전,3
403,403,11,서울특별시,170,용산구,000,09,058400,50,제주특별자치도,110,제주시,003,78,017100,기타디지털/가전,3
562,562,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,003,18,029400,기타디지털/가전,3
608,608,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,003,19,033100,기타디지털/가전,5
1046,1046,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,004,35,063100,기타디지털/가전,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31172,31172,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,004,35,073100,기타디지털/가전,4
31197,31197,11,서울특별시,170,용산구,000,09,070100,50,제주특별자치도,130,서귀포시,008,66,019400,기타디지털/가전,4
31376,31376,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,003,74,018100,기타디지털/가전,3
31487,31487,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,002,65,076100,기타디지털/가전,28


In [14]:
train1[(train1['송하인_시군구명']=='용산구')&(train1['물품_카테고리']=='기타디지털/가전')]['송하인_코드4'].sort_values().unique()

array(['08', '09'], dtype=object)

In [15]:
train1[train1['송하인_시군구명']=='용산구']['송하인_코드4'].sort_values().unique()

array(['01', '02', '03', '05', '08', '09', '10', '12', '13', '15', '18',
       '22', '23'], dtype=object)

In [16]:
# train1(5416), train2(5416)으로 나눠서 각자 고유값들을 확인해봤는데요,
# 5515의 경우 1부분이 [0,1]만 존재하는걸로 봐서, 맨 뒷자리 5와 이어진다고 생각했습니다
# 99999에서 100000 으로 넘어가기 때문에 이런식으로 나왔다고 생각했고,
# 우리나라 좌표에 고유번호를 50미터 단위로 지정을 했기 때문에, 정사각형 격자를 만들려면 제곱수가 나와야 한다고 생각했습니다.
# 때문에 6자리수 안에 들어갈 수 있는 데이터는 100만개로 제곱수이며, 5자리수 안에 들어갈 수 있는 데이터는 10만개로 제곱수가 되지 않아 합리적인 수가 아니라고 생각했습니다.
# 또, 6자리수 맨 뒷자리 2자리는 모두 00이어서, 4자리수만 데이터에 의미가 있는 것으로 보입니다. 
# 우리는 5 4 1 6(4) 단위로 끊어서 데이터를 분석하기로 했습니다.

# 4. 모델학습

In [17]:
train = train1[['물품_카테고리', '송하인_코드1', '송하인_코드2', '송하인_코드3', '송하인_코드4','송하인_코드5', '수하인_코드1', '수하인_코드2', '수하인_코드3', '수하인_코드4', '수하인_코드5', '운송장_건수']]
test = test1[['물품_카테고리', '송하인_코드1', '송하인_코드2', '송하인_코드3', '송하인_코드4','송하인_코드5', '수하인_코드1', '수하인_코드2', '수하인_코드3', '수하인_코드4', '수하인_코드5']]


In [18]:
for col in test.columns:
  train[col]=train[col].astype('category')
  test[col]=test[col].astype('category')

In [19]:
X = train.drop(['운송장_건수'],axis=1)
y = train['운송장_건수']
X_test = test.copy()

In [20]:
X.head()

Unnamed: 0,물품_카테고리,송하인_코드1,송하인_코드2,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_코드2,수하인_코드3,수하인_코드4,수하인_코드5
0,음반,50,110,5,95,17300,28,710,1,92,69300
1,문화컨텐츠,41,480,6,90,43300,50,110,2,64,24400
2,농산물,50,110,0,78,68400,11,200,0,7,5400
3,기타식품,41,271,0,48,6400,50,110,5,87,19400
4,농산물,50,110,0,78,68400,28,237,0,10,76300


In [21]:
y.head()

0    3
1    3
2    3
3    7
4    3
Name: 운송장_건수, dtype: int64

In [22]:
X_test.head()

Unnamed: 0,물품_카테고리,송하인_코드1,송하인_코드2,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_코드2,수하인_코드3,수하인_코드4,수하인_코드5
0,선케어,41,670,5,77,42200,50,110,4,35,14100
1,구강위생용품,11,560,0,9,12200,50,110,1,72,34400
2,캠핑,41,220,3,63,57300,50,110,3,61,97300
3,아웃도어가구,50,110,4,36,41400,28,260,0,84,36400
4,분유/이유식/아기간식,41,500,2,41,65200,50,110,1,69,44300


In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31684 entries, 0 to 31683
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   물품_카테고리  31684 non-null  category
 1   송하인_코드1  31684 non-null  category
 2   송하인_코드2  31684 non-null  category
 3   송하인_코드3  31684 non-null  category
 4   송하인_코드4  31684 non-null  category
 5   송하인_코드5  31684 non-null  category
 6   수하인_코드1  31684 non-null  category
 7   수하인_코드2  31684 non-null  category
 8   수하인_코드3  31684 non-null  category
 9   수하인_코드4  31684 non-null  category
 10  수하인_코드5  31684 non-null  category
 11  운송장_건수   31684 non-null  int64   
dtypes: category(11), int64(1)
memory usage: 960.8 KB


In [24]:
def objective(trial):
  param = {
      "random_state":42,
      'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.05),
      'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
      "n_estimators":trial.suggest_int("n_estimators", 500, 5000),
      "max_depth":trial.suggest_int("max_depth", 4, 16),
      'random_strength' :trial.suggest_int('random_strength', 0, 100),
      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      "max_bin": trial.suggest_int("max_bin", 200, 500),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
  }
  X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)
  cat_features = range(X_test.shape[1])
  cat = CatBoostRegressor(**param)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_pred = cat.predict(X_valid)
  rmse = np.sqrt(mean_squared_error(y_valid, cat_pred))
  
  return rmse

In [25]:
 sampler = TPESampler(seed=42)
 study = optuna.create_study(
     study_name = 'cat_parameter_opt',
     direction = 'minimize',
     sampler = sampler,
                        )
 
 study.optimize(objective, n_trials=10)
 print("Best Score:",study.best_value)
 print("Best trial",study.best_trial.params)

[32m[I 2022-06-10 14:55:28,371][0m A new study created in memory with name: cat_parameter_opt[0m


0:	learn: 6.7524741	test: 6.7524213	test1: 6.7681804	best: 6.7681804 (0)	total: 64.1ms	remaining: 4m 3s
100:	learn: 6.5076734	test: 6.5181913	test1: 6.5737761	best: 6.5737761 (100)	total: 2.42s	remaining: 1m 28s
200:	learn: 6.4275364	test: 6.4270490	test1: 6.5138475	best: 6.5138475 (200)	total: 4.46s	remaining: 1m 19s
300:	learn: 6.3267546	test: 6.3515341	test1: 6.4609360	best: 6.4609360 (300)	total: 6.72s	remaining: 1m 17s
400:	learn: 6.2384425	test: 6.2966227	test1: 6.4294100	best: 6.4293453 (394)	total: 9.18s	remaining: 1m 17s
500:	learn: 6.1198176	test: 6.2500383	test1: 6.4015868	best: 6.4015868 (500)	total: 11.9s	remaining: 1m 18s
600:	learn: 5.9429298	test: 6.1773317	test1: 6.3658792	best: 6.3658792 (600)	total: 14.6s	remaining: 1m 17s
700:	learn: 5.6880849	test: 6.1011531	test1: 6.3279062	best: 6.3279062 (700)	total: 17.6s	remaining: 1m 17s
800:	learn: 5.2890129	test: 5.9641796	test1: 6.2492531	best: 6.2492531 (800)	total: 21.2s	remaining: 1m 19s
900:	learn: 5.0350440	test: 5.89

[32m[I 2022-06-10 14:56:03,604][0m Trial 0 finished with value: 6.2161721410746 and parameters: {'learning_rate': 0.018272261776066247, 'bagging_temperature': 63.512210106407046, 'n_estimators': 3794, 'max_depth': 11, 'random_strength': 15, 'colsample_bylevel': 0.49359671220172163, 'l2_leaf_reg': 1.7519275289243016e-06, 'min_child_samples': 88, 'max_bin': 380, 'od_type': 'IncToDec'}. Best is trial 0 with value: 6.2161721410746.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 6.216172141
bestIteration = 1119

Shrink model to first 1120 iterations.
0:	learn: 6.7485851	test: 6.7485481	test1: 6.7651274	best: 6.7651274 (0)	total: 16.5ms	remaining: 23.9s
100:	learn: 6.5465128	test: 6.4625579	test1: 6.5582874	best: 6.5582874 (100)	total: 1.02s	remaining: 13.7s
200:	learn: 6.4216055	test: 6.2488161	test1: 6.4434768	best: 6.4422922 (185)	total: 2s	remaining: 12.5s
300:	learn: 6.1844399	test: 6.0183154	test1: 6.2737543	best: 6.2737543 (300)	total: 3.34s	remaining: 12.8s
400:	learn: 6.0035549	test: 5.9040176	test1: 6.2441005	best: 6.2441005 (400)	total: 4.74s	remaining: 12.5s


[32m[I 2022-06-10 14:56:09,524][0m Trial 1 finished with value: 6.225748196602596 and parameters: {'learning_rate': 0.04763628595029446, 'bagging_temperature': 21.368329072358772, 'n_estimators': 1455, 'max_depth': 6, 'random_strength': 18, 'colsample_bylevel': 0.5825453457757226, 'l2_leaf_reg': 1.5747445384650815e-05, 'min_child_samples': 46, 'max_bin': 287, 'od_type': 'IncToDec'}. Best is trial 0 with value: 6.2161721410746.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 6.225748197
bestIteration = 436

Shrink model to first 437 iterations.
0:	learn: 6.9552507	test: 6.9549578	test1: 5.8713680	best: 5.8713680 (0)	total: 117ms	remaining: 4m 58s
100:	learn: 6.5605926	test: 6.6080605	test1: 5.6239030	best: 5.6239030 (100)	total: 6.36s	remaining: 2m 34s
200:	learn: 6.3495612	test: 6.4639098	test1: 5.5045429	best: 5.5045429 (200)	total: 11.6s	remaining: 2m 15s
300:	learn: 6.1797716	test: 6.3689141	test1: 5.4594129	best: 5.4594129 (300)	total: 17.4s	remaining: 2m 9s
400:	learn: 6.0160351	test: 6.3053395	test1: 5.4321608	best: 5.4321344 (399)	total: 23.7s	remaining: 2m 6s
500:	learn: 5.8818165	test: 6.2476211	test1: 5.3947259	best: 5.3945918 (499)	total: 29.4s	remaining: 2m
600:	learn: 5.7074893	test: 6.1925546	test1: 5.3669117	best: 5.3639899 (587)	total: 35.7s	remaining: 1m 55s
700:	learn: 5.4962273	test: 6.1384414	test1: 5.3392289	best: 5.3391012 (698)	total: 42.9s	remaining: 1m 53s
800:	lear

[32m[I 2022-06-10 14:58:16,055][0m Trial 2 finished with value: 5.26079721749937 and parameters: {'learning_rate': 0.016002960978292496, 'bagging_temperature': 0.2920433847181412, 'n_estimators': 2552, 'max_depth': 14, 'random_strength': 20, 'colsample_bylevel': 0.708540663048167, 'l2_leaf_reg': 1.7776512920172654e-05, 'min_child_samples': 9, 'max_bin': 382, 'od_type': 'IncToDec'}. Best is trial 2 with value: 5.26079721749937.[0m


0:	learn: 6.6639071	test: 6.6624983	test1: 7.0849081	best: 7.0849081 (0)	total: 12.5ms	remaining: 51.6s
100:	learn: 6.1626923	test: 6.0238159	test1: 6.3806458	best: 6.3806458 (100)	total: 1.46s	remaining: 58.4s
200:	learn: 6.0178946	test: 5.9424008	test1: 6.2760108	best: 6.2757931 (199)	total: 2.87s	remaining: 56.1s
300:	learn: 5.6901096	test: 5.8006009	test1: 6.1000486	best: 6.0997864 (298)	total: 4.6s	remaining: 58.6s
400:	learn: 5.5016202	test: 5.7734133	test1: 6.0278502	best: 6.0278502 (400)	total: 6.54s	remaining: 1m 1s
500:	learn: 5.3862500	test: 5.7859965	test1: 6.0001828	best: 6.0001828 (500)	total: 8.49s	remaining: 1m 1s


[32m[I 2022-06-10 14:58:25,722][0m Trial 3 finished with value: 5.9974161346452615 and parameters: {'learning_rate': 0.04605136717611768, 'bagging_temperature': 72.86653737491046, 'n_estimators': 4138, 'max_depth': 7, 'random_strength': 9, 'colsample_bylevel': 0.8105398159072941, 'l2_leaf_reg': 1.3210173287250643e-05, 'min_child_samples': 16, 'max_bin': 349, 'od_type': 'Iter'}. Best is trial 2 with value: 5.26079721749937.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 5.997416135
bestIteration = 516

Shrink model to first 517 iterations.
0:	learn: 6.7800430	test: 6.7802684	test1: 6.6493614	best: 6.6493614 (0)	total: 26.1ms	remaining: 49.6s
100:	learn: 6.6448002	test: 6.5809106	test1: 6.4764559	best: 6.4764559 (100)	total: 1.6s	remaining: 28.6s
200:	learn: 6.5582271	test: 6.4697192	test1: 6.3845377	best: 6.3845377 (200)	total: 3.24s	remaining: 27.4s
300:	learn: 6.4588431	test: 6.3730891	test1: 6.3340900	best: 6.3340900 (300)	total: 4.74s	remaining: 25.2s
400:	learn: 6.3906235	test: 6.2987096	test1: 6.2617594	best: 6.2617594 (400)	total: 6.31s	remaining: 23.6s
500:	learn: 6.3232182	test: 6.2253118	test1: 6.1779025	best: 6.1778764 (499)	total: 8.06s	remaining: 22.5s
600:	learn: 6.2476030	test: 6.1296170	test1: 6.1035306	best: 6.1035305 (599)	total: 9.72s	remaining: 21.1s
700:	learn: 6.1681909	test: 6.0429202	test1: 6.0093119	best: 6.0093119 (700)	total: 11.4s	remaining: 19.5s
800:	learn:

[32m[I 2022-06-10 14:58:45,609][0m Trial 4 finished with value: 5.715437274752544 and parameters: {'learning_rate': 0.015166293102182283, 'bagging_temperature': 4.467752817973908, 'n_estimators': 1903, 'max_depth': 10, 'random_strength': 55, 'colsample_bylevel': 0.5109126733153162, 'l2_leaf_reg': 2.9087842986659113e-05, 'min_child_samples': 79, 'max_bin': 482, 'od_type': 'IncToDec'}. Best is trial 2 with value: 5.26079721749937.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 5.715437275
bestIteration = 1007

Shrink model to first 1008 iterations.
0:	learn: 6.6999910	test: 6.7000159	test1: 6.9640173	best: 6.9640173 (0)	total: 11ms	remaining: 15.3s
100:	learn: 6.6075685	test: 6.5696366	test1: 6.8577981	best: 6.8577981 (100)	total: 658ms	remaining: 8.34s
200:	learn: 6.5728600	test: 6.4941392	test1: 6.7756288	best: 6.7756288 (200)	total: 1.34s	remaining: 7.91s
300:	learn: 6.5071500	test: 6.4119897	test1: 6.6779245	best: 6.6779245 (300)	total: 2.08s	remaining: 7.49s
400:	learn: 6.3763646	test: 6.3122384	test1: 6.5407682	best: 6.5407682 (400)	total: 2.83s	remaining: 6.93s
500:	learn: 6.2911701	test: 6.2361086	test1: 6.4519463	best: 6.4519463 (500)	total: 3.69s	remaining: 6.49s
600:	learn: 6.2388423	test: 6.1985332	test1: 6.4130816	best: 6.4127215 (594)	total: 4.56s	remaining: 5.93s
700:	learn: 6.1983934	test: 6.1838290	test1: 6.3864331	best: 6.3861216 (698)	total: 5.42s	remaining: 5.27s
800:	learn

[32m[I 2022-06-10 14:58:52,334][0m Trial 5 finished with value: 6.36075042873299 and parameters: {'learning_rate': 0.04409226795827594, 'bagging_temperature': 0.022592797420156956, 'n_estimators': 1382, 'max_depth': 4, 'random_strength': 32, 'colsample_bylevel': 0.6332063738136893, 'l2_leaf_reg': 8.147757462899138e-06, 'min_child_samples': 84, 'max_bin': 307, 'od_type': 'Iter'}. Best is trial 2 with value: 5.26079721749937.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 6.360750429
bestIteration = 801

Shrink model to first 802 iterations.
0:	learn: 6.7184194	test: 6.7192006	test1: 6.8883797	best: 6.8883797 (0)	total: 413ms	remaining: 5m 44s
100:	learn: 6.5664223	test: 6.5440967	test1: 6.7626070	best: 6.7626070 (100)	total: 9.54s	remaining: 1m 9s
200:	learn: 6.4512174	test: 6.4399009	test1: 6.6978584	best: 6.6978584 (200)	total: 18.4s	remaining: 58.2s
300:	learn: 6.3037427	test: 6.3408570	test1: 6.6262161	best: 6.6260852 (298)	total: 30.3s	remaining: 53.7s
400:	learn: 6.1947555	test: 6.2816859	test1: 6.5995283	best: 6.5995283 (400)	total: 41.5s	remaining: 44.9s
500:	learn: 6.1064856	test: 6.2281536	test1: 6.5773782	best: 6.5773782 (500)	total: 53s	remaining: 35.4s
600:	learn: 5.9901558	test: 6.1757096	test1: 6.5506481	best: 6.5506481 (600)	total: 1m 5s	remaining: 25.7s
700:	learn: 5.9233470	test: 6.1481740	test1: 6.5373541	best: 6.5373541 (700)	total: 1m 15s	remaining: 14.5s
800:	learn:

[32m[I 2022-06-10 15:00:29,308][0m Trial 6 finished with value: 6.493763694432177 and parameters: {'learning_rate': 0.012545899554294089, 'bagging_temperature': 16.172900811143155, 'n_estimators': 835, 'max_depth': 16, 'random_strength': 77, 'colsample_bylevel': 0.5192294089205034, 'l2_leaf_reg': 1.7560829253683595e-07, 'min_child_samples': 83, 'max_bin': 412, 'od_type': 'Iter'}. Best is trial 2 with value: 5.26079721749937.[0m


0:	learn: 6.4773929	test: 6.4770403	test1: 7.7659626	best: 7.7659626 (0)	total: 62.3ms	remaining: 1m 3s
100:	learn: 6.3175085	test: 6.3514943	test1: 7.7072792	best: 7.7072792 (100)	total: 5.61s	remaining: 51.1s
200:	learn: 6.2111840	test: 6.2450708	test1: 7.6596678	best: 7.6596678 (200)	total: 11s	remaining: 45s
300:	learn: 6.1111251	test: 6.1660833	test1: 7.6160412	best: 7.6160402 (299)	total: 16.9s	remaining: 40.5s
400:	learn: 6.0210217	test: 6.1128854	test1: 7.5859467	best: 7.5859467 (400)	total: 22.2s	remaining: 34.4s
500:	learn: 5.9275042	test: 6.0748142	test1: 7.5746241	best: 7.5746241 (500)	total: 29.1s	remaining: 30.2s
600:	learn: 5.8346897	test: 6.0261347	test1: 7.5487369	best: 7.5487369 (600)	total: 36.3s	remaining: 25.3s
700:	learn: 5.7595285	test: 5.9936884	test1: 7.5361233	best: 7.5359327 (699)	total: 42.3s	remaining: 19.3s
800:	learn: 5.6746763	test: 5.9469531	test1: 7.5040120	best: 7.5039808 (796)	total: 49.4s	remaining: 13.6s
900:	learn: 5.5762806	test: 5.8912826	test1:

[32m[I 2022-06-10 15:01:39,854][0m Trial 7 finished with value: 7.396326940547897 and parameters: {'learning_rate': 0.011265617213006592, 'bagging_temperature': 0.27155819552829413, 'n_estimators': 1021, 'max_depth': 15, 'random_strength': 62, 'colsample_bylevel': 0.5985388149115896, 'l2_leaf_reg': 1.9161149250778487e-06, 'min_child_samples': 34, 'max_bin': 297, 'od_type': 'IncToDec'}. Best is trial 2 with value: 5.26079721749937.[0m


0:	learn: 6.8508542	test: 6.8508611	test1: 6.3536690	best: 6.3536690 (0)	total: 6.38ms	remaining: 6.62s
100:	learn: 6.5119646	test: 6.5791910	test1: 6.1643750	best: 6.1643750 (100)	total: 3.31s	remaining: 30.7s
200:	learn: 6.2734275	test: 6.4057154	test1: 6.0115883	best: 6.0115883 (200)	total: 6.79s	remaining: 28.3s
300:	learn: 5.9358168	test: 6.1956358	test1: 5.9238681	best: 5.9215611 (293)	total: 10.2s	remaining: 25s


[32m[I 2022-06-10 15:01:55,444][0m Trial 8 finished with value: 5.826657743295955 and parameters: {'learning_rate': 0.04169990777997927, 'bagging_temperature': 0.7742116473996251, 'n_estimators': 1038, 'max_depth': 13, 'random_strength': 76, 'colsample_bylevel': 0.7367663185416977, 'l2_leaf_reg': 2.3131305726837285e-05, 'min_child_samples': 52, 'max_bin': 357, 'od_type': 'IncToDec'}. Best is trial 2 with value: 5.26079721749937.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 5.826657743
bestIteration = 339

Shrink model to first 340 iterations.
0:	learn: 7.1020595	test: 7.1020050	test1: 5.1393856	best: 5.1393856 (0)	total: 26.9ms	remaining: 1m 30s
100:	learn: 7.0029389	test: 6.9588878	test1: 5.0612082	best: 5.0612082 (100)	total: 1.65s	remaining: 53.5s
200:	learn: 6.9455832	test: 6.8993790	test1: 5.0354498	best: 5.0354498 (199)	total: 3.09s	remaining: 48.6s
300:	learn: 6.9026077	test: 6.8281525	test1: 5.0146957	best: 5.0146954 (297)	total: 4.6s	remaining: 46.8s
400:	learn: 6.8571224	test: 6.7944638	test1: 5.0082616	best: 5.0082616 (399)	total: 6.1s	remaining: 45s
500:	learn: 6.8292077	test: 6.7542120	test1: 5.0014150	best: 5.0014058 (497)	total: 7.46s	remaining: 42.7s
600:	learn: 6.7863665	test: 6.6980497	test1: 4.9935764	best: 4.9935764 (600)	total: 9.11s	remaining: 41.9s
700:	learn: 6.7519529	test: 6.6467137	test1: 4.9902621	best: 4.9902397 (699)	total: 10.7s	remaining: 40.6s
800:	learn: 6

[32m[I 2022-06-10 15:02:10,096][0m Trial 9 finished with value: 4.988470496035556 and parameters: {'learning_rate': 0.011896326393433528, 'bagging_temperature': 0.0133572404119741, 'n_estimators': 3364, 'max_depth': 8, 'random_strength': 51, 'colsample_bylevel': 0.9445398843556558, 'l2_leaf_reg': 7.486273952174759e-06, 'min_child_samples': 44, 'max_bin': 427, 'od_type': 'IncToDec'}. Best is trial 9 with value: 4.988470496035556.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 4.988470496
bestIteration = 894

Shrink model to first 895 iterations.
Best Score: 4.988470496035556
Best trial {'learning_rate': 0.011896326393433528, 'bagging_temperature': 0.0133572404119741, 'n_estimators': 3364, 'max_depth': 8, 'random_strength': 51, 'colsample_bylevel': 0.9445398843556558, 'l2_leaf_reg': 7.486273952174759e-06, 'min_child_samples': 44, 'max_bin': 427, 'od_type': 'IncToDec'}


# 5. 모델 평가

In [26]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train, train['운송장_건수']):
  folds.append((train_idx,valid_idx))

In [27]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train, train['운송장_건수']):
  folds.append((train_idx,valid_idx))

random.seed(42)
cat_models={}

cat_features =range(X_test.shape[1])

for fold in range(10):
  print(f'===================================={fold+1}============================================')
  train_idx, valid_idx = folds[fold]
  X_train = train.drop(['운송장_건수'],axis=1).iloc[train_idx]
  X_valid = train.drop(['운송장_건수'],axis=1).iloc[valid_idx]
  y_train = train['운송장_건수'][train_idx].values
  y_valid = train['운송장_건수'][valid_idx].values

  cat = CatBoostRegressor(**study.best_trial.params)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_models[fold] = cat
  print(f'================================================================================\n\n')

0:	learn: 6.7671093	test: 6.7671093	test1: 6.6589182	best: 6.6589182 (0)	total: 2.95ms	remaining: 9.93s
100:	learn: 6.6696175	test: 6.6527896	test1: 6.5613560	best: 6.5613560 (100)	total: 1.89s	remaining: 1m 1s
200:	learn: 6.6304382	test: 6.6052572	test1: 6.5249459	best: 6.5249459 (200)	total: 3.34s	remaining: 52.6s
300:	learn: 6.5871662	test: 6.5612186	test1: 6.4959259	best: 6.4957707 (299)	total: 5.12s	remaining: 52.1s
400:	learn: 6.5560622	test: 6.5328865	test1: 6.4806741	best: 6.4806741 (400)	total: 6.75s	remaining: 49.9s
500:	learn: 6.5246846	test: 6.5067667	test1: 6.4549715	best: 6.4549715 (500)	total: 8.57s	remaining: 49s
600:	learn: 6.4967476	test: 6.4843313	test1: 6.4425521	best: 6.4425521 (600)	total: 10.4s	remaining: 47.7s
700:	learn: 6.4791952	test: 6.4695110	test1: 6.4328095	best: 6.4327704 (698)	total: 11.9s	remaining: 45.4s
800:	learn: 6.4465727	test: 6.4522518	test1: 6.4208294	best: 6.4208294 (800)	total: 13.6s	remaining: 43.5s
900:	learn: 6.4269443	test: 6.4367691	test

In [28]:
submission.loc[:,'INVC_CONT']=0
for fold in range(10):
  submission.loc[:,'INVC_CONT'] += cat_models[fold].predict(test)/10

In [29]:
submission.loc[submission.운송장_건수>30,'운송장_건수']=submission.loc[submission.운송장_건수>30,'운송장_건수']*4.8
submission.to_csv('data/submission/마지막 확인용.csv',index = False)

# + 가설 적용
> 이 데이터는 [제주도에서 보낸 물류] 또는 [제주도로 보낸 물류]로 큰 범주가 나뉜다.  
> 데이터를 [제주도에서 보낸 데이터] 와 [제주도로 보낸 물류]로 데이터를 나눠서 학습시키는건 어떨까?  
> 데이터를 나누지 않고, 제주도에서 보냈는지, 제주도로 보냈는지 0 1로 카테고리를 나눠서 세팅해주는건 어떨까?  

### 데이터 준비

In [30]:
train1 = train[(train['송하인_코드1']=='50') & (train['수하인_코드1'] != '50')]
test2 = test[(test['송하인_코드1']=='50') & (test['수하인_코드1'] != '50')]

train2 = train[(train['수하인_코드1']=='50') & (train['송하인_코드1'] != '50')]
test2 = test[(test['수하인_코드1']=='50') & (test['송하인_코드1'] != '50')]

train3 = train[(train['송하인_코드1']=='50') & (train['수하인_코드1']=='50')]
test3 = test[(test['송하인_코드1']=='50') & (test['수하인_코드1']=='50')]

# Mean target encoding 연습

In [18]:
train.head()

Unnamed: 0,물품_카테고리,송하인_코드1,송하인_코드2,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_코드2,수하인_코드3,수하인_코드4,수하인_코드5,운송장_건수
0,음반,50,110,5,95,17300,28,710,1,92,69300,3
1,문화컨텐츠,41,480,6,90,43300,50,110,2,64,24400,3
2,농산물,50,110,0,78,68400,11,200,0,7,5400,3
3,기타식품,41,271,0,48,6400,50,110,5,87,19400,7
4,농산물,50,110,0,78,68400,28,237,0,10,76300,3


In [27]:
# Mean target encoding 연습
# target 설정
target = '운송장_건수'
for i in range(1,6):
  globals()[f'send{i}_mean'] = train.groupby(f'송하인_코드{i}')[target].mean()
  train[globals()[f'send{i}_mean']] = train[f'송하인_코드{i}'].map(f'send{i}_mean') # 가존 변수에 encoded 된 값을 매핑




TypeError: 'str' object is not callable

In [21]:
# 가존 변수에 encoded 된 값을 매핑
train['send1_mean'] = train['송하인_코드1'].map(send1_mean)


Unnamed: 0,송하인_코드1,send1_mean
0,50,4.542944
1,41,5.691149
2,50,4.542944
3,41,5.691149
4,50,4.542944


In [22]:
train

Unnamed: 0,물품_카테고리,송하인_코드1,송하인_코드2,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_코드2,수하인_코드3,수하인_코드4,수하인_코드5,운송장_건수,send1_mean
0,음반,50,110,005,95,017300,28,710,001,92,069300,3,4.542944
1,문화컨텐츠,41,480,006,90,043300,50,110,002,64,024400,3,5.691149
2,농산물,50,110,000,78,068400,11,200,000,07,005400,3,4.542944
3,기타식품,41,271,000,48,006400,50,110,005,87,019400,7,5.691149
4,농산물,50,110,000,78,068400,28,237,000,10,076300,3,4.542944
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31679,스포츠잡화,44,710,002,90,087200,50,110,002,13,073200,3,8.516588
31680,스마트디바이스,11,290,000,14,045300,50,110,003,19,087100,4,5.445993
31681,스마트디바이스,11,290,000,14,045300,50,110,002,63,065200,6,5.445993
31682,지갑,41,273,000,65,073100,50,110,002,64,061200,7,5.691149
