# 1. Import

In [1]:
# 연산 처리를 위한 패키지
import numpy as np
import pandas as pd
from pandas import DataFrame
from math import sqrt

# 데이터 분석을 위한 패키지
import statsmodels.api as sm

# 시각화를 위한 패키지
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# 필요모듈 import
import os
import openpyxl
from datetime import datetime
from tqdm import tqdm

# 그래프를 실제로 그리기 위한 설정
%matplotlib inline

# 머신러닝 패키지
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm.sklearn import LGBMRegressor
import random
import optuna
from optuna.samplers import TPESampler
# KFold(CV), partial : optuna를 사용하기 위함
from sklearn.model_selection import KFold
from functools import partial

# Font configuration for matplotlib (pick the line matching the OS)
# plt.rc('font', family='NanumGothic')        # for windows
plt.rc('font', family='AppleGothic') # For MacOS

# Suppress every warning for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# 2.1. 데이터 로딩

In [2]:
# Load the new data files
test = pd.read_csv('data/final_test.csv', index_col = 0)             # index_col = 0 stops the saved index from coming back as an extra "Unnamed: 0" column
train = pd.read_csv('data/final_train.csv', index_col = 0)

# Submission template; its 운송장_건수 column is filled with predictions later
submission = pd.read_csv('data/sample_submission.csv')

# NOTE(review): train2 is rebound by a later setting_data call, so this copy looks unused — confirm
train2 = train.copy()
train.head()

Unnamed: 0,index,송하인_격자공간고유번호,송하인_격자공간명,송하인_시도코드,송하인_시도명,송하인_시군구코드,송하인_시군구명,수하인_격자공간고유번호,수하인_격자공간명,수하인_시도코드,수하인_시도명,수하인_시군구코드,수하인_시군구명,물품_카테고리,운송장_건수
0,0,5011000595017300,다나1395,50,제주특별자치도,50110,제주시,2871000192069300,다사1072,28,인천광역시,28710,강화군,음반,3
1,1,4148000690043300,다사2868,41,경기도,41480,파주시,5011000264024400,다다0901,50,제주특별자치도,50110,제주시,문화컨텐츠,3
2,2,5011000078068400,다다3007,50,제주특별자치도,50110,제주시,1120000007005400,다사5950,11,서울특별시,11200,성동구,농산물,3
3,3,4127100048006400,다사4521,41,경기도,41271,안산시상록구,5011000587019400,다나0595,50,제주특별자치도,50110,제주시,기타식품,7
4,4,5011000078068400,다다3007,50,제주특별자치도,50110,제주시,2823700010076300,다사3145,28,인천광역시,28237,부평구,농산물,3


In [3]:
def setting_data(train, test, one, two, three, four, five, six):
  """Split the sender/receiver grid ids into five positional code columns.

  For both the sender (송하인) and receiver (수하인) grid-id columns, the id is
  converted to ``str`` and cut into the slices ``[one:two]``, ``[two:three]``,
  ``[three:four]``, ``[four:five]`` and ``[five:six]``, stored as
  ``<role>_코드1`` .. ``<role>_코드5``.

  The input frames are left untouched: the work is done on copies, so the
  function can be called repeatedly with different boundaries on the same
  raw frames without side effects.

  Args:
    train: Training frame. Must contain ``index``, both
      ``<role>_격자공간고유번호`` columns, the ``<role>_시도명`` and
      ``<role>_시군구명`` columns, ``물품_카테고리`` and ``운송장_건수``.
    test: Test frame with the same columns except ``운송장_건수``.
    one, two, three, four, five, six: Slice boundaries; any value accepted
      by ``int()`` (the notebook passes digit strings).

  Returns:
    A ``(train, test)`` tuple of new frames holding ``index``, the code and
    name columns for each role, ``물품_카테고리`` and, for train only,
    ``운송장_건수``.
  """
  bounds = [int(pos) for pos in (one, two, three, four, five, six)]
  roles = ('송하인', '수하인')

  def _add_code_columns(frame):
    # Copy first so the caller's DataFrame keeps its original columns and dtypes.
    out = frame.copy()
    for role in roles:
      grid_id = out[f'{role}_격자공간고유번호'].astype(str)
      out[f'{role}_격자공간고유번호'] = grid_id
      # Consecutive boundary pairs define the five code slices.
      for n, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        out[f'{role}_코드{n}'] = grid_id.str.slice(start, stop)
    return out

  # Same column order as before: index, sender block, receiver block, category.
  feature_cols = ['index']
  for role in roles:
    feature_cols += [
        f'{role}_코드1', f'{role}_시도명',
        f'{role}_코드2', f'{role}_시군구명',
        f'{role}_코드3', f'{role}_코드4', f'{role}_코드5',
    ]
  feature_cols.append('물품_카테고리')

  train = _add_code_columns(train)[feature_cols + ['운송장_건수']]
  test = _add_code_columns(test)[feature_cols]

  return train, test


In [4]:
def confirm_nun(train):
  """Print the number of distinct values in every code column of ``train``.

  For the sender (송하인) and then the receiver (수하인), prints a header, the
  ``nunique()`` of ``<role>_코드1`` .. ``<role>_코드5`` and a 50-asterisk
  separator. Finally prints the distinct values of ``수하인_코드4``.

  Args:
    train: Frame that has the ten ``<role>_코드N`` columns.
  """
  separator = '*' * 50
  for role in ('송하인', '수하인'):
    print(f'{role} 코드 1, 2, 3, 4, 5는 각각 ~ 종류의 코드가 존재합니다.')
    for n in range(1, 6):
      distinct = train[f'{role}_코드{n}'].nunique()
      print(f'코드_{n} :', distinct)
    # Each role's summary is closed by the separator line.
    print(separator)

  print('수하인_코드4의 고유값은')
  print(train['수하인_코드4'].unique())


In [5]:
# Cut the grid ids at positions 0, 2, 5, 8, 10, 16 and store the pieces as code columns.
# This is the split that was chosen for modelling.
# NOTE(review): the original comment listed the boundaries as 0, 2, 5, 9, 10, 16,
# but the call passes '8' as the fourth boundary — confirm which one was intended.
train1, test1 = setting_data(train, test, '0', '2', '5', '8', '10', '16')
# Alternative split: cut at positions 0, 2, 5, 10, 11, 16.
train2, test2 = setting_data(train, test, '0', '2', '5', '10', '11', '16')

### train3 is the step that strips the 15th and 16th of the 16 digits.
### They are always 00, so we believe they can be dropped for training.
### NOTE(review): no train3 frame is created in this cell — confirm whether this note is stale.

In [6]:
# Display the train1 frame returned by the first setting_data call
train1

Unnamed: 0,index,송하인_코드1,송하인_시도명,송하인_코드2,송하인_시군구명,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_시도명,수하인_코드2,수하인_시군구명,수하인_코드3,수하인_코드4,수하인_코드5,물품_카테고리,운송장_건수
0,0,50,제주특별자치도,110,제주시,005,95,017300,28,인천광역시,710,강화군,001,92,069300,음반,3
1,1,41,경기도,480,파주시,006,90,043300,50,제주특별자치도,110,제주시,002,64,024400,문화컨텐츠,3
2,2,50,제주특별자치도,110,제주시,000,78,068400,11,서울특별시,200,성동구,000,07,005400,농산물,3
3,3,41,경기도,271,안산시상록구,000,48,006400,50,제주특별자치도,110,제주시,005,87,019400,기타식품,7
4,4,50,제주특별자치도,110,제주시,000,78,068400,28,인천광역시,237,부평구,000,10,076300,농산물,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31679,31679,44,충청남도,710,금산군,002,90,087200,50,제주특별자치도,110,제주시,002,13,073200,스포츠잡화,3
31680,31680,11,서울특별시,290,성북구,000,14,045300,50,제주특별자치도,110,제주시,003,19,087100,스마트디바이스,4
31681,31681,11,서울특별시,290,성북구,000,14,045300,50,제주특별자치도,110,제주시,002,63,065200,스마트디바이스,6
31682,31682,41,경기도,273,안산시단원구,000,65,073100,50,제주특별자치도,110,제주시,002,64,061200,지갑,7


In [7]:
# Print the distinct-value counts of train1's sender/receiver code columns
confirm_nun(train1)

송하인 코드 1, 2, 3, 4, 5는 각각 ~ 종류의 코드가 존재합니다.
코드_1 : 17
코드_2 : 99
코드_3 : 14
코드_4 : 100
코드_5 : 400
**************************************************
수하인 코드 1, 2, 3, 4, 5는 각각 ~ 종류의 코드가 존재합니다.
코드_1 : 17
코드_2 : 101
코드_3 : 18
코드_4 : 100
코드_5 : 400
**************************************************
수하인_코드4의 고유값은
['92' '64' '07' '87' '10' '14' '69' '17' '23' '26' '16' '20' '63' '02'
 '27' '21' '96' '09' '48' '31' '03' '18' '06' '39' '24' '55' '04' '33'
 '35' '72' '56' '79' '30' '12' '59' '99' '41' '15' '62' '73' '32' '65'
 '70' '82' '74' '94' '46' '19' '78' '29' '52' '25' '85' '42' '13' '01'
 '08' '28' '05' '90' '53' '60' '66' '11' '77' '88' '00' '67' '98' '54'
 '44' '37' '36' '38' '81' '84' '58' '34' '68' '40' '57' '71' '83' '45'
 '49' '76' '47' '51' '43' '93' '61' '22' '86' '95' '50' '75' '97' '89'
 '91' '80']


In [8]:
# Separator, header and the distinct 수하인_코드2 values of test1, one per line
print(
  '*' * 50,
  '수하인_코드2의 고유값은',
  test1['수하인_코드2'].unique(),
  sep='\n',
)

**************************************************
수하인_코드2의 고유값은
['110' '260' '130' '710' '480' '350' '465' '200' '210' '185' '173' '170'
 '570' '560' '440' '810' '470' '113' '237' '410' '390' '650' '590' '800'
 '500' '380' '900' '790' '360' '140' '430' '320' '680' '463' '111' '545'
 '770' '290' '780' '610' '150' '190' '285' '820' '330' '305' '230' '530'
 '135' '197' '310' '287' '270' '370' '131' '281' '450' '133' '750' '280'
 '740' '117' '180' '880' '220' '620' '199' '125' '830' '245' '250' '760'
 '129' '273' '155' '121' '115' '271' '850' '840' '550' '215' '195' '127'
 '123' '730' '461' '670' '630' '171' '870' '720' '890' '910' '240' '825'
 '860' '930' '745' '940' '920']


In [9]:
# Reference note kept as a bare string (it has no effect at runtime): district name : code.
# Presumably these are the 송하인_코드2 values of Seoul districts — verify against the data.
'''
종로구 : 110
중구 : 140
용산구 : 170
성동구 : 200
광진구 : 215
동대문구 : 230
중랑구 : 260
성북구 : 290
'''
# Show the rows whose sender district is 강남구 in 서울특별시
train1[(train1['송하인_시군구명']=='강남구')&(train1['송하인_시도명']=='서울특별시')]

Unnamed: 0,index,송하인_코드1,송하인_시도명,송하인_코드2,송하인_시군구명,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_시도명,수하인_코드2,수하인_시군구명,수하인_코드3,수하인_코드4,수하인_코드5,물품_카테고리,운송장_건수
210,210,11,서울특별시,680,강남구,000,07,002200,50,제주특별자치도,110,제주시,003,73,011400,선글라스/안경테,3
213,213,11,서울특별시,680,강남구,000,15,064300,50,제주특별자치도,110,제주시,002,66,003200,기타화장품/미용,3
505,505,11,서울특별시,680,강남구,000,12,065400,50,제주특별자치도,110,제주시,003,19,002300,상의,3
750,750,11,서울특별시,680,강남구,000,10,070300,50,제주특별자치도,110,제주시,002,13,022200,상의,4
881,881,11,서울특별시,680,강남구,000,12,071300,50,제주특별자치도,130,서귀포시,006,22,057100,농산물,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30659,30659,11,서울특별시,680,강남구,000,16,020100,50,제주특별자치도,130,서귀포시,005,79,077400,기타패션의류,3
30806,30806,11,서울특별시,680,강남구,000,11,040100,50,제주특별자치도,110,제주시,002,66,031300,다이어트식품,5
31514,31514,11,서울특별시,680,강남구,000,32,011100,50,제주특별자치도,110,제주시,003,74,015300,기타패션의류,4
31599,31599,11,서울특별시,680,강남구,000,11,068300,50,제주특별자치도,110,제주시,002,66,060300,주얼리,4


In [10]:
# Sorted distinct 송하인_코드2 values among rows sent from 서울특별시
train1[train1['송하인_시도명']=='서울특별시']['송하인_코드2'].sort_values().unique()

array(['110', '140', '170', '200', '215', '230', '260', '290', '305',
       '320', '350', '380', '410', '440', '470', '500', '530', '545',
       '560', '590', '620', '650', '680', '710', '740'], dtype=object)

In [11]:
# Rows sent from 용산구 whose item category is 기타디지털/가전
train1[(train1['송하인_시군구명']=='용산구')&(train1['물품_카테고리']=='기타디지털/가전')]

Unnamed: 0,index,송하인_코드1,송하인_시도명,송하인_코드2,송하인_시군구명,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_시도명,수하인_코드2,수하인_시군구명,수하인_코드3,수하인_코드4,수하인_코드5,물품_카테고리,운송장_건수
152,152,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,002,64,100200,기타디지털/가전,3
403,403,11,서울특별시,170,용산구,000,09,058400,50,제주특별자치도,110,제주시,003,78,017100,기타디지털/가전,3
562,562,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,003,18,029400,기타디지털/가전,3
608,608,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,003,19,033100,기타디지털/가전,5
1046,1046,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,004,35,063100,기타디지털/가전,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31172,31172,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,004,35,073100,기타디지털/가전,4
31197,31197,11,서울특별시,170,용산구,000,09,070100,50,제주특별자치도,130,서귀포시,008,66,019400,기타디지털/가전,4
31376,31376,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,003,74,018100,기타디지털/가전,3
31487,31487,11,서울특별시,170,용산구,000,09,037100,50,제주특별자치도,110,제주시,002,65,076100,기타디지털/가전,28


In [12]:
# Sorted distinct 송하인_코드4 values for the 용산구 rows in category 기타디지털/가전
train1[(train1['송하인_시군구명']=='용산구')&(train1['물품_카테고리']=='기타디지털/가전')]['송하인_코드4'].sort_values().unique()

array(['08', '09'], dtype=object)

In [13]:
# Sorted distinct 송하인_코드4 values for every row sent from 용산구
train1[train1['송하인_시군구명']=='용산구']['송하인_코드4'].sort_values().unique()

array(['01', '02', '03', '05', '08', '09', '10', '12', '13', '15', '18',
       '22', '23'], dtype=object)

In [14]:
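# We split the ids two ways, train1 (5-4-1-6) and train2 (5-5-1-5), and checked the unique values of each.
# In the 5-5-1-5 split the 1-digit part only takes the values [0, 1], so we concluded that it belongs together with the trailing 5-digit part.
# We think this pattern appears because the number rolls over from 99999 to 100000,
# and because the ids are assigned to coordinates in Korea in 50 m units, a square grid needs a perfect-square number of cells.
# Six digits can hold 1,000,000 values, which is a perfect square, whereas five digits hold 100,000, which is not, so a 5-digit field did not seem plausible.
# Also, the last two of the six digits are always 00, so only four digits appear to carry information.
# We decided to analyse the data using the 5 / 4 / 1 / 6(4) split.
# NOTE(review): the call that builds train1 passes '8' (not '9') as the fourth boundary, which gives a 5 / 3 / 2 / 6 split — confirm which was intended.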

In [15]:
# Keep only the model inputs: the item category followed by the five sender
# and five receiver code columns (train additionally keeps the 운송장_건수 target).
feature_columns = ['물품_카테고리'] + [
  f'{role}_코드{n}' for role in ('송하인', '수하인') for n in range(1, 6)
]
train = train1[feature_columns + ['운송장_건수']]
test = test1[feature_columns]


# 70기준 빼서 학습시켜보자

In [16]:
# Cast every feature column (all columns of test) to the pandas category dtype
# in both frames; the target column of train is left as it is.
category_dtypes = {col: 'category' for col in test.columns}
train = train.astype(category_dtypes)
test = test.astype(category_dtypes)

In [17]:
# Preview the first rows of train
train.head()

Unnamed: 0,물품_카테고리,송하인_코드1,송하인_코드2,송하인_코드3,송하인_코드4,송하인_코드5,수하인_코드1,수하인_코드2,수하인_코드3,수하인_코드4,수하인_코드5,운송장_건수
0,음반,50,110,5,95,17300,28,710,1,92,69300,3
1,문화컨텐츠,41,480,6,90,43300,50,110,2,64,24400,3
2,농산물,50,110,0,78,68400,11,200,0,7,5400,3
3,기타식품,41,271,0,48,6400,50,110,5,87,19400,7
4,농산물,50,110,0,78,68400,28,237,0,10,76300,3


In [18]:
# Separate the target from the model inputs; X_test is an independent copy of test.
y = train['운송장_건수']
X = train.drop(columns=['운송장_건수'])
X_test = test.copy()

# catboost

In [19]:
# Disabled Optuna objective, kept as a bare string so that it is not executed.
# It samples CatBoost hyper-parameters, fits on a random 80/20 split of X, y and
# returns the RMSE on the validation part.
# NOTE(review): train_test_split is called without random_state, so every trial is
# scored on a different split. trial.suggest_loguniform is deprecated in recent
# Optuna releases in favour of suggest_float(..., log=True) — check before re-enabling.
'''
def objective(trial):
  param = {
      "random_state":42,
      'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.05),
      'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
      "n_estimators":trial.suggest_int("n_estimators", 500, 5000),
      "max_depth":trial.suggest_int("max_depth", 4, 16),
      'random_strength' :trial.suggest_int('random_strength', 0, 100),
      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      "max_bin": trial.suggest_int("max_bin", 200, 500),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
  }
  X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)
  cat_features = range(X_test.shape[1])
  cat = CatBoostRegressor(**param)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_pred = cat.predict(X_valid)
  rmse = np.sqrt(mean_squared_error(y_valid, cat_pred))
  return rmse

'''

'\ndef objective(trial):\n  param = {\n      "random_state":42,\n      \'learning_rate\' : trial.suggest_loguniform(\'learning_rate\', 0.01, 0.05),\n      \'bagging_temperature\' :trial.suggest_loguniform(\'bagging_temperature\', 0.01, 100.00),\n      "n_estimators":trial.suggest_int("n_estimators", 500, 5000),\n      "max_depth":trial.suggest_int("max_depth", 4, 16),\n      \'random_strength\' :trial.suggest_int(\'random_strength\', 0, 100),\n      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),\n      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),\n      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),\n      "max_bin": trial.suggest_int("max_bin", 200, 500),\n      \'od_type\': trial.suggest_categorical(\'od_type\', [\'IncToDec\', \'Iter\']),\n  }\n  X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)\n  cat_features = range(X_test.shape[1])\n  cat = CatBoostRegressor(**param)\n  cat.fit(X_train, y_train,\

In [20]:
# Disabled Optuna study, kept as a bare string so that it is not executed.
# It would run 10 trials of `objective` with a seeded TPE sampler, minimising the
# returned value, and print the best score and parameters.
'''
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)
'''

'\nsampler = TPESampler(seed=42)\nstudy = optuna.create_study(\n    study_name = \'cat_parameter_opt\',\n    direction = \'minimize\',\n    sampler = sampler,\n)\nstudy.optimize(objective, n_trials=10)\nprint("Best Score:",study.best_value)\nprint("Best trial",study.best_trial.params)\n'

In [21]:
# Catboost 결과

# [I 2022-06-14 09:48:36,571] Trial 9 finished with value: 5.725697967772534 and parameters: {'learning_rate': 0.011896326393433528, 'bagging_temperature': 0.0133572404119741, 'n_estimators': 3364, 'max_depth': 8, 'random_strength': 51, 'colsample_bylevel': 0.9445398843556558, 'l2_leaf_reg': 7.486273952174759e-06, 'min_child_samples': 44, 'max_bin': 427, 'od_type': 'IncToDec'}. Best is trial 5 with value: 4.622928122178303.
# Stopped by overfitting detector  (35 iterations wait)
# 
# bestTest = 5.725697968
# bestIteration = 411
# 
# Shrink model to first 412 iterations.
# Best Score: 4.622928122178303
# Best trial {'learning_rate': 0.04409226795827594, 'bagging_temperature': 0.022592797420156956, 'n_estimators': 1382, 'max_depth': 4, 'random_strength': 32, 'colsample_bylevel': 0.6332063738136893, 'l2_leaf_reg': 8.147757462899138e-06, 'min_child_samples': 84, 'max_bin': 307, 'od_type': 'Iter'}

In [22]:
# CatBoost hyper-parameters copied from the best Optuna trial recorded in this notebook.
cat_param = dict(
  learning_rate=0.04409226795827594,
  bagging_temperature=0.022592797420156956,
  n_estimators=1382,
  max_depth=4,
  random_strength=32,
  colsample_bylevel=0.6332063738136893,
  l2_leaf_reg=8.147757462899138e-06,
  min_child_samples=84,
  max_bin=307,
  od_type='Iter',
)

In [23]:
# Pre-compute the (train indices, validation indices) pair of each of the 10
# shuffled folds, stratified on the 운송장_건수 target.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = [
  (fit_rows, holdout_rows)
  for fit_rows, holdout_rows in skf.split(train, train['운송장_건수'])
]

In [24]:
# Train one CatBoost regressor per fold and keep every fitted model in cat_models.
random.seed(42)
cat_models = {}

# All columns of X_test are passed as categorical features (by column position).
cat_features = range(X_test.shape[1])

# Split inputs and target once instead of re-dropping the target on every fold.
fold_features = train.drop(['운송장_건수'], axis=1)
fold_target = train['운송장_건수']

for fold, (train_idx, valid_idx) in enumerate(folds):
  print(f'===================================={fold+1}============================================')
  X_train = fold_features.iloc[train_idx]
  X_valid = fold_features.iloc[valid_idx]
  # The fold arrays are row positions, so the target is selected with .iloc just
  # like the features. Label-based selection only matches when the index is 0..n-1.
  y_train = fold_target.iloc[train_idx].values
  y_valid = fold_target.iloc[valid_idx].values

  cat = CatBoostRegressor(**cat_param)
  # Both the training and the validation part are reported during fitting;
  # training stops after 35 rounds without improvement.
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid, y_valid)],
          early_stopping_rounds=35, cat_features=cat_features,
          verbose=100)
  cat_models[fold] = cat
  print('================================================================================\n\n')

0:	learn: 6.7637619	test: 6.7633471	test1: 6.6556283	best: 6.6556283 (0)	total: 68.5ms	remaining: 1m 34s
100:	learn: 6.6433714	test: 6.6113719	test1: 6.5206210	best: 6.5206210 (100)	total: 941ms	remaining: 11.9s
200:	learn: 6.5976837	test: 6.5677778	test1: 6.4844652	best: 6.4844652 (200)	total: 1.74s	remaining: 10.2s
300:	learn: 6.5259517	test: 6.4600701	test1: 6.4175723	best: 6.4158282 (299)	total: 2.67s	remaining: 9.58s
400:	learn: 6.4309108	test: 6.3484195	test1: 6.3064497	best: 6.3018283 (381)	total: 3.53s	remaining: 8.63s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 6.301828281
bestIteration = 381

Shrink model to first 382 iterations.


0:	learn: 6.6268911	test: 6.6262718	test1: 7.8046215	best: 7.8046215 (0)	total: 14.3ms	remaining: 19.8s
100:	learn: 6.4799163	test: 6.3944126	test1: 7.5560040	best: 7.5560040 (100)	total: 841ms	remaining: 10.7s
200:	learn: 6.4457610	test: 6.3282891	test1: 7.4992802	best: 7.4992802 (199)	total: 1.58s	remaining: 9.29s
300:	learn

In [25]:
# Average the predictions of all fold models into the submission column.
# The sum is built in a float array and assigned once, so the result does not
# depend on pandas upcasting the existing column in place, and the divisor
# follows the number of trained models instead of a hard-coded 10.
fold_models = list(cat_models.values())
averaged_pred = np.zeros(len(test))
for model in fold_models:
  averaged_pred += model.predict(test) / len(fold_models)
submission['운송장_건수'] = averaged_pred

In [26]:
# Keep the first row for each distinct 운송장_건수 value, once for the training
# frame and once for the predictions in the submission.
df_new, sub_new = (
  frame.drop_duplicates(subset=["운송장_건수"])
  for frame in (train1, submission)
)

In [27]:
# Print the 운송장_건수 values held in df_new in ascending order
print(sorted(df_new['운송장_건수']))

[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 58, 59, 60, 61, 62, 63, 66, 67, 68, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 89, 90, 91, 93, 100, 103, 105, 108, 109, 118, 120, 122, 123, 130, 134, 150, 151, 160, 164, 179, 195, 197, 211, 239, 413]


In [28]:
# 운송장_건수 values held in sub_new, sorted ascending
sub_new['운송장_건수'].sort_values()

5796     3.215109
2778     3.418880
3902     3.427199
4969     3.428776
3196     3.428819
          ...    
2186    20.170095
1944    20.218253
7089    23.402692
4937    26.919251
1762    46.476129
Name: 운송장_건수, Length: 7920, dtype: float64

In [29]:
# 동떨어진 데이터 하나가 있다. 
# 원래 데이터에서도 동떨어진 데이터가 하나 있었다.
# 맞춰보자.

In [30]:
# Predictions larger than 30
submission.loc[submission.운송장_건수>30,'운송장_건수']

1762    46.476129
Name: 운송장_건수, dtype: float64

In [31]:
# Mean of the predictions larger than 30
submission.loc[submission.운송장_건수>30,'운송장_건수'].mean()

46.47612946656219

In [32]:
# 운송장_건수 values in df_new that are larger than 160
df_new.loc[df_new.운송장_건수>160,'운송장_건수']

3898     197
8585     164
17281    211
23053    179
24463    195
25394    413
30090    239
Name: 운송장_건수, dtype: int64

In [33]:
# Mean of the 운송장_건수 values in df_new that are larger than 160
df_new.loc[df_new.운송장_건수>160,'운송장_건수'].mean()

228.28571428571428

In [34]:
# The largest training value was 413 but the largest prediction is only 46.476129, so rescale to match.
# Ratio of the two means computed in the preceding cells (values above 160 / predictions above 30).
228.28571428571428 / 46.47612946656219

4.911891693777064

In [35]:
# Multiply every prediction above 30 by the fixed ratio 4.911891693777064,
# then write the submission file without the index column.
large_pred_mask = submission.운송장_건수 > 30
submission.loc[large_pred_mask, '운송장_건수'] *= 4.911891693777064
submission.to_csv(
  '5416cat_submission_30mean_multiple_4_91189.csv',
  index=False,
)