# Import

In [1]:
import pandas as pd
import numpy as np
import gc
import os
from sklearn.preprocessing import LabelEncoder
import requests
from dateutil.parser import parse
from datetime import date, datetime, time

# Convert CSV to Parquet

In [2]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into ``./<save_name>.parquet``.

    Args:
        csv_path: Path of the CSV file, read with ``pd.read_csv``.
        save_name: Base name (no extension) of the Parquet file written to
            the current working directory.
    """
    destination = f'./{save_name}.parquet'
    # The intermediate frame is never bound to a name, so nothing references
    # it once the write returns.
    pd.read_csv(csv_path).to_parquet(destination)
    gc.collect()
    print(save_name, 'Done.')

In [3]:
# Convert both competition CSV files to Parquet, train first.
for split_name in ('train', 'test'):
    csv_to_parquet(f'./{split_name}.csv', split_name)

train Done.
test Done.


# Data load

In [2]:
from email.utils import parsedate

# Read the Parquet copies of the train and test sets.
train, test = (pd.read_parquet(path) for path in ('./train.parquet', './test.parquet'))

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,road_in_use,lane_count,road_rating,road_name,multi_linked,connect_code,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,0,1,106,지방도1112호선,0,0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,0,2,103,일반국도11호선,0,0,...,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,0,2,103,일반국도16호선,0,0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,0,2,107,태평로,0,0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,0,2,103,일반국도12호선,0,0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


## Weather

In [4]:
def weather(pageno, pagesize, location):
    """Fetch one page of hourly ASOS observations from the data.go.kr API.

    The query window is fixed to 2021-09-01 00h through 2022-08-31 23h.

    The service key is read from the ``DATA_GO_KR_SERVICE_KEY`` environment
    variable so that the credential is not stored in the notebook.
    NOTE(review): a key was previously committed in this cell; it should be
    rotated.

    Args:
        pageno: Page number to request (sent as ``pageNo``).
        pagesize: Number of rows per page (sent as ``numOfRows``).
        location: Station id (sent as ``stnIds``).

    Returns:
        pandas.DataFrame built from the ``item`` records of the response;
        an empty DataFrame when the response carries no items.

    Raises:
        RuntimeError: If ``DATA_GO_KR_SERVICE_KEY`` is not set.
        requests.HTTPError: If the server answers with an error status.
    """
    service_key = os.environ.get('DATA_GO_KR_SERVICE_KEY')
    if not service_key:
        raise RuntimeError(
            'Set the DATA_GO_KR_SERVICE_KEY environment variable to your '
            'data.go.kr service key before calling weather().'
        )

    url = 'http://apis.data.go.kr/1360000/AsosHourlyInfoService/getWthrDataList'
    params = {
        'serviceKey': service_key,
        'pageNo': f'{pageno}',
        'numOfRows': f'{pagesize}',
        'dataType': 'JSON',
        'dataCd': 'ASOS',
        'dateCd': 'HR',
        'startDt': '20210901',
        'startHh': '00',
        'endDt': '20220831',
        'endHh': '23',
        'stnIds': f'{location}',
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()

    # Walk the nested payload defensively: any level may be missing, and
    # `items` may not be a mapping when there are no rows.
    body = (response.json().get('response') or {}).get('body') or {}
    items = body.get('items')
    records = items.get('item') if isinstance(items, dict) else None
    return pd.DataFrame(records or [])

In [5]:
# Short column codes mapped to Korean display names for the weather frames.
# Insertion order matters: the visible code applies the values positionally
# as column names.
# NOTE(review): 'stnld' and 'rnQCflg' look like typos of 'stnId' / 'rnQcflg';
# confirm against the API response before using the keys for lookups.
name_dict = {
    # identification
    'tm': '시간',
    'rnum': '목록 순서',
    'stnld': '지점 번호',
    'stnNm': '관측소',
    # temperature / precipitation / wind / humidity
    'ta': '기온',
    'taQcflg': '기온 품질검사',
    'rn': '강수량',
    'rnQCflg': '강수량 품질검사',
    'ws': '풍속',
    'wsQcflg': '풍속 품질검사',
    'wd': '풍향',
    'wdQcflg': '풍향 품질검사',
    'hm': '습도',
    'hmQcflg': '습도 품질검사',
    # vapour / pressure
    'pv': '증기압',
    'td': '이슬점온도',
    'pa': '현지기압',
    'paQcflg': '현지기압 품질검사',
    'ps': '해면기압',
    'psQcflg': '해면기압 품질검사',
    # sunshine / snow / cloud / visibility
    'ss': '일조',
    'ssQcflg': '일조 품질검사',
    'icsr': '일사',
    'dsnw': '적설',
    'hr3snow': '3시간신적설',
    'dc10Tca': '전운량',
    'dc10LmcsCa': '중하층운량',
    'clfmAbbrCd': '운형',
    'lcsChgTm': '최저운고',
    'vs': '시정',
    # ground state and temperatures
    'gndSttCd': '지면상태',
    'dmstMtphNo': '현상번호',
    'ts': '지면온도',
    'tsQcflg': '지면온도 품질검사',
    'm005Te': '5cm 지중온도',
    'm01Te': '10cm 지중온도',
    'm02Te': '20cm 지중온도',
    'm03Te': '30cm 지중온도',
}

In [6]:
# Station id -> name of the module-level variable holding that station's frame.
center_dic = {184 : 'jeju',
              185 : 'gosan',
              188 : 'seongsan',
              189 : 'seogwipo'}

station_frames = {}
for station_id, station_name in center_dic.items():
    # Pages 1..9 at 999 rows each give up to 8,991 rows; the query window
    # in weather() spans 8,760 hours.
    pages = [weather(page, 999, station_id) for page in range(1, 10)]

    # Post-process once per station, after every page has been downloaded.
    frame = pd.concat(pages, ignore_index=True)
    # Rename columns positionally.
    # NOTE(review): this assumes the API returns its fields in the same
    # order as name_dict — confirm.
    frame.columns = list(name_dict.values())
    # Parse the timestamp column.
    frame['시간'] = pd.to_datetime(frame['시간'])
    station_frames[station_name] = frame

# Explicit names instead of globals() injection; later cells use these.
jeju = station_frames['jeju']
gosan = station_frames['gosan']
seongsan = station_frames['seongsan']
seogwipo = station_frames['seogwipo']

# Combine the weather data: one row per timestamp, with precipitation and
# snow-depth columns for each station.
# Stations are aligned on the timestamp, not on row position, so a missing or
# duplicated hour at one station cannot shift its values onto the wrong time.
station_prefixes = {'제주': jeju, '고산': gosan, '성산': seongsan, '서귀포': seogwipo}

station_parts = []
for prefix, frame in station_prefixes.items():
    part = (
        frame[['시간', '강수량', '적설']]
        # Unique timestamps are required for the index-aligned concat below.
        .drop_duplicates(subset='시간')
        .set_index('시간')
        .rename(columns={'강수량': f'{prefix} 강수량', '적설': f'{prefix} 적설'})
    )
    station_parts.append(part)

weather_df = pd.concat(station_parts, axis=1).sort_index()

# Empty strings become NaN, columns are converted to float64, and missing
# observations are filled with 0.
weather_df = weather_df.replace('', np.nan).astype('float64').fillna(0.0)

# Restore '시간' as the first regular column.
weather_df = weather_df.rename_axis('시간').reset_index()

# Preprocess

In [7]:
# Build timestamps from base_date + base_hour.
def make_time(df):
    """Combine ``base_date`` and ``base_hour`` into one timestamp per row.

    Vectorised, so it does not depend on the frame having a default
    ``RangeIndex``, and it does not modify ``df``.

    Args:
        df: DataFrame with a ``base_date`` column holding ``YYYYMMDD`` values
            (int or str) and a ``base_hour`` column holding the hour as an
            integer.

    Returns:
        list of ``pandas.Timestamp`` (a ``datetime`` subclass), one per row,
        in row order.

    Raises:
        ValueError: If a ``base_date`` value does not match ``YYYYMMDD``.
    """
    dates = pd.to_datetime(df['base_date'].astype('str'), format='%Y%m%d')
    hours = pd.to_timedelta(df['base_hour'], unit='h')
    return (dates + hours).tolist()

In [8]:
# Build the hourly timestamp key and attach the weather features.
train['시간'] = make_time(train)
test['시간'] = make_time(test)

# validate='many_to_one' makes pandas raise MergeError if weather_df has
# duplicate timestamps, instead of silently multiplying train/test rows
# (which would misalign the predictions with the submission file).
train = pd.merge(train, weather_df, on='시간', how='left', validate='many_to_one')
test = pd.merge(test, weather_df, on='시간', how='left', validate='many_to_one')

In [9]:
# String-valued columns that are label-encoded to integers.
str_col = ['day_of_week', 'start_turn_restricted', 'end_turn_restricted']

for col in str_col:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])

    # Labels that occur only in test are appended after the classes learned
    # from train, so they get new codes.
    unseen = [label for label in np.unique(test[col]) if label not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    test[col] = encoder.transform(test[col])

In [10]:
# Columns left out of the model inputs for both train and test.
non_feature_cols = ['id', 'base_date', 'road_name', 'start_node_name',
                    'end_node_name', 'vehicle_restricted', '시간']

y_train = train['target']
X_train = train.drop(columns=non_feature_cols + ['target'])
test = test.drop(columns=non_feature_cols)

# Shapes of features, target and test set, in that order.
for part in (X_train, y_train, test):
    print(part.shape)

(4701217, 25)
(4701217,)
(291241, 25)


# Autokeras

In [11]:
import tensorflow as tf
import autokeras as ak

In [None]:
# Column name -> AutoKeras column type. The dict is the single source of truth
# for both `column_names` and `column_types`, so the two cannot drift apart.
# The order is assumed to match the column order of X_train — TODO confirm.
# The trailing OHE / LE notes are the original author's annotations.
column_types = {
    'day_of_week': 'categorical',  # OHE
    'base_hour': 'categorical',  # LE
    'road_in_use': 'categorical',  # OHE
    'lane_count': 'numerical',
    'road_rating': 'categorical',  # LE
    'multi_linked': 'categorical',  # OHE
    'connect_code': 'categorical',  # OHE
    'maxium_speed_limit': 'numerical',
    'weight_restricted': 'categorical',  # LE
    'height_restricted': 'categorical',  # LE
    'road_type': 'categorical',  # OHE
    'start_latitude': 'numerical',
    'start_longitude': 'numerical',
    'start_turn_restricted': 'categorical',  # OHE
    'end_latitude': 'numerical',
    'end_longitude': 'numerical',
    'end_turn_restricted': 'categorical',  # OHE
    '제주 강수량': 'numerical',
    '제주 적설': 'numerical',
    '고산 강수량': 'numerical',
    '고산 적설': 'numerical',
    '성산 강수량': 'numerical',
    '성산 적설': 'numerical',
    '서귀포 강수량': 'numerical',
    '서귀포 적설': 'numerical',
}

# The search tries up to `max_trials` (5) candidate models.
reg = ak.StructuredDataRegressor(
    column_names=list(column_types),
    column_types=column_types,
    max_trials=5,
    overwrite=True,
)
# Feed the structured data regressor with training data.
reg.fit(X_train, y_train, validation_split=0.15, epochs=10)

# Export the best model found by the search as a Keras model.
model = reg.export_model()
model.summary()

# Save the model.
try:
    model.save("model_autokeras", save_format="tf")
except Exception as exc:
    # Best-effort fallback to HDF5. Report it, because the artifact then
    # lives at a different path than the SavedModel directory.
    print(f"SavedModel export failed ({exc!r}); saving to model_autokeras.h5 instead.")
    model.save("model_autokeras.h5")

In [None]:
# The save step writes either the SavedModel directory or, as a fallback,
# an .h5 file; load whichever one exists.
model_path = "model_autokeras" if os.path.exists("model_autokeras") else "model_autokeras.h5"
loaded_model = tf.keras.models.load_model(model_path, custom_objects=ak.CUSTOM_OBJECTS)

In [19]:
# Print the layer-by-layer summary of the exported model.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 25)]              0         
                                                                 
 multi_category_encoding (Mu  (None, 25)               0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 25)               51        
 n)                                                              
                                                                 
 dense (Dense)               (None, 32)                832       
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056  

In [18]:
# Flatten the predictions so one value is assigned per row.
# NOTE(review): predict presumably returns shape (n, 1) — confirm.
pred = np.ravel(model.predict(test))

# Save the results.
sample_submission = pd.read_csv('./sample_submission.csv')
if len(pred) != len(sample_submission):
    # Guard against a row-count mismatch between test and the template.
    raise ValueError(
        f"Got {len(pred)} predictions for {len(sample_submission)} submission rows."
    )
sample_submission['target'] = pred
sample_submission.to_csv("./submit.csv", index = False)

sample_submission



Unnamed: 0,id,target
0,TEST_000000,28.355175
1,TEST_000001,44.837040
2,TEST_000002,61.980816
3,TEST_000003,35.273144
4,TEST_000004,44.210144
...,...,...
291236,TEST_291236,50.531822
291237,TEST_291237,53.113892
291238,TEST_291238,23.301003
291239,TEST_291239,25.477617
