# Import

In [1]:
import pandas as pd
import numpy as np
import gc
import os
from sklearn.preprocessing import LabelEncoder
import requests
from dateutil.parser import parse
from datetime import date, datetime, time
from tqdm import tqdm

# Data load

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,road_in_use,lane_count,road_rating,road_name,multi_linked,connect_code,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,0,1,106,지방도1112호선,0,0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,0,2,103,일반국도11호선,0,0,...,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,0,2,103,일반국도16호선,0,0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,0,2,107,태평로,0,0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,0,2,103,일반국도12호선,0,0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


In [4]:
train.columns

Index(['id', 'base_date', 'day_of_week', 'base_hour', 'road_in_use',
       'lane_count', 'road_rating', 'road_name', 'multi_linked',
       'connect_code', 'maximum_speed_limit', 'vehicle_restricted',
       'weight_restricted', 'height_restricted', 'road_type',
       'start_node_name', 'start_latitude', 'start_longitude',
       'start_turn_restricted', 'end_node_name', 'end_latitude',
       'end_longitude', 'end_turn_restricted', 'target'],
      dtype='object')

## weather data

In [5]:
# weather request
def weather(pageno, pagesize, location):
    url = 'http://apis.data.go.kr/1360000/AsosHourlyInfoService/getWthrDataList'
    params ={
        'serviceKey' : "/xks6pL5Kc8ECmrvgE/kCuKk7S47gJpVQHkJtpR373/Gj+ilPL6oQgJz6utjiWFALXUgEVbIfm4lzinVI69/MA==",
        'pageNo' : f'{pageno}',
        'numOfRows': f'{pagesize}',
        'dataType' : 'JSON', 
        'dataCd' : 'ASOS', 
        'dateCd' : 'HR',
        'startDt' : '20210901',
        'startHh' : '00', 
        'endDt' : '20220831', 
        'endHh' : '23',
        'stnIds' : f'{location}' }

    response = requests.get(url, params=params)
    data = response.json().get('response').get('body').get('items').get('item')
    df = pd.DataFrame(data)
    return df

In [6]:
name_dict = {'tm' : '시간',
             'rnum' : '목록 순서',
             'stnld' : '지점 번호',
             'stnNm' : '관측소',
             'ta' : '기온',
             'taQcflg' : '기온 품질검사',
             'rn' : '강수량',
             'rnQCflg' : '강수량 품질검사',
             'ws' : '풍속',
             'wsQcflg' : '풍속 품질검사',
             'wd' : '풍향',
             'wdQcflg' : '풍향 품질검사',
             'hm' : '습도',
             'hmQcflg' : '습도 품질검사',
             'pv' : '증기압',
             'td' : '이슬점온도',
             'pa' : '현지기압',
             'paQcflg' : '현지기압 품질검사',
             'ps' : '해면기압',
             'psQcflg' : '해면기압 품질검사',
             'ss' : '일조',
             'ssQcflg' : '일조 품질검사',
             'icsr' : '일사',
             'dsnw' : '적설',
             'hr3snow' : '3시간신적설',
             'dc10Tca' : '전운량',
             'dc10LmcsCa' : '중하층운량',
             'clfmAbbrCd' : '운형',
             'lcsChgTm' : '최저운고',
             'vs' : '시정',
             'gndSttCd' : '지면상태',
             'dmstMtphNo' : '현상번호',
             'ts' : '지면온도',
             'tsQcflg' : '지면온도 품질검사',
             'm005Te' : '5cm 지중온도',
             'm01Te' : '10cm 지중온도',
             'm02Te' : '20cm 지중온도',
             'm03Te' : '30cm 지중온도'
            }

In [7]:
center_dic = {184 : 'jeju',
              185 : 'gosan',
              188 : 'seongsan',
              189 : 'seogwipo'}

for i in center_dic.keys():
    dataset = []
    for j in range(1, 10) :
        data = weather(j, 999, i)
        dataset.append(data)
        globals()[f'{center_dic[i]}'] = pd.concat(dataset)
        # reset_index
        globals()[f'{center_dic[i]}'].reset_index(drop=True, inplace=True)
        # 컬럼명 변경
        globals()[f'{center_dic[i]}'].columns = name_dict.values()
        # 시간
        globals()[f'{center_dic[i]}']['시간'] = pd.to_datetime(globals()[f'{center_dic[i]}']['시간'])

# 날씨 데이터 합치기
weather_df = pd.DataFrame()
weather_df['시간'] = jeju['시간']
weather_df['제주 강수량'] = jeju['강수량']
weather_df['제주 적설'] = jeju['적설']
weather_df['고산 강수량'] = gosan['강수량']
weather_df['고산 적설'] = gosan['적설']
weather_df['성산 강수량'] = seongsan['강수량']
weather_df['성산 적설'] = seongsan['적설']
weather_df['서귀포 강수량'] = seogwipo['강수량']
weather_df['서귀포 적설'] = seogwipo['적설']

# replace, fillna
weather_df.replace('', np.nan, inplace=True)
weather_df.fillna(0, inplace=True)

# column 타입 변경
weather_df = weather_df.astype({'제주 강수량' : 'float64',
                                '제주 적설' : 'float64',
                                '고산 강수량' : 'float64',
                                '고산 적설' : 'float64',
                                '성산 강수량' : 'float64',
                                '성산 적설' : 'float64',
                                '서귀포 강수량' : 'float64',
                                '서귀포 적설' : 'float64'})

# Preprocess

In [8]:
# 날씨 데이터와 병합을 위한 시간 만들기 함수
def make_time(df):
    timelist = []
    df['base_date'] = df['base_date'].astype('str')
    for i in tqdm(range(len(df))):
        timelist.append(datetime.combine(parse(df['base_date'][i]), time(df['base_hour'][i])))
    return timelist

# 병합
train['시간'] = make_time(train)
test['시간'] = make_time(test)
train = pd.merge(train, weather_df, on='시간', how='left')
test = pd.merge(test, weather_df, on='시간', how='left')

100%|██████████| 4701217/4701217 [03:54<00:00, 20016.52it/s]
100%|██████████| 291241/291241 [00:14<00:00, 19927.61it/s]


In [9]:
# 시간 -> sin 변환 함수
from sklearn.preprocessing import FunctionTransformer

def sin_transformer(period):
	return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def sin_transform(df):
	# 날짜 -> day of year
	df['day_of_year'] = df['시간'].apply(lambda x : x.strftime('%j'))
	df['day_of_year'] = df['day_of_year'].astype('int')

	# 날짜 -> day of week
	df['day_of_week'] = df['시간'].apply(lambda x : x.weekday())

	# 변환
	df['base_hour'] = sin_transformer(24).fit_transform(df['base_hour'].values.reshape(-1, 1))
	df['day_of_year'] = sin_transformer(365).fit_transform(df['day_of_year'].values.reshape(-1, 1))
	df['day_of_week'] = sin_transformer(7).fit_transform(df['day_of_week'].values.reshape(-1, 1))
	return df

train = sin_transform(train)
test = sin_transform(test)

In [10]:
# 위도, 경도 -> Radian 변환 함수
def radianm_transformer(df):
    df['start_latitude'] = df['start_latitude'] * np.pi / 180
    df['start_longitude'] = df['start_longitude'] * np.pi / 180
    df['end_latitude'] = df['end_latitude'] * np.pi / 180
    df['end_longitude'] = df['end_longitude'] * np.pi / 180
    return df

# 변환
train = radianm_transformer(train)
test = radianm_transformer(test)

In [12]:
# OneHot Encoding
train = pd.get_dummies(train, columns=['road_in_use', 'road_rating', 'road_type', 'start_turn_restricted', 'end_turn_restricted'], prefix=['road_in_use', 'road_rating', 'road_type', 'start_turn_restricted', 'end_turn_restricted'], drop_first=False)
test = pd.get_dummies(test, columns=['road_in_use', 'road_rating', 'road_type', 'start_turn_restricted', 'end_turn_restricted'], prefix=['road_in_use', 'road_rating', 'road_type', 'start_turn_restricted', 'end_turn_restricted'], drop_first=False)

In [13]:
# 공휴일 데이터
from pytimekr import pytimekr
kr_holidays_2021 = pytimekr.holidays(2021)
kr_holidays_2022 = pytimekr.holidays(2022)

def make_holidays(df):
    holidays = []
    for i in tqdm(range(len(df))):
        if df['시간'][i].date() in kr_holidays_2021:
            holidays.append(1)
        elif df['시간'][i].date() in kr_holidays_2022:
            holidays.append(1)
        else:
            holidays.append(0)
    return holidays

# 공휴일 데이터 추가
train['holidays'] = make_holidays(train)
test['holidays'] = make_holidays(test)

print(train['holidays'].value_counts())
print(test['holidays'].value_counts())

# 주말 추가
def make_weekend(df):
    for i in tqdm(range(len(df))):
        if df['day_of_week'][i] == sin_transformer(7).fit_transform(5) or df['day_of_week'][i] == sin_transformer(7).fit_transform(6):
            df['holidays'][i] = 1
    return df

train = make_weekend(train)
test = make_weekend(test)

print(train['holidays'].value_counts())
print(test['holidays'].value_counts())

# 휴일 OneHot Encoding
train = pd.get_dummies(train, columns=['holidays'], prefix=['holidays'], drop_first=False)
test = pd.get_dummies(test, columns=['holidays'], prefix=['holidays'], drop_first=False)

100%|██████████| 4701217/4701217 [02:57<00:00, 26450.80it/s]
100%|██████████| 291241/291241 [00:08<00:00, 35937.85it/s]


0    4445197
1     256020
Name: holidays, dtype: int64
0    281962
1      9279
Name: holidays, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['holidays'][i] = 1
100%|██████████| 4701217/4701217 [08:29<00:00, 9222.86it/s] 
100%|██████████| 291241/291241 [00:25<00:00, 11524.22it/s]


0    3193528
1    1507689
Name: holidays, dtype: int64
0    207564
1     83677
Name: holidays, dtype: int64


In [14]:
# 사용하지 않을 변수 제거
train = train.drop(['id', 'road_name', 'start_node_name', 'end_node_name', 'vehicle_restricted', '시간', 'base_date'], axis=1)
test = test.drop(['id', 'road_name', 'start_node_name', 'end_node_name', 'vehicle_restricted', '시간', 'base_date'], axis=1)

print(train.shape)
print(test.shape)

(4701217, 35)
(291241, 34)


In [17]:
# csv 파일로 저장
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [18]:
train.head()

Unnamed: 0,day_of_week,base_hour,lane_count,multi_linked,connect_code,maximum_speed_limit,weight_restricted,height_restricted,start_latitude,start_longitude,...,road_rating_106,road_rating_107,road_type_0,road_type_3,start_turn_restricted_없음,start_turn_restricted_있음,end_turn_restricted_없음,end_turn_restricted_있음,holidays_0,holidays_1
0,0.433884,-0.965926,1,0,0,60.0,32400.0,0.0,0.583424,2.21068,...,1,0,0,1,1,0,1,0,1,0
1,0.433884,-0.707107,2,0,0,60.0,0.0,0.0,0.584698,2.20835,...,0,0,1,0,0,1,1,0,1,0
2,-0.781831,0.965926,2,0,0,80.0,0.0,0.0,0.580831,2.205548,...,0,0,1,0,1,0,1,0,0,1
3,-0.433884,-0.258819,2,0,0,50.0,0.0,0.0,0.580254,2.209014,...,0,1,1,0,1,0,1,0,1,0
4,0.781831,0.866025,2,0,0,80.0,0.0,0.0,0.584026,2.204814,...,0,0,1,0,1,0,1,0,1,0


In [19]:
test.head()

Unnamed: 0,day_of_week,base_hour,lane_count,multi_linked,connect_code,maximum_speed_limit,weight_restricted,height_restricted,start_latitude,start_longitude,...,road_rating_106,road_rating_107,road_type_0,road_type_3,start_turn_restricted_없음,start_turn_restricted_있음,end_turn_restricted_없음,end_turn_restricted_있음,holidays_0,holidays_1
0,0.433884,-0.9659258,3,0,0,70.0,0.0,0.0,0.584675,2.208562,...,0,1,1,0,1,0,0,1,1,0
1,0.781831,1.224647e-16,2,0,0,70.0,0.0,0.0,0.58047,2.206567,...,0,0,0,1,1,0,1,0,1,0
2,-0.433884,0.5,1,0,0,60.0,0.0,0.0,0.580478,2.207431,...,0,0,1,0,1,0,1,0,1,0
3,0.433884,-0.258819,3,0,0,70.0,0.0,0.0,0.584223,2.208638,...,0,0,1,0,1,0,1,0,1,0
4,0.974928,-0.9659258,3,0,0,70.0,0.0,0.0,0.584711,2.20905,...,1,0,1,0,1,0,1,0,1,0


In [None]:
# Scaling X, 인코딩하지 않은 것만, 전부 시도해보기
# 다중공산성 제거해보기