In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.impute import KNNImputer

import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
warnings.filterwarnings(action='ignore')

In [149]:
# Load the four yearly weather CSVs (comma-separated, cp949-encoded) in one
# pass and bind each resulting frame to its own name.
weather_2018, weather_2019, weather_2020, weather_2021 = (
    pd.read_csv(csv_path, sep=',', encoding='cp949')
    for csv_path in ('2018_w.csv', '2019_w.csv', '2020_w.csv', '2021_w.csv')
)

In [150]:
# Display the 2019 weather table as the cell output.
weather_2019

Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),일조(hr),일사(MJ/m2),시정(10m),지면온도(°C)
0,108,서울,2019-09-11 00:00,22.8,0.6,1.8,360,95,26.3,21.9,1001.6,1011.4,,,408,23.5
1,108,서울,2019-09-11 01:00,22.8,2.2,1.7,360,97,26.8,22.2,1001.6,1011.4,,,145,23.6
2,108,서울,2019-09-11 02:00,22.6,2.0,2.5,360,97,26.4,22.0,1001.3,1011.1,,,405,23.4
3,108,서울,2019-09-11 03:00,21.9,2.3,1.6,360,97,25.3,21.3,1001.9,1011.8,,,240,23.0
4,108,서울,2019-09-11 04:00,21.2,0.9,1.8,360,97,24.3,20.6,1002.2,1012.1,,,2000,22.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,159,부산,2019-09-14 19:00,24.6,,1.4,230,74,22.8,19.6,1001.9,1009.8,0.3,0.03,2000,25.2
188,159,부산,2019-09-14 20:00,23.9,,0.6,250,78,23.1,19.8,1002.5,1010.4,,,2000,24.6
189,159,부산,2019-09-14 21:00,23.7,,1.8,250,81,23.7,20.2,1002.8,1010.7,,,2000,24.2
190,159,부산,2019-09-14 22:00,23.3,,2.5,200,83,23.7,20.2,1003.3,1011.2,,,2000,23.9


In [80]:
# List the column labels of the 2018 weather table.
weather_2018.columns

Index(['지점', '지점명', '일시', '기온(°C)', '강수량(mm)', '풍속(m/s)', '풍향(16방위)', '습도(%)',
       '증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)', '일조(hr)',
       '일사(MJ/m2)', '시정(10m)', '지면온도(°C)'],
      dtype='object')

In [151]:
# City-prefixed copies of the weather column labels; they are assigned as the
# column names of the per-city slices in the cell that builds weather_data.
busan_cols = ['부산 '+label for label in weather_2018.columns]
seoul_cols = ['서울 '+label for label in weather_2018.columns]

busan_cols, seoul_cols

(['부산 지점',
  '부산 지점명',
  '부산 일시',
  '부산 기온(°C)',
  '부산 강수량(mm)',
  '부산 풍속(m/s)',
  '부산 풍향(16방위)',
  '부산 습도(%)',
  '부산 증기압(hPa)',
  '부산 이슬점온도(°C)',
  '부산 현지기압(hPa)',
  '부산 해면기압(hPa)',
  '부산 일조(hr)',
  '부산 일사(MJ/m2)',
  '부산 시정(10m)',
  '부산 지면온도(°C)'],
 ['서울 지점',
  '서울 지점명',
  '서울 일시',
  '서울 기온(°C)',
  '서울 강수량(mm)',
  '서울 풍속(m/s)',
  '서울 풍향(16방위)',
  '서울 습도(%)',
  '서울 증기압(hPa)',
  '서울 이슬점온도(°C)',
  '서울 현지기압(hPa)',
  '서울 해면기압(hPa)',
  '서울 일조(hr)',
  '서울 일사(MJ/m2)',
  '서울 시정(10m)',
  '서울 지면온도(°C)'])

In [152]:
# Build one wide table per year -- timestamp, then the Seoul readings, then the
# Busan readings -- and stack the four years on top of each other.
yearly_frames = []

for data in [weather_2018, weather_2019, weather_2020, weather_2021]:
    # Station 159 is Busan, station 108 is Seoul.
    # BOTH slices are re-indexed to 0..n-1: pd.concat(axis=1) aligns on index
    # labels, so leaving the Seoul slice with its original labels only lines
    # up with Busan when the Seoul rows happen to come first in the file.
    busan_weather = data.loc[data['지점']==159].reset_index(drop=True)
    busan_weather.columns = busan_cols

    seoul_weather = data.loc[data['지점']==108].reset_index(drop=True)
    seoul_weather.columns = seoul_cols

    # NOTE(review): rows are paired purely by position; this assumes both
    # stations list the same hours in the same order -- confirm against the CSVs.
    new_data = pd.DataFrame(seoul_weather.loc[:, '서울 일시'])
    new_data.columns = ['일시']

    # The first three columns (station id, station name, timestamp) are dropped
    # from both sides; the timestamp is carried once via new_data.
    weather = pd.concat([seoul_weather.iloc[:, 3:], busan_weather.iloc[:, 3:]], axis=1)
    new_data = pd.concat([new_data, weather], axis=1)
    yearly_frames.append(new_data)

# Single concat at the end instead of growing a frame inside the loop.
weather_data = pd.concat(yearly_frames, axis=0)

In [153]:
# Write the combined weather table to CSV, leaving the index out of the file.
weather_data.to_csv('weather_data.csv', index=False)

In [154]:
# Collect the travel-time readings: for each year, take the last four columns
# of '<year>_1.csv' and stack them into one long column named '소요시간'.
travel_time_parts = []

for year in ['2018', '2019', '2020', '2021']:
    data = pd.read_csv(year+'_1.csv', sep=',', encoding='cp949')
    # NOTE(review): the columns are taken right-to-left (last column first).
    # These rows are later paired with weather_data by position, so confirm
    # that this order matches the chronological row order of the weather table.
    for i in [-1, -2, -3, -4]:
        new_data = data.iloc[:, i]
        travel_time_parts.append(new_data)

# One concat of all pieces, with the column named explicitly, instead of
# growing an empty DataFrame and relying on the implicit column label.
time_data = pd.concat(travel_time_parts, axis=0).to_frame(name='소요시간')

In [155]:
# Show both shapes side by side; the two tables are concatenated column-wise
# in a later cell, so their row counts need to match.
weather_data.shape, time_data.shape

((384, 27), (384, 1))

In [156]:
# Give both tables a fresh 0..n-1 index, discarding the old labels.
weather_data = weather_data.reset_index(drop=True)
time_data = time_data.reset_index(drop=True)

In [157]:
# Attach the travel-time column to the weather features. The column-wise concat
# aligns on index labels; a row-count mismatch would silently pad with NaN
# instead of failing, hence the explicit check.
assert len(weather_data) == len(time_data), \
    'weather_data and time_data must have the same number of rows'
data = pd.concat([weather_data, time_data], axis=1)

# Show every column when displaying frames (None removes the column limit);
# the limit was previously set from the row count rather than the column count.
pd.set_option('display.max_columns', None)
data

Unnamed: 0,일시,서울 기온(°C),서울 강수량(mm),서울 풍속(m/s),서울 풍향(16방위),서울 습도(%),서울 증기압(hPa),서울 이슬점온도(°C),서울 현지기압(hPa),서울 해면기압(hPa),서울 일조(hr),서울 일사(MJ/m2),서울 시정(10m),서울 지면온도(°C),부산 기온(°C),부산 강수량(mm),부산 풍속(m/s),부산 풍향(16방위),부산 습도(%),부산 증기압(hPa),부산 이슬점온도(°C),부산 현지기압(hPa),부산 해면기압(hPa),부산 일조(hr),부산 일사(MJ/m2),부산 시정(10m),부산 지면온도(°C),소요시간
0,2018-09-22 00:00,19.5,,1.1,250.0,81,18.3,16.1,1002.0,1011.9,,,2000,18.8,19.3,,3.9,340.0,94,21.0,18.3,1003.2,1011.2,,,2000,19.4,4:29
1,2018-09-22 01:00,19.5,,1.5,230.0,83,18.8,16.5,1002.0,1011.9,,,1459,18.9,19.3,,2.1,340.0,94,21.0,18.3,1003.5,1011.5,,,2000,19.5,4:28
2,2018-09-22 02:00,19.4,,0.0,0.0,85,19.1,16.8,1001.8,1011.7,,,1533,18.4,19.2,,4.8,360.0,92,20.4,17.8,1003.8,1011.9,,,2000,19.2,4:32
3,2018-09-22 03:00,18.8,,1.0,290.0,90,19.5,17.1,1002.3,1012.2,,,1191,17.7,19.3,,3.8,340.0,90,20.1,17.6,1004.3,1012.4,,,1924,19.0,4:34
4,2018-09-22 04:00,18.9,0.0,0.8,340.0,88,19.1,16.8,1002.2,1012.1,,,1226,18.9,19.3,,3.2,360.0,88,19.6,17.2,1004.5,1012.6,,,2000,18.5,4:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,2021-09-22 19:00,21.3,,2.8,250.0,72,18.2,16.0,999.6,1009.5,0.1,0.04,2000,20.2,22.9,,2.9,250.0,73,20.2,17.7,1000.9,1008.8,0.0,0.02,5000,22.8,04:30
380,2021-09-22 20:00,20.7,,3.0,250.0,75,18.3,16.1,1000.6,1010.5,,,2000,19.6,22.7,,3.9,270.0,78,21.4,18.6,1001.7,1009.7,,,3983,22.2,04:10
381,2021-09-22 21:00,20.5,,3.6,250.0,77,18.5,16.3,1001.5,1011.4,,,2000,19.4,22.7,,2.9,270.0,84,23.1,19.8,1002.3,1010.3,,,2676,22.1,04:38
382,2021-09-22 22:00,19.7,,2.9,270.0,75,17.2,15.1,1002.1,1012.0,,,2000,18.7,23.0,,2.4,270.0,86,24.1,20.5,1002.6,1010.5,,,1946,22.2,0


In [158]:
# Summarise the merged frame: per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 28 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   일시            384 non-null    object 
 1   서울 기온(°C)     384 non-null    float64
 2   서울 강수량(mm)    54 non-null     float64
 3   서울 풍속(m/s)    384 non-null    float64
 4   서울 풍향(16방위)   384 non-null    float64
 5   서울 습도(%)      384 non-null    int64  
 6   서울 증기압(hPa)   384 non-null    float64
 7   서울 이슬점온도(°C)  384 non-null    float64
 8   서울 현지기압(hPa)  384 non-null    float64
 9   서울 해면기압(hPa)  384 non-null    float64
 10  서울 일조(hr)     208 non-null    float64
 11  서울 일사(MJ/m2)  208 non-null    float64
 12  서울 시정(10m)    384 non-null    int64  
 13  서울 지면온도(°C)   384 non-null    float64
 14  부산 기온(°C)     384 non-null    float64
 15  부산 강수량(mm)    23 non-null     float64
 16  부산 풍속(m/s)    378 non-null    float64
 17  부산 풍향(16방위)   378 non-null    float64
 18  부산 습도(%)      384 non-null    

In [159]:
# Convert the wind-direction readings from degrees to radians, one city
# column at a time.
# NOTE: re-running this cell converts the already-converted values again.
for wind_col in ('서울 풍향(16방위)', '부산 풍향(16방위)'):
    data[wind_col] = np.radians(data[wind_col])

In [160]:
# Zero-fill missing values in every column EXCEPT the travel-time target.
# A blanket fillna(0) would turn a missing travel time into the number 0,
# which the later '0' (string) -> NaN replacement does not match, so it would
# survive as a 0-minute travel time instead of being imputed.
data = data.fillna({col: 0 for col in data.columns if col != '소요시간'})

In [161]:
# Count the missing values left in each column.
data.isna().sum()

일시              0
서울 기온(°C)       0
서울 강수량(mm)      0
서울 풍속(m/s)      0
서울 풍향(16방위)     0
서울 습도(%)        0
서울 증기압(hPa)     0
서울 이슬점온도(°C)    0
서울 현지기압(hPa)    0
서울 해면기압(hPa)    0
서울 일조(hr)       0
서울 일사(MJ/m2)    0
서울 시정(10m)      0
서울 지면온도(°C)     0
부산 기온(°C)       0
부산 강수량(mm)      0
부산 풍속(m/s)      0
부산 풍향(16방위)     0
부산 습도(%)        0
부산 증기압(hPa)     0
부산 이슬점온도(°C)    0
부산 현지기압(hPa)    0
부산 해면기압(hPa)    0
부산 일조(hr)       0
부산 일사(MJ/m2)    0
부산 시정(10m)      0
부산 지면온도(°C)     0
소요시간            0
dtype: int64

In [162]:
# Treat the placeholder string '0' as a missing travel time.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data['소요시간'] = data['소요시간'].replace('0', np.nan)

In [163]:
def _to_minutes(value):
    """Convert an 'H:MM' string to total minutes.

    Values whose string form contains no ':' (e.g. NaN) are returned unchanged.
    """
    text = str(value)
    if ':' not in text:
        return value
    h, m = text.split(':')
    return int(h)*60+int(m)


# Convert only the travel-time column. The previous row-by-row version wrote
# each whole row back through data.iloc[index], which pushes every column
# through an object-dtype row and mixes iterrows() labels with positions.
data['소요시간'] = data['소요시간'].map(_to_minutes)

In [165]:
# Cast the travel-time column to float; the other columns keep their dtypes.
data = data.astype({'소요시간': float})

In [169]:
# Keep everything except the leading timestamp column; this frame is what the
# imputer is fitted on in the next cell.
tmp = data.loc[:, data.columns[1:]]
tmp

Unnamed: 0,서울 기온(°C),서울 강수량(mm),서울 풍속(m/s),서울 풍향(16방위),서울 습도(%),서울 증기압(hPa),서울 이슬점온도(°C),서울 현지기압(hPa),서울 해면기압(hPa),서울 일조(hr),서울 일사(MJ/m2),서울 시정(10m),서울 지면온도(°C),부산 기온(°C),부산 강수량(mm),부산 풍속(m/s),부산 풍향(16방위),부산 습도(%),부산 증기압(hPa),부산 이슬점온도(°C),부산 현지기압(hPa),부산 해면기압(hPa),부산 일조(hr),부산 일사(MJ/m2),부산 시정(10m),부산 지면온도(°C),소요시간
0,19.5,0.0,1.1,4.363323,81,18.3,16.1,1002.0,1011.9,0.0,0.00,2000,18.8,19.3,0.0,3.9,5.934119,94,21.0,18.3,1003.2,1011.2,0.0,0.00,2000,19.4,269.0
1,19.5,0.0,1.5,4.014257,83,18.8,16.5,1002.0,1011.9,0.0,0.00,1459,18.9,19.3,0.0,2.1,5.934119,94,21.0,18.3,1003.5,1011.5,0.0,0.00,2000,19.5,268.0
2,19.4,0.0,0.0,0.000000,85,19.1,16.8,1001.8,1011.7,0.0,0.00,1533,18.4,19.2,0.0,4.8,6.283185,92,20.4,17.8,1003.8,1011.9,0.0,0.00,2000,19.2,272.0
3,18.8,0.0,1.0,5.061455,90,19.5,17.1,1002.3,1012.2,0.0,0.00,1191,17.7,19.3,0.0,3.8,5.934119,90,20.1,17.6,1004.3,1012.4,0.0,0.00,1924,19.0,274.0
4,18.9,0.0,0.8,5.934119,88,19.1,16.8,1002.2,1012.1,0.0,0.00,1226,18.9,19.3,0.0,3.2,6.283185,88,19.6,17.2,1004.5,1012.6,0.0,0.00,2000,18.5,263.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,21.3,0.0,2.8,4.363323,72,18.2,16.0,999.6,1009.5,0.1,0.04,2000,20.2,22.9,0.0,2.9,4.363323,73,20.2,17.7,1000.9,1008.8,0.0,0.02,5000,22.8,270.0
380,20.7,0.0,3.0,4.363323,75,18.3,16.1,1000.6,1010.5,0.0,0.00,2000,19.6,22.7,0.0,3.9,4.712389,78,21.4,18.6,1001.7,1009.7,0.0,0.00,3983,22.2,250.0
381,20.5,0.0,3.6,4.363323,77,18.5,16.3,1001.5,1011.4,0.0,0.00,2000,19.4,22.7,0.0,2.9,4.712389,84,23.1,19.8,1002.3,1010.3,0.0,0.00,2676,22.1,278.0
382,19.7,0.0,2.9,4.712389,75,17.2,15.1,1002.1,1012.0,0.0,0.00,2000,18.7,23.0,0.0,2.4,4.712389,86,24.1,20.5,1002.6,1010.5,0.0,0.00,1946,22.2,


In [170]:
# Fill the remaining missing values with scikit-learn's KNNImputer (default
# settings), then wrap the returned array back into a DataFrame.
# NOTE(review): the features are passed in unscaled, so large-magnitude columns
# presumably dominate the neighbour distance -- confirm this is acceptable.
imputer = KNNImputer()
# tmp's own index is kept so that the index-aligned assignment back into
# `data` still matches rows if that index is ever not 0..n-1.
tmp = pd.DataFrame(imputer.fit_transform(tmp), columns=tmp.columns, index=tmp.index)

In [171]:
# Copy the imputed travel-time column back into `data`
# (the assignment aligns on index labels).
data['소요시간'] = tmp['소요시간']

In [172]:
# Count the missing values per column after imputation.
data.isna().sum()

일시              0
서울 기온(°C)       0
서울 강수량(mm)      0
서울 풍속(m/s)      0
서울 풍향(16방위)     0
서울 습도(%)        0
서울 증기압(hPa)     0
서울 이슬점온도(°C)    0
서울 현지기압(hPa)    0
서울 해면기압(hPa)    0
서울 일조(hr)       0
서울 일사(MJ/m2)    0
서울 시정(10m)      0
서울 지면온도(°C)     0
부산 기온(°C)       0
부산 강수량(mm)      0
부산 풍속(m/s)      0
부산 풍향(16방위)     0
부산 습도(%)        0
부산 증기압(hPa)     0
부산 이슬점온도(°C)    0
부산 현지기압(hPa)    0
부산 해면기압(hPa)    0
부산 일조(hr)       0
부산 일사(MJ/m2)    0
부산 시정(10m)      0
부산 지면온도(°C)     0
소요시간            0
dtype: int64

In [173]:
# Write the final dataset to CSV, leaving the index out of the file.
data.to_csv('data.csv', index=False)