# 데이터 전처리

## 라이브러리 선언

In [1]:
import numpy as np
import pandas as pd

## 함수 선언

In [2]:
def clean_up_data(origin_df: pd.DataFrame, start_date=None, end_date=None) -> pd.DataFrame:
    """
    Strip the unneeded rows and columns from a raw weather dataframe.

    origin_df: raw dataframe; it is not modified
    start_date: '일자' index label (MM-DD) of the first row to keep;
                None keeps everything from the top
    end_date: '일자' index label (MM-DD) of the last row to keep;
              None keeps everything through the bottom
    returns: the cleaned dataframe, indexed by '일자'
    """

    # drop() hands back a new frame, so the caller's dataframe stays untouched.
    trimmed = origin_df.drop(columns=['지점', '지점명'])
    filled = trimmed.fillna(0.0)

    # Shorten every '일자' value through detach_year before it becomes the index.
    indexed = detach_year(filled).set_index('일자')

    # Label-based slice; both end labels are included in the result.
    return indexed.loc[start_date:end_date]

In [3]:
def detach_year(origin_df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the year prefix from every value in the '일자' column.

    origin_df: source dataframe (Korean column names); it is not modified
    returns: a copy of origin_df whose '일자' values keep only the characters
             from position 5 onward (e.g. '2019-01-02' -> '01-02')
    """

    detached_df = origin_df.copy()

    # Rewrite the whole column at once instead of writing back row by row
    # through index labels: a label-based write touches every row that shares
    # the label, which corrupts the result when the index contains duplicates.
    detached_df['일자'] = detached_df['일자'].map(lambda date: date[5:])

    return detached_df

In [4]:
def detach_month(origin_df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce every value in the 'date' column to its last two characters.

    For an 'MM-DD' string this leaves only the day part (e.g. '01-05' -> '05').

    origin_df: source dataframe (English column names); it is not modified
    returns: a copy of origin_df with the shortened 'date' values
    """

    detached_df = origin_df.copy()

    # Rewrite the whole column at once instead of writing back row by row
    # through index labels: a label-based write touches every row that shares
    # the label, which corrupts the result when the index contains duplicates.
    detached_df['date'] = detached_df['date'].map(lambda date: date[-2:])

    return detached_df

In [5]:
def rename_kor_to_eng(origin_df: pd.DataFrame) -> pd.DataFrame:
    """
    Translate the Korean column names into English.

    origin_df: source dataframe; it is not modified
    returns: a new dataframe holding only the translated columns, with the
             index named 'date'

    The output keeps 'temp' first and lists the remaining English names
    alphabetically after it.
    """

    # Korean -> English column names; the first entry stays the leading column.
    translations = {
        '평균기온': 'temp',
        '강수량': 'rain',
        '평균풍속': 'wind',
        '최심신적설': 'snow',
        '평균전운량': 'cloud',
    }

    leading, *remaining = translations.values()
    column_order = [leading, *sorted(remaining)]

    # Work on a copy so the rename and the index name never reach the caller's frame.
    renamed_df = origin_df.copy().rename(columns=translations)
    renamed_df.index.name = 'date'

    # reindex both orders the columns and discards any that were not translated.
    return renamed_df.reindex(columns=column_order)

## 데이터 전처리

In [22]:
# Read the target year from standard input; it is interpolated into the CSV file names used by the cells below.
year = input()

In [23]:
# Load the raw weather CSV for the entered year (UTF-8 encoded) from the original_data directory.
original_weather = pd.read_csv(f'original_data/WEATHER_{year}.csv', encoding='utf-8')

In [24]:
# No date bounds are passed, so start_date and end_date stay None and every row is kept.
weather = clean_up_data(original_weather)

In [25]:
# Build the English-labelled version of the cleaned table.
weather_en = rename_kor_to_eng(weather)

In [26]:
# Write the English-labelled table to weather_<year>.csv, then show it as the cell output.
weather_en.to_csv(f'weather_{year}.csv')
weather_en

Unnamed: 0_level_0,temp,cloud,rain,snow,wind
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01-01,-4.2,3.9,0.0,0.0,2.0
01-02,-5.0,0.0,0.0,0.0,2.6
01-03,-5.6,2.9,0.0,0.0,2.0
01-04,-3.5,4.8,0.0,0.0,1.7
01-05,-5.5,2.6,0.0,0.0,2.9
...,...,...,...,...,...
12-27,-7.6,3.1,0.0,0.0,1.7
12-28,-4.1,4.1,0.0,0.0,2.2
12-29,0.4,5.1,0.2,0.3,2.6
12-30,-3.9,2.0,0.0,0.0,3.3


## 테스트 코드

In [None]:
def score_weather(df_weather: pd.DataFrame) -> None:
    """
    Add a '날씨기준' column to df_weather in place.

    Each row receives the first score whose condition holds:
        4: '최심신적설' > 0.0
        3: '강수량' >= 3.0
        2: '평균전운량' >= 6.0
        1: none of the above

    df_weather: dataframe with the Korean weather columns; modified in place
    returns: None; a confirmation message is printed once scoring is done
    """
    # np.select picks the first matching condition, so the list order sets
    # the priority between the rules.
    conditions = [
        (df_weather['최심신적설'] > 0.0).to_numpy(),
        (df_weather['강수량'] >= 3.0).to_numpy(),
        (df_weather['평균전운량'] >= 6.0).to_numpy(),
    ]

    # Assigning a plain array writes by position, so rows that share an index
    # label are scored independently (label-based row writes would overwrite
    # every row carrying the same label).
    df_weather['날씨기준'] = np.select(conditions, [4, 3, 2], default=1)

    print('날씨기준이 설정되었습니다.')

In [None]:
# Score the Korean-labelled table in place; score_weather returns None and prints a confirmation.
score_weather(weather)

날씨기준이 설정되었습니다.


In [None]:
# Display the table to inspect the added '날씨기준' column.
weather

Unnamed: 0_level_0,평균기온,강수량,평균풍속,최심신적설,평균전운량,날씨기준
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-05-01,16.4,0.0,2.3,0.0,1.9,1
2019-05-02,17.1,0.0,2.3,0.0,0.0,1
2019-05-03,17.9,0.0,1.6,0.0,0.0,1
2019-05-04,19.5,0.0,1.7,0.0,4.9,1
2019-05-05,19.2,0.0,2.2,0.0,5.1,1
...,...,...,...,...,...,...
2019-12-27,-1.7,0.0,2.1,0.0,0.0,1
2019-12-28,1.1,0.0,2.0,0.0,3.5,1
2019-12-29,3.8,1.4,2.9,0.0,9.0,2
2019-12-30,2.7,0.4,2.9,0.0,7.5,2


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=aeda0379-ef07-4599-92da-f5608bf4c48d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>