# AgriWeather.ipynb

- 데이터 중에서 농업기상에 관련된 데이터를 수집하고, 전처리하는 파일


In [1]:
import sys
import os
import time
from glob import glob
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.3f}'.format

# DC
import requests
from bs4 import BeautifulSoup

# about file format
import xmltodict


# Add ../Import to the module search path so the two imports below resolve
# (presumably `auth` and `definitions` live there — confirm against the repo).
sys.path.append("../Import")
# 'auth' is included in gitignore.
from auth import authkey
from definitions import crop_list, crop_list2
# Rebind `authkey` from the imported container to the single entry stored
# under "FmlandWthrInfo_dec"; from here on `authkey` is that one value.
authkey = authkey["FmlandWthrInfo_dec"]

# Base directories, relative to the notebook's working directory.
root_path = "../"
data_path = f"{root_path}data/"

# FmlandWthrInfo

- 작물명  →  (지역코드(AREA_ID), 작물특성코드(PA_CROP_SPE_ID))  ⇒  FmlandWthrInfo

## 기상청_작물별 농업주산지 상세날씨 조회서비스

- 주의사항 : 

- Data Source : [기상청_작물별 농업주산지 상세날씨 조회서비스 - 일통계조회](https://www.data.go.kr/data/15059518/openapi.do)

- Collection Method : API_REST

- Data Format : JSON+XML

- 참고문서 : 작물별_농업주산지_상세날씨_조회서비스__지역코드_22.05.xlsx
    - 지역코드와 작물특성코드 목록

### 데이터 설명

- 농작물(36종) 주산지의 현재 날씨 및 동네예보 정보와 과거날씨 통계(일별, 순별, 월별) 정보 제공

### 요청변수(Request Parameter)

|항목명(국문)|항목명(영문)|항목크기|항목구분|샘플데이터|항목설명|
|---|---|---|---|---|---|
|서비스키|ServiceKey|4|필수|-|공공데이터포털에서 받은 인증키|
|페이지 번호|pageNo|4|필수|1|페이지번호|
|한 페이지 결과 수|numOfRows|4|필수|10|한 페이지 결과 수|
|응답자료형식|dataType|4|옵션|XML|요청자료형식(XML/JSON)|
|시작 연월일|ST_YMD|10|필수|20161201|일통계 시작 날짜(YYYYMMDD)|
|종료 연월일|ED_YMD|10|필수|20161201|일통계 종료 날짜(YYYYMMDD)|
|지역 아이디|AREA_ID|10|필수|4122000000|지역 아이디(활용가이드 하단첨부 참고)|
|작물별 특성 아이디|PA_CROP_SPE_ID|10|필수|PA130201|주산지 작물별 특성 아이디(활용가이드 하단첨부 참고)|


### 출력결과(Response Element)

|항목명(국문)|항목명(영문)|항목크기|항목구분|샘플데이터|항목설명|
|---|---|---|---|---|---|
|결과코드|resultCode|2|필수|0|결과코드|
|결과메시지|resultMsg|50|필수|OK|결과메시지|
|한 페이지 결과 수|numOfRows|4|필수|10|한 페이지 결과 수|
|페이지 번호|pageNo|4|필수|1|페이지번호|
|전체 결과 수|totalCount|4|필수|3|전체 결과 수|
|데이터 타입|dataType|4|필수|XML|응답자료형식 (XML/JSON)|
|연월일|ymd|10|필수|20161201|일통계 날짜|
|지역 아이디|areaId|10|필수|4827000001|지역 아이디|
|지역 이름|areaName|50|필수|평택|지역 이름|
|작물 명|paCropName|20|필수|무|주산지 작물 이름|
|작물별 특성아이디|paCropSpeId|10|필수|PA020101|작물별 특성 아이디|
|작물별 특성 이름|paCropSpeName|20|필수|봄|주산지 작물별 특성 이름|
|일 평균기온|dayAvgTa|5|필수|24.5|일 평균기온|
|일 최고기온|dayMaxTa|5|필수|30|일 최고기온|
|일 최저기온|dayMinTa|5|필수|22|일 최저기온|
|일 평균상대습도|dayAvgRhm|5|필수|85|일 평균상대습도|
|일 최저상대습도|dayMinRhm|5|필수|60|일 최저상대습도|
|일 강수량|daySumRn|5|필수|25|일 강수량|
|일 평균풍속|dayAvgWs|5|필수|4|일 평균풍속|
|일 누적일조시간|daySumSs|5|필수|4|일 누적일조시간|
|특보 발효 여부|wrnCount|4|필수|1|과거특보 발효여부(0:없음, 1:있음)|
|특보 코드|wrnCd|8|필수|W2|특보 코드|


# 작물명을 넣으면 지역코드, 작물특성코드 반환하는 함수

- 작물명  →  (지역코드(AREA_ID), 작물특성코드(PA_CROP_SPE_ID))  ⇒  FmlandWthrInfo

## 문제(?)사항

1. 같은 작물에 복수의 작물_특성이 존재함.
    - 작물이 '호박'인 경우에 작물_특성으로 ['애호박', '쥬키니']가 있음.
2. 같은 작물 또는 같은 작물_특성에서 복수의 지역이 존재함.

- 위의 두 이유로 기상데이터를 어떻게 산출(계산?)하는 것이 예측에 좋을지 고민해보아야함. 

### 문제 해결 방안

- 문제1

    1. 모든 작물_특성을 사용.


- 문제2

    1. 모든 지역의 기상 데이터를 평균으로 계산
    2. 모든 지역 중 생산량 상위 n개 또는 n%를 평균 또는 가중치를 부여하여 계산


### 우리팀의 이번 프로젝트에서 선택한 방안과 그 이유

- 선택한 방안

- 선택한 이유

## 지역코드 목록

In [2]:
# Load the region-code reference workbook
# (작물별_농업주산지_상세날씨_조회서비스__지역코드_22.05.xlsx).
code_book_path = f"{data_path}작물별_농업주산지_상세날씨_조회서비스__지역코드_22.05.xlsx"
df_code_raw = pd.read_excel(code_book_path)

# info() prints its own report and returns None, so this line also prints "None".
print(df_code_raw.info())

# Numeric summary, object-column summary, then a random sample of 10 rows.
for preview in (
    df_code_raw.describe(),
    df_code_raw.describe(include="object"),
    df_code_raw.sample(10),
):
    display(preview)

# Distinct crop names present in the workbook.
display(df_code_raw["주산지_작물명"].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681 entries, 0 to 680
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   지역_아이디         681 non-null    int64 
 1   지역_이름          681 non-null    object
 2   주산지_작물_특성_아이디  681 non-null    object
 3   주산지_작물_특성_이름   681 non-null    object
 4   주산지_작물명        681 non-null    object
dtypes: int64(1), object(4)
memory usage: 26.7+ KB
None


Unnamed: 0,지역_아이디
count,681.0
mean,4131752619.677
std,775162282.803
min,2611000000.0
25%,4146100000.0
50%,4415000000.0
75%,4688000000.0
max,5013000000.0


Unnamed: 0,지역_이름,주산지_작물_특성_아이디,주산지_작물_특성_이름,주산지_작물명
count,681,681,681,681
unique,152,62,31,36
top,동구,PA200101,-,배추
freq,16,36,237,63


Unnamed: 0,지역_아이디,지역_이름,주산지_작물_특성_아이디,주산지_작물_특성_이름,주산지_작물명
566,2671000000,기장군,PA080301,가을,당근
5,4282000000,고성,PA360101,애호박,호박
450,4617000000,나주,PA130401,가을,무
152,2726000000,수성구,PA240101,봄,양배추
209,4136000000,남양주,PA220101,-,시금치
361,2626000000,동래구,PA150101,-,방울토마토
277,2632000000,북구,PA170201,봄,배추
613,4477000000,서천,PA050101,-,깻잎
227,4277000000,정선,PA200101,-,사과
585,4673000000,구례,PA070101,-,단감


array(['호박', '풋고추', '포도', '팥', '콩', '참다래', '참깨', '쪽파', '오이', '열무',
       '얼갈이배추', '양파', '양배추', '쌀', '시금치', '생강', '사과', '붉은고추', '복숭아', '배추',
       '배', '방울토마토', '미나리', '무', '마늘', '땅콩', '들깨', '대파', '당근', '단감', '녹두',
       '깻잎', '고구마', '건고추', '감자', '감귤'], dtype=object)

### 탐색

In [3]:
# The '지역_아이디' column is stored as int, so check whether any row has an ID
# whose digit count differs from the others.
# If the check prints True, every ID is 10 characters long and no padding is needed.

df = df_code_raw.copy()

area_ids = df["지역_아이디"]
is_ten_chars = area_ids.astype(str).str.len() == 10
all_id_count = len(area_ids.unique())
ten_char_id_count = len(df[is_ten_chars]["지역_아이디"].unique())
print(all_id_count == ten_char_id_count)

True


In [4]:
# Explore the IDs registered for a single crop.
target_crop = "양배추"
target_rows = df[df["주산지_작물명"] == target_crop]

# Area IDs selected with an equality filter ...
print(target_rows["지역_아이디"].unique())
print()
# ... and with isin(), which should list the same IDs.
print(df[df["주산지_작물명"].isin([target_crop])]["지역_아이디"].unique())
print()
# Crop-characteristic IDs registered for the crop.
print(target_rows["주산지_작물_특성_아이디"].unique())

[2711000000 2714000000 2717000000 2720000000 2723000000 2726000000
 2729000000 2771000000 4272000000 4276000000 4277000000 4421000000
 4677000000 4680000000 4682000000 4684000000 4690000000 4719000000
 4775000000 4776000000 4825000000 4827000000 5011000000 5013000000]

[2711000000 2714000000 2717000000 2720000000 2723000000 2726000000
 2729000000 2771000000 4272000000 4276000000 4277000000 4421000000
 4677000000 4680000000 4682000000 4684000000 4690000000 4719000000
 4775000000 4776000000 4825000000 4827000000 5011000000 5013000000]

['PA240101' 'PA240201' 'PA240301' 'PA240401']


### df_code_filtered : crop_list에 있는 품목만 데이터프레임으로 만들기

In [6]:
# Keep only the rows whose crop is in crop_list, and only the three columns
# needed later to look up API codes.
# Filter df_code_raw directly: the previous version filtered `df`, a leftover
# variable from the exploration cells, so the result depended on which cells
# had been run (and in what order) before this one.
code_columns = ["지역_아이디", "주산지_작물_특성_아이디", "주산지_작물명"]
is_target_crop = df_code_raw["주산지_작물명"].isin(crop_list)
df_code_filtered = df_code_raw.loc[is_target_crop, code_columns].copy()
display(df_code_filtered)

Unnamed: 0,지역_아이디,주산지_작물_특성_아이디,주산지_작물명
0,4211000000,PA360101,호박
1,4211000000,PA360201,호박
2,4272000000,PA360101,호박
3,4272000000,PA360201,호박
4,4278000000,PA360201,호박
...,...,...,...
673,4827000000,PA020301,감자
674,4887000000,PA020101,감자
675,4888000000,PA020101,감자
676,5011000000,PA020301,감자


In [7]:
# Write the filtered code table to FmlandWthrInfo_code.csv in the data directory.
file_name = "FmlandWthrInfo_code.csv"
code_csv_path = f"{data_path}{file_name}"
df_code_filtered.to_csv(code_csv_path, index=False)

# Confirm the file now appears in the data directory listing.
saved_names = os.listdir(data_path)
print(file_name in saved_names)

True


### fnc : 주산지_작물명 -> (지역_아이디, 주산지_작물_특성_아이디)

In [8]:
def get_code_list(crop_name, file_path=f"../data/FmlandWthrInfo_code.csv"):
    """Return the code pairs registered for one crop, one pair per area.

    Args:
        crop_name: Value to match exactly against the '주산지_작물명' column.
        file_path: CSV file holding the code table.

    Returns:
        A list of tuples built from the first two columns of the CSV (in the
        file written by this notebook: area ID, crop-characteristic ID).
        Rows are de-duplicated on '지역_아이디', so only the first row of each
        area is kept. The list is empty when the crop is not in the file.
    """
    code_table = pd.read_csv(file_path, index_col=False)
    matches_crop = code_table["주산지_작물명"] == crop_name
    one_row_per_area = code_table[matches_crop].drop_duplicates(subset=["지역_아이디"])
    # Only the first two columns make up each returned pair.
    leading_columns = one_row_per_area.iloc[:, :2]
    return list(leading_columns.itertuples(index=False, name=None))
# Quick check: show the code pairs returned for "마늘" from the default CSV path.
get_code_list("마늘")

[(4215000000, 'PA120101'),
 (4217051000, 'PA120101'),
 (4221051000, 'PA120101'),
 (4223000000, 'PA120101'),
 (4282000000, 'PA120101'),
 (4283032000, 'PA120101'),
 (4380000000, 'PA120101'),
 (4421000000, 'PA120101'),
 (4427000000, 'PA120201'),
 (4481000000, 'PA120301'),
 (4482500000, 'PA120101'),
 (4677000000, 'PA120301'),
 (4682000000, 'PA120301'),
 (4684000000, 'PA120301'),
 (4686000000, 'PA120301'),
 (4689000000, 'PA120301'),
 (4691000000, 'PA120301'),
 (4723000000, 'PA120201'),
 (4772000000, 'PA120101'),
 (4773000000, 'PA120101'),
 (4783000000, 'PA120201'),
 (4824000000, 'PA120301'),
 (4874000000, 'PA120201'),
 (4884000000, 'PA120301'),
 (4889000000, 'PA120201'),
 (5011000000, 'PA120301'),
 (5013000000, 'PA120301')]

# 지역코드 → 농업기상정보

- ? → <u>**지역코드 → 농업기상정보**</u>


## API test

In [9]:
# Smoke test of the getDayStatistics endpoint: request a single day for one
# area/crop pair and inspect the fields of the first returned <item>.
url = "http://apis.data.go.kr/1360000/FmlandWthrInfoService/getDayStatistics"
params = {
    "serviceKey": authkey,
    "pageNo": "1",
    "numOfRows": "10",
    "dataType": "XML",
    "ST_YMD": "20161201",
    "ED_YMD": "20161201",
    "AREA_ID": "4122000000",
    "PA_CROP_SPE_ID": "PA130201",
}

# Readable label for each response field, keyed by field name. Looking labels
# up by name (instead of zipping two lists positionally) keeps the mapping
# right even if the API returns the elements in a different order. The labels
# match the ones used by the preprocessing step of this notebook.
column_labels = {
    "areaId": "Area_ID",
    "areaName": "Area_Name",
    "dayAvgRhm": "DayAvg_RelativeHumidity",
    "dayAvgTa": "DayAvg_Temperature",
    "dayAvgWs": "DayAvg_WindSpeed",
    "dayMaxTa": "DayMax_Temperature",
    "dayMinRhm": "DayMin_RelativeHumidity",
    "dayMinTa": "DayMin_Temperature",
    "daySumRn": "DaySum_Rainfall",
    "daySumSs": "DaySum_Sunshine",
    "paCropName": "Crop_Name",
    "paCropSpeId": "CropSpecific_ID",
    "paCropSpeName": "CropSpecific_Name",
    "wrnCd": "Warning_Code",
    "wrnCount": "Warning_Count",
    "ymd": "YMD",
}

time.sleep(0.2)
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, params=params, timeout=30)

items = []
if response.status_code == 200:
    items = BeautifulSoup(response.text.encode("utf-8"), "xml").find_all("item")
    if not items:
        # HTTP 200 but nothing to inspect; say so instead of raising IndexError.
        print("No <item> element in the response.")
else:
    print(response.status_code)

if items:
    soup = items[0]

    # soup to dict
    parsed_dict = xmltodict.parse(str(soup))

    # dict to pd.df, check pd.df
    sample_df = pd.DataFrame(parsed_dict).T
    display(sample_df.reset_index(drop=True))

    # check columns
    print(sample_df.columns.to_list())

    columns = list(parsed_dict["item"].keys())
    print(columns)

    # Empty per-column buckets, one list per response field.
    df_bl = {name: [] for name in columns}
    print(df_bl)

    # set rename_dict, rename_columns (fields without a known label keep their name)
    rename_dict = {name: column_labels.get(name, name) for name in columns}
    rename_columns = list(rename_dict.values())
    pprint(rename_dict)

Unnamed: 0,areaId,areaName,dayAvgRhm,dayAvgTa,dayAvgWs,dayMaxTa,dayMinRhm,dayMinTa,daySumRn,daySumSs,paCropName,paCropSpeId,paCropSpeName,wrnCd,wrnCount,ymd
0,4122000000,평택,59,5,3,9,19,0,0,0,무,PA130201,봄,,0,2016-12-01 00:00:00


['areaId', 'areaName', 'dayAvgRhm', 'dayAvgTa', 'dayAvgWs', 'dayMaxTa', 'dayMinRhm', 'dayMinTa', 'daySumRn', 'daySumSs', 'paCropName', 'paCropSpeId', 'paCropSpeName', 'wrnCd', 'wrnCount', 'ymd']
['areaId', 'areaName', 'dayAvgRhm', 'dayAvgTa', 'dayAvgWs', 'dayMaxTa', 'dayMinRhm', 'dayMinTa', 'daySumRn', 'daySumSs', 'paCropName', 'paCropSpeId', 'paCropSpeName', 'wrnCd', 'wrnCount', 'ymd']
{'areaId': [], 'areaName': [], 'dayAvgRhm': [], 'dayAvgTa': [], 'dayAvgWs': [], 'dayMaxTa': [], 'dayMinRhm': [], 'dayMinTa': [], 'daySumRn': [], 'daySumSs': [], 'paCropName': [], 'paCropSpeId': [], 'paCropSpeName': [], 'wrnCd': [], 'wrnCount': [], 'ymd': []}
{'areaId': 'Area_ID',
 'areaName': 'Area_Name',
 'dayAvgRhm': 'DayAvg_RelativeHumidity',
 'dayAvgTa': 'DayAvg_Temperature',
 'dayAvgWs': 'dayAvg_WindSpeed',
 'dayMaxTa': 'DayMax_Temperature',
 'dayMinRhm': 'DayMin_RelativeHumidity',
 'dayMinTa': 'DayMin_Temperature',
 'daySumRn': 'DaySum_Rainfall',
 'daySumSs': 'daySum_Sunshine',
 'paCropName': 'Cro

## Fnc

Get_AgriWeather_Page

Get_AgriWeather_All

### Get_AgriWeather_Page

In [10]:
def Get_AgriWeather_Page(
    authkey: str,
    pageNo: int | str,
    search_Year: int,
    AREA_ID: int | str,
    PA_CROP_SPE_ID: str,
    timeout: float | None = 30,
) -> pd.DataFrame | int | None:
    """Fetch one page of daily agricultural-weather statistics from the API.

    The request covers January 1 to December 31 of ``search_Year`` for the
    given area and crop-characteristic pair, with 100 rows per page.

    Args:
        authkey: Service key sent as ``serviceKey``.
        pageNo: Page number to request.
        search_Year: Year used to build ``ST_YMD`` / ``ED_YMD``.
        AREA_ID: Area ID sent as ``AREA_ID``.
        PA_CROP_SPE_ID: Crop-characteristic ID sent as ``PA_CROP_SPE_ID``.
        timeout: Seconds to wait for the server before ``requests`` gives up;
            ``None`` waits indefinitely.

    Returns:
        - A DataFrame with one row per ``<item>`` and the fixed column order
          below; values are the raw element texts, and a field missing from an
          item is left as NaN.
        - ``None`` when the response has status 200 but contains no ``<item>``.
        - The HTTP status code (int) when the status is not 200; the code is
          also printed.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            timeout expires.
    """

    url = "http://apis.data.go.kr/1360000/FmlandWthrInfoService/getDayStatistics"
    rows_per_page = 100
    params = {
        "serviceKey": authkey,
        "pageNo": pageNo,
        "numOfRows": rows_per_page,
        "dataType": "XML",
        "ST_YMD": f"{search_Year}0101",
        "ED_YMD": f"{search_Year}1231",
        "AREA_ID": AREA_ID,
        "PA_CROP_SPE_ID": PA_CROP_SPE_ID,
    }

    # Response fields to extract, in output column order.
    col = [
        "ymd",
        "areaId",
        "areaName",
        "paCropName",
        "paCropSpeId",
        "paCropSpeName",
        "dayAvgTa",
        "dayMaxTa",
        "dayMinTa",
        "dayAvgRhm",
        "dayMinRhm",
        "daySumRn",
        "dayAvgWs",
        "daySumSs",
        "wrnCount",
        "wrnCd",
    ]

    # API
    time.sleep(0.01)
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(response.status_code)
        return response.status_code

    items = BeautifulSoup(response.text, "xml").find_all("item")
    if not items:
        return None

    # Collect plain dicts and build the frame once; concatenating a one-row
    # frame per item copies the accumulated frame on every iteration.
    rows = []
    for item in items[:rows_per_page]:
        row = {}
        for name in col:
            tag = item.find(name)
            if tag is not None:
                row[name] = tag.get_text()
        rows.append(row)

    return pd.DataFrame(rows, columns=col)


# # fnc test
# temp = Get_AgriWeather_Page(
#     authkey=authkey,
#     pageNo=4,
#     search_Year=2016,
#     AREA_ID="4122000000",
#     PA_CROP_SPE_ID="PA130201",
# )
# display(temp)

### Get_AgriWeather_All

In [11]:
def Get_AgriWeather_All(
    authkey: str,
    search_Year_Period: tuple,
    crop_name: str,
    code_file_path=f"../data/FmlandWthrInfo_code.csv",
) -> pd.DataFrame:
    """Collect daily agricultural-weather rows for one crop over a year range.

    Looks up the crop's code pairs with ``get_code_list`` and calls
    ``Get_AgriWeather_Page`` for every pair, every year in the period and
    pages 1 to 4.

    Args:
        authkey: Service key passed through to ``Get_AgriWeather_Page``.
        search_Year_Period: ``(first_year, last_year)``, both inclusive.
        crop_name: Crop name used to look up the code pairs.
        code_file_path: CSV file read by ``get_code_list``.

    Returns:
        All fetched pages concatenated in request order. Each page keeps its
        own row index, so index values repeat. An empty DataFrame is returned
        when nothing was fetched.
    """

    code_list = get_code_list(crop_name=crop_name, file_path=code_file_path)
    first_year, last_year = search_Year_Period[0], search_Year_Period[1]

    frames = []
    for area_id, crop_spe_id in code_list:
        for year in range(first_year, last_year + 1):
            # Get_AgriWeather_Page requests 100 rows per page, so four pages
            # are enough for the at most 366 daily rows of one year.
            for page in range(1, 5):
                page_df = Get_AgriWeather_Page(
                    authkey=authkey,
                    pageNo=page,
                    search_Year=year,
                    AREA_ID=area_id,
                    PA_CROP_SPE_ID=crop_spe_id,
                )
                # The page function returns None for an empty page and an int
                # status code on an HTTP error (which it prints itself). Only
                # real frames can be concatenated; an int used to crash concat.
                if isinstance(page_df, pd.DataFrame):
                    frames.append(page_df)

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the result for every page.
    return pd.concat(frames)


# # test
# temp = Get_AgriWeather_All(authkey=authkey, search_Year_Period=(2018, 2018), crop_name="마늘")
# display(temp)

## API로 데이터 불러와서 CSV로 저장하기

In [22]:
# Download the data crop by crop and year by year, writing one raw CSV per
# (crop, year) so that finished years are already on disk if a later request fails.
file_path = f"{data_path}AgriWeather/"
# Create the output directory up front; to_csv does not create missing folders.
os.makedirs(file_path, exist_ok=True)
for crop in crop_list2:
    print(crop)
    for y in range(2005, 2021):
        print(y)
        df = Get_AgriWeather_All(authkey=authkey, search_Year_Period=(y, y), crop_name=crop)
        df.to_csv(f"{file_path}AgriWeather_{crop}_{y}_raw.csv", index=False)

마늘
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
깻잎
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
감자
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
고구마
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [23]:
# Show which raw CSV files exist for each crop.
for crop in crop_list2:
    crop_pattern = f"{file_path}/AgriWeather_{crop}*"
    files = glob(crop_pattern)
    print(files)

['../data/AgriWeather\\AgriWeather_마늘_2005_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2006_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2007_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2008_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2009_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2010_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2011_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2012_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2013_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2014_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2015_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2016_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2017_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2018_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2019_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2020_raw.csv']
['../data/AgriWeather\\AgriWeather_깻잎_2005_raw.csv', '../data/AgriWeather\\AgriWeather_깻잎_2006_raw.csv', '../data/AgriWeather\\AgriWeather_깻잎_2007_raw.csv', '../data/A

# AgriWeather 전처리

## 저장한 csv파일 목록 불러오고 DF로 만들고 병합하기.

In [24]:
file_path = f"{data_path}AgriWeather/"

# Read every per-crop, per-year raw CSV and remember which files were read.
frames = []
merged_files = []
for crop in crop_list2:
    files = glob(f"{file_path}/AgriWeather_{crop}*")
    print(files)
    for file in files:
        frames.append(pd.read_csv(f'{file}', index_col=False))
        merged_files.append(file)

# Concatenate once; fall back to an empty frame when no file matched.
df_raw = pd.concat(frames) if frames else pd.DataFrame()
df_raw.to_csv(f"{file_path}AgriWeather_raw.csv")

# Remove the per-year files that are no longer needed. This runs only after
# the merged CSV has been written, so a failure while reading or merging no
# longer destroys source files that have not been saved anywhere else.
for file in merged_files:
    os.remove(file)

display(df_raw)

['../data/AgriWeather\\AgriWeather_마늘_2005_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2006_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2007_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2008_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2009_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2010_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2011_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2012_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2013_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2014_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2015_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2016_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2017_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2018_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2019_raw.csv', '../data/AgriWeather\\AgriWeather_마늘_2020_raw.csv']
['../data/AgriWeather\\AgriWeather_깻잎_2005_raw.csv', '../data/AgriWeather\\AgriWeather_깻잎_2006_raw.csv', '../data/AgriWeather\\AgriWeather_깻잎_2007_raw.csv', '../data/A

Unnamed: 0,ymd,areaId,areaName,paCropName,paCropSpeId,paCropSpeName,dayAvgTa,dayMaxTa,dayMinTa,dayAvgRhm,dayMinRhm,daySumRn,dayAvgWs,daySumSs,wrnCount,wrnCd
0,2005-01-01 00:00:00,4482500000,태안,마늘,PA120101,한지형,-1,1,-4,0,0,0,1,0,0,
1,2005-01-02 00:00:00,4482500000,태안,마늘,PA120101,한지형,2,5,-1,0,0,0,2,0,0,
2,2005-01-03 00:00:00,4482500000,태안,마늘,PA120101,한지형,4,7,0,0,0,0,4,0,0,
3,2005-01-04 00:00:00,4482500000,태안,마늘,PA120101,한지형,-2,0,-4,0,0,0,3,0,0,
4,2005-01-05 00:00:00,4482500000,태안,마늘,PA120101,한지형,-2,0,-7,0,0,0,1,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6436,2020-12-27 00:00:00,4888000000,거창,고구마,PA040101,밤,0,4,-3,90,61,3,0,0,0,
6437,2020-12-28 00:00:00,4888000000,거창,고구마,PA040101,밤,1,11,-1,93,57,0,0,4,0,
6438,2020-12-29 00:00:00,4888000000,거창,고구마,PA040101,밤,2,11,-2,84,50,0,1,2,0,
6439,2020-12-30 00:00:00,4888000000,거창,고구마,PA040101,밤,-6,0,-10,51,37,0,4,8,0,


## 전처리

In [35]:
df = df_raw.copy()

# Strip the " 00:00:00" time part from the date strings (literal match, not a regex).
df["ymd"] = df["ymd"].str.replace(" 00:00:00", "", regex=False)

# rename_col
rename_dict = {
    "ymd": "YMD",
    "areaId": "Area_ID",
    "areaName": "Area_Name",
    "paCropName": "Crop_Name",
    "paCropSpeId": "CropSpecific_ID",
    "paCropSpeName": "CropSpecific_Name",
    "dayAvgTa": "DayAvg_Temperature",
    "dayMaxTa": "DayMax_Temperature",
    "dayMinTa": "DayMin_Temperature",
    "dayAvgRhm": "DayAvg_RelativeHumidity",
    "dayMinRhm": "DayMin_RelativeHumidity",
    "daySumRn": "DaySum_Rainfall",
    "dayAvgWs": "DayAvg_WindSpeed",
    "daySumSs": "DaySum_Sunshine",
    "wrnCount": "Warning_Count",
    "wrnCd": "Warning_Code",
}
df = df[list(rename_dict)].rename(columns=rename_dict)


# reorder_col (this also drops the area and crop-characteristic columns)
col_list = [
    "YMD",
    "Crop_Name",
    "DayAvg_Temperature",
    "DayMax_Temperature",
    "DayMin_Temperature",
    "DayAvg_RelativeHumidity",
    "DayMin_RelativeHumidity",
    "DaySum_Rainfall",
    "DayAvg_WindSpeed",
    "DaySum_Sunshine",
    "Warning_Count",
    "Warning_Code",
]
df = df[col_list]


# Declare column dtypes. The mapping is deliberately not named `dict`:
# that would shadow the builtin for every cell run afterwards.
dtype_map = {
    "DayAvg_Temperature": "float",
    "DayMax_Temperature": "float",
    "DayMin_Temperature": "float",
    "DayAvg_RelativeHumidity": "float",
    "DayMin_RelativeHumidity": "float",
    "DaySum_Rainfall": "float",
    "DayAvg_WindSpeed": "float",
    "DaySum_Sunshine": "float",
    "Warning_Count": "int",
}
df = df.astype(dtype_map)


# Derived column: daily temperature range
df["DayDiff_Temperature"] = (
    df["DayMax_Temperature"] - df["DayMin_Temperature"]
)


# Columns to return
col_list = [
    "YMD",
    "Crop_Name",
    "DayAvg_Temperature",
    "DayDiff_Temperature",
    # "DayMax_Temperature",
    # "DayMin_Temperature",
    "DayAvg_RelativeHumidity",
    # "DayMin_RelativeHumidity",
    "DaySum_Rainfall",
    "DayAvg_WindSpeed",
    "DaySum_Sunshine",
    "Warning_Count",
    # "Warning_Code",
]


# Average over all areas, per date and crop.
# Only the numeric columns are aggregated: the grouping keys are strings, and
# asking for their mean raises TypeError on pandas 2.x (older versions
# silently dropped them), so they are excluded from the selection.
group_keys = ["YMD", "Crop_Name"]
value_columns = [name for name in col_list if name not in group_keys]
df = df.groupby(group_keys)[value_columns].mean()

display(df.reset_index())

Unnamed: 0,YMD,Crop_Name,DayAvg_Temperature,DayDiff_Temperature,DayAvg_RelativeHumidity,DaySum_Rainfall,DayAvg_WindSpeed,DaySum_Sunshine,Warning_Count
0,2005-01-01,감자,-3.750,9.250,44.000,0.750,2.250,3.500,0.000
1,2005-01-01,고구마,-3.000,8.000,0.000,0.000,2.000,0.000,0.000
2,2005-01-01,깻잎,-3.500,8.000,29.000,0.000,1.500,4.000,0.000
3,2005-01-01,마늘,-1.200,7.200,31.400,1.400,2.200,2.800,0.000
4,2005-01-02,감자,-0.250,10.750,48.250,0.000,1.750,0.500,0.000
...,...,...,...,...,...,...,...,...,...
23363,2020-12-30,마늘,-4.057,7.229,60.143,0.714,4.057,1.857,0.857
23364,2020-12-31,감자,-6.077,9.731,62.192,0.308,1.808,3.692,0.615
23365,2020-12-31,고구마,-5.222,9.222,66.944,0.333,1.667,3.611,0.556
23366,2020-12-31,깻잎,-5.154,10.077,55.385,0.000,2.231,5.923,0.154


## 전처리 끝낸 데이터프레임을 CSV로 저장하기.

In [37]:
# Persist the preprocessed frame, then drop the intermediate merged raw file.
file_path = f"{data_path}AgriWeather/"
final_csv_path = f"{file_path}AgriWeather.csv"
raw_csv_path = f"{file_path}AgriWeather_raw.csv"

df.to_csv(final_csv_path)
# Remove the file that is no longer needed.
os.remove(raw_csv_path)