# Pandas
> 데이터 과학자를 위해 테이블형태로 데이터를 다룰 수 있게 해주는 패키지 (python용 엑셀)  
기존 데이터처리 라이브러리인 numpy 대신 주로 사용  
일반인이 데이터분석을 접하기 쉽게 만들어준 결정적인 라이브러리  
pandas만으로도 충분히 데이터 분석이 가능할 정도로 고수준의 함수들을 내장  
앞으로 진행하는 데이터분석 과정에서 주로 사용하게 될 데이터구조  

In [None]:
# 패키지, 라이브러리, 모듈
# 불러들여 쓸 수 있는 특정 기능을 위한 프로그램

## pandas 설치
> 콘솔창에서 실행 시  
**pip install pandas**
    
> 주피터 노트북으로 실행 시  
 **!pip install pandas**
    
> 아나콘다 환경으로 python 환경설정 시 기본적으로 설치가 되어있음

In [2]:
!say 안녕하세요 시리입니다.

In [1]:
# pandas 설치
!pip install pandas



In [79]:
# 필요 모듈 import
import pandas as pd # 판다스 패키지를 불러들여서 pd라는 약자(닉네임)로 사용하겠다.

In [None]:
# 오픈소스 프로그래밍 환경
# 남이 만든 코드 -> 자유롭게 가져와서 쓸 수 있습니다.
# 내가 만든 코드 -> 배포도 자유롭습니다.

In [None]:
# 이런것도 할 수 있어요~
# 텍스트를 음성으로 변환시켜주는 패키지입니다.
# 사용하기 전 패키지를 다운받는 과정이 필요합니다.
# !pip install pyttsx3
import pyttsx3
engine = pyttsx3.init()
engine.say("수강생여러분.")
engine.say("파이썬 공부하느라 고생하십니다.")
engine.say("파이썬으로 이런것도 가능해요")
engine.say("하지만 이해못해도 괜챦아요.")
engine.say("왜냐하면 우리는 가져다 쓸꺼니까요. 찡긋")
engine.runAndWait()

In [9]:
# 질문주셨던 컴퓨터의 시간을 받아와서 사용하는 방법
from datetime import datetime
datetime.now()

datetime.datetime(2023, 9, 19, 16, 32, 12, 529491)

## DataFrame 데이터 불러오기
> 엑셀에 익숙한 사용자를 위해 제작 된 테이블형태의 데이터 구조  
다양한 형태의 데이터를 받아 사용할 수 있으며 다양한 통계, 시각화 함수를 제공한다.  
실제 데이터를 불러들이고 값을 확인 해 보며 기본적인 pandas 사용법을 익혀보도록 하겠습니다.

### 데이터 불러오기
pandas는 다양한 데이터 파일 형태를 지원하며 주로 csv, xlsx, sql을 사용한다.
    
> **`read_csv()`**  
**`read_excel()`**  
**`read_sql()`**  
**`read_json()`**  

### csv 파일 로딩

In [10]:
import pandas as pd

In [13]:
pwd

'/Users/byun/0_lecture/06_samsung_preprocessing/2023_preprocessing_17'

In [80]:
# DataFrame 의 약자로서 형식적으로 df 변수명을 사용한다.
# pandas패키지의 read_csv() 함수를 사용하여 energy1.csv 파일을 불러들여 
# 데이터프레임을 만들고 df 이름의 변수로 저장
df = pd.read_csv('./data/energy1.csv', encoding='cp949')
# 입력값은 경로를 포함한 파일명을 문자열로 전달
# 인코딩(파일을 저장하고 읽어들이는 방식): 파이썬 기본값은 'utf-8', 윈도우에서 저장한 한글 파일은 주로 'cp949'
# 'utf-8-sig', 'utf-16' 으로도 열리지 않으면 --> 제조사 콜 --> 개발자 콜

In [16]:
df

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0


### excel 파일 로딩

In [17]:
# 만약 모듈을 찾을 수 없는 오류가 발생한다면 추가 모듈 설치 (예: !pip install openpyxl)
pd.read_excel('./data/energy1.xlsx')
# sheet_name=시트명 문자열로 전달
# engine=엑셀 파일을 읽을 때 사용할 엔진 (예: 과거 .xls 형식은 'xlrd')
# header=컬럼명으로 사용할 행의 인덱스 전달

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0,0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0,0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0,0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0,0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1,1
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1,1
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1,1
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1,1


### json 파일 로딩

In [18]:
pd.read_json('./data/005930.json')

Unnamed: 0,symbolCode,date,tradePrice,tradeTime,change,changePrice,changeRate,prevClosingPrice,exchangeCountry,openingPrice,highPrice,lowPrice,accTradePrice,accTradeVolume,periodTradePrice,periodTradeVolume,listedSharesCount
0,A005930,2023-02-03 15:30:19,63800,15:30:19,RISE,300,0.004724,63500,KOREA,63900,64000,63000,942509594000,14804617,942509594000,14804617,
1,A005930,2023-02-02 15:30:07,63500,15:30:07,RISE,1700,0.027508,61800,KOREA,63200,63900,62600,1474629229812,23285983,1474629229812,23285983,
2,A005930,2023-02-01 15:30:23,61800,15:30:23,RISE,800,0.013115,61000,KOREA,62600,62700,61000,1145781815984,18570133,1145781815984,18570133,
3,A005930,2023-01-31 15:30:07,61000,15:30:07,FALL,2300,-0.036335,63300,KOREA,63500,63700,61000,1835768640685,29746731,1835768640685,29746731,
4,A005930,2023-01-30 15:30:19,63300,15:30:19,FALL,1300,-0.020124,64600,KOREA,64900,64900,63100,1337025734920,20995234,1337025734920,20995234,
5,A005930,2023-01-27 15:30:08,64600,15:30:08,RISE,700,0.010955,63900,KOREA,64400,65000,63900,1212764792491,18760182,1212764792491,18760182,
6,A005930,2023-01-26 15:30:14,63900,15:30:14,RISE,500,0.007886,63400,KOREA,63800,63900,63300,846408637700,13278277,846408637700,13278277,
7,A005930,2023-01-25 15:30:24,63400,15:30:24,RISE,1600,0.02589,61800,KOREA,63500,63700,63000,1066200962700,16822710,1066200962700,16822710,
8,A005930,2023-01-20 15:30:22,61800,15:30:22,RISE,300,0.004878,61500,KOREA,62100,62300,61100,595372614900,9646327,595372614900,9646327,
9,A005930,2023-01-19 15:30:25,61500,15:30:25,RISE,1100,0.018212,60400,KOREA,60500,61500,60400,781937546636,12808490,781937546636,12808490,


### API를 활용하여 웹에서 수집한 데이터 로딩

In [19]:
import requests  # HTTP client: drives the web protocol from Python
import json  # converts JSON text into Python objects

# Daily-quote endpoint for symbol A005930 (page 1, 10 rows per page).
url = 'https://finance.daum.net/api/quote/A005930/days?symbolCode=A005930&page=1&perPage=10&pagination=true'
# Headers sent with the request; the values imitate a browser visiting the
# quote page (presumably the server rejects requests without them).
info = {
    'referer': 'https://finance.daum.net/quotes/A005930?period=day',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
}

# A timeout keeps the cell from hanging forever if the server never answers.
resp = requests.get(url, headers=info, timeout=10)
# Stop here with a clear HTTP error instead of a confusing JSON/KeyError below.
resp.raise_for_status()
data = json.loads(resp.text)
# The quote rows are stored under the 'data' key of the decoded payload.
pd.DataFrame(data['data'])

Unnamed: 0,symbolCode,date,tradePrice,tradeTime,change,changePrice,changeRate,prevClosingPrice,exchangeCountry,openingPrice,highPrice,lowPrice,accTradePrice,accTradeVolume,periodTradePrice,periodTradeVolume,listedSharesCount
0,A005930,2023-09-19 15:30:22,69800.0,15:30:22,FALL,400.0,-0.005698,70200.0,KOREA,70400.0,70800.0,69600.0,818246870200,11679486,818246870200,11679486,
1,A005930,2023-09-18 15:30:11,70200.0,15:30:11,FALL,1800.0,-0.025,72000.0,KOREA,71300.0,71700.0,70200.0,1134188613800,16040727,1134188613800,16040727,
2,A005930,2023-09-15 15:30:27,72000.0,15:30:27,RISE,300.0,0.004184,71700.0,KOREA,71700.0,72300.0,71400.0,1283667032959,17823512,1283667032959,17823512,
3,A005930,2023-09-14 15:30:27,71700.0,15:30:27,RISE,800.0,0.011283,70900.0,KOREA,71200.0,71800.0,70800.0,1504488795100,21041407,1504488795100,21041407,
4,A005930,2023-09-13 15:30:03,70900.0,15:30:03,RISE,400.0,0.005674,70500.0,KOREA,71100.0,71600.0,70300.0,1131860397902,15955797,1131860397902,15955797,
5,A005930,2023-09-12 15:30:14,70500.0,15:30:14,FALL,300.0,-0.004237,70800.0,KOREA,70800.0,71000.0,70400.0,825295634500,11688599,825295634500,11688599,
6,A005930,2023-09-11 15:30:11,70800.0,15:30:11,RISE,500.0,0.007112,70300.0,KOREA,70400.0,70800.0,70000.0,830034512852,11785462,830034512852,11785462,
7,A005930,2023-09-08 15:30:24,70300.0,15:30:24,FALL,100.0,-0.00142,70400.0,KOREA,70200.0,70300.0,69600.0,747653334264,10688118,747653334264,10688118,
8,A005930,2023-09-07 15:30:27,70400.0,15:30:27,RISE,400.0,0.005714,70000.0,KOREA,70000.0,70600.0,69600.0,964326666248,13741241,964326666248,13741241,
9,A005930,2023-09-06 15:30:09,70000.0,15:30:09,FALL,700.0,-0.009901,70700.0,KOREA,70700.0,70800.0,69700.0,799882288000,11414620,799882288000,11414620,


### 데이터베이스에서 쿼리를 사용한 데이터 로딩

In [None]:
# 참고! 실습은 하지 않습니다만 쿼리를 사용하여 데이터베이스로부터 데이터프레임을 만드는 것도 가능합니다.
# 데이터베이스로 부터 자료 읽기

# 필요한 모듈 추가 설치 - 각 데이터베이스 별로 다릅니다.
# !pip install pymysql

# sql 모듈 로드하기
# import pymysql
# mysql, mariadb, sqlite, postgresql, ms-sql, oracle, mongodb

# 접속하기
# 접속방법 또한 DB 종류에 따라 다릅니다.
# con = pymysql.connect(host='db서버주소', port=3306, user='id', passwd='pwd', db='dbname')

# query 만들기
# query = 'select * from samples'

# 자료 불러오기
# data = pd.read_sql(query, con=con)

### 암호화 엑셀 파일 로딩

In [None]:
# 나스카가 걸려있어도 엑셀파일 오픈(O)
# 엑셀매크로를 사용하여 엑셀프로그램을 오픈 암호화 걸린 데이터의 클립보드 복사 한 내용을 데이터프레임으로 저장

In [None]:
# pip uninstall xlwings # 파이썬으로 엑셀제어
# pip install appscript # 파이썬으로 매크로
# pip install xlwings
import pandas as pd
import xlwings as xw

book = xw.Book('./data/energy1.xlsx') # 엑셀파일을 변수화 (엑셀프로그램 오픈)
sheet1 = book.sheets[0] # 시트번호로 선택
# sheet2 = book.sheets['시트명'] # 시트이름으로 선택
sheet1.used_range # 데이터입력 전 영역선택
sheet1.used_range.options(pd.DataFrame, index=False).value

# 로컬컴퓨터에 엑셀이 있어야 함.

## 데이터 저장하기
불러들인 혹은 작업을 마친 데이터프레임을 다양한 파일형태로 저장이 가능합니다.  
데이터분석 과정은 원본데이터를 되도록이면 유지하며 전처리를 진행하지만 주기적으로 백업은 진행하는 것이 좋습니다.  
> **`to_csv()`**  
**`to_excel()`**  

In [22]:
# 데이터 저장
df.to_csv('./data/save_test.csv', index=False, encoding='cp949')
# index=False 데이터프레임의 인덱스를 파일에 저장하지 않는 옵션

In [23]:
# 엑셀은 오래걸립니다~
df.to_excel('./data/save_test.xlsx')

## 사용 데이터 간략 설명
> 한국에너지관리공단에서 제공한 전력사용량 데이터  
1시간 간격으로 수집 된 60개 건물들의 2020년 6월 1일 부터 2020년 8월 24일까지의 데이터  
건물정보와 기후정보를 활용한 전력사용량을 예측하기 위한 데이터셋  
대회홈페이지 : https://dacon.io/competitions/official/235736/overview/description
>> 각 변수(컬럼) 설명  
>>- num : 건물번호  
>>- date_time : 데이터가 수집 된 날짜, 시간  
>>- 전력사용량 : 수집 된 시점에 사용한 전력량  
>>- 비전기냉방설비운영 : 0-미운영, 1-운영  
>>- 태양광보유 : 0-미보유, 1-보유

In [24]:
df.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


## 데이터 살펴보기

In [25]:
# 데이터를 불러들인 후 가장 처음 하는 작업
# 데이터의 구조, 형태 파악하기
# 데이터의 첫 5개 샘플(row, 행, 샘플, 관측치, 인스턴스, 벡터) 확인하기
df.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [26]:
# 데이터의 마지막 5개 샘플 확인하기
df.tail()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0
122399,60,2020-08-24 23,3204.576,27.1,2.6,75.0,0.0,0.0,1.0,1.0


In [27]:
# 데이터의 갯수를 살펴봅니다
len(df)

122400

In [28]:
# 데이터 shape 확인, 행렬데이터의 사이즈, 행과 열 기준으로 데이터의 갯수 4 x 5
df.shape
# 속성값으로 확인하는 데이터의 모양

(122400, 10)

In [29]:
# 데이터의 전반적인 정보를 확인합니다.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122400 entries, 0 to 122399
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   num         122400 non-null  int64  
 1   date_time   122400 non-null  object 
 2   전력사용량(kWh)  122400 non-null  float64
 3   기온(°C)      122400 non-null  float64
 4   풍속(m/s)     122400 non-null  float64
 5   습도(%)       122400 non-null  float64
 6   강수량(mm)     122400 non-null  float64
 7   일조(hr)      122400 non-null  float64
 8   비전기냉방설비운영   122400 non-null  float64
 9   태양광보유       122400 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 9.3+ MB


In [30]:
# 데이터의 기초통계량을 확인합니다.
df.describe()

Unnamed: 0,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
count,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0
mean,30.5,2324.830866,24.251713,2.151641,80.169848,0.514989,0.213533,0.683333,0.483333
std,17.318173,2058.999326,3.407902,1.514475,15.525862,2.624505,0.370517,0.465178,0.499724
min,1.0,0.0,11.1,0.0,19.0,0.0,0.0,0.0,0.0
25%,15.75,1055.268,21.8,1.1,70.0,0.0,0.0,0.0,0.0
50%,30.5,1700.352,24.2,1.9,84.0,0.0,0.0,1.0,0.0
75%,45.25,2780.487,26.5,2.9,93.0,0.0,0.3,1.0,1.0
max,60.0,17739.225,36.3,20.1,100.0,81.5,1.0,1.0,1.0


In [31]:
# Column (variable) names of the DataFrame
df.columns

Index(['num', 'date_time', '전력사용량(kWh)', '기온(°C)', '풍속(m/s)', '습도(%)',
       '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유'],
      dtype='object')

데이터셋을 살펴 본 결과 시간별로 관측 된 기후데이터와 전력사용량을 확인 할 수 있었습니다.  
각각의 관측치(샘플)에 대한 정보를 유추하고, 건물별로 수집 된 데이터의 특징을 볼 수 있었습니다.  
데이터의 크기, 사이즈, 기초통계량을 바탕으로 조금 더 디테일하게 데이터를 살펴보겠습니다.

## 데이터접근 (인덱싱, 슬라이싱, 샘플링)

In [4]:
[1, 2, 3, 4]
'하세요안녕'

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [None]:
# 갑분싸 선형대수학
# 스칼라, 벡터, 매트릭스, 텐서
# (1, 1), (10, 1) 혹은 (1, 10), (4, 5), (4, 5, 3), (4, 4, 3, 4)

In [7]:
# 인덱싱
df.iloc[[1611]]
# 판다스에서 벡터데이터를 Series라는 자료구조로 따로 구분
# 행렬(매트릭스) --> 데이터프레임으로 표현

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
1611,1,2020-08-07 03,8470.656,22.7,2.0,92.0,0.0,0.0,0.0,0.0


In [10]:
# 첫 샘플 혹은 레코드, 인스턴스, 데이터포인트 에 대한 데이터를 살펴보겠습니다.
# 인덱스넘버로 데이터에 접근하는 .iloc[색인]
df.iloc[1611].values

array([1, '2020-08-07 03', 8470.655999999999, 22.7, 2.0, 92.0, 0.0, 0.0,
       0.0, 0.0], dtype=object)

In [11]:
# 10번 인덱스 부터 20번 인덱스 샘플
df.iloc[10:21]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
10,1,2020-06-01 10,8116.2,20.5,3.4,62.0,0.0,1.0,0.0,0.0
11,1,2020-06-01 11,8104.536,22.1,3.6,52.0,0.0,1.0,0.0,0.0
12,1,2020-06-01 12,8088.984,23.1,4.0,49.0,0.0,1.0,0.0,0.0
13,1,2020-06-01 13,8102.592,23.1,5.1,42.0,0.0,1.0,0.0,0.0
14,1,2020-06-01 14,8088.336,23.6,5.1,39.0,0.0,1.0,0.0,0.0
15,1,2020-06-01 15,8076.672,23.8,5.5,40.0,0.0,1.0,0.0,0.0
16,1,2020-06-01 16,8032.608,24.4,3.0,39.0,0.0,1.0,0.0,0.0
17,1,2020-06-01 17,8013.816,23.6,4.4,41.0,0.0,1.0,0.0,0.0
18,1,2020-06-01 18,8029.368,22.7,4.1,42.0,0.0,1.0,0.0,0.0
19,1,2020-06-01 19,8028.072,21.3,4.2,44.0,0.0,1.0,0.0,0.0


In [12]:
# 여러개의 관측치에 접근
df.iloc[[0, 10, 20]]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
10,1,2020-06-01 10,8116.2,20.5,3.4,62.0,0.0,1.0,0.0,0.0
20,1,2020-06-01 20,7994.376,19.8,2.5,51.0,0.0,0.4,0.0,0.0


In [14]:
# 변수 단위 샘플 접근
# row, 행, 샘플, 관측치, 인스턴스, 포인트
# columns, 열, 특징, 변수, feature
df['전력사용량(kWh)']

0         8179.056
1         8135.640
2         8107.128
3         8048.808
4         8043.624
            ...   
122395    4114.368
122396    3975.696
122397    3572.208
122398    3299.184
122399    3204.576
Name: 전력사용량(kWh), Length: 122400, dtype: float64

In [15]:
# 여러 컬럼 동시 접근
df[['date_time', '전력사용량(kWh)']]

Unnamed: 0,date_time,전력사용량(kWh)
0,2020-06-01 00,8179.056
1,2020-06-01 01,8135.640
2,2020-06-01 02,8107.128
3,2020-06-01 03,8048.808
4,2020-06-01 04,8043.624
...,...,...
122395,2020-08-24 19,4114.368
122396,2020-08-24 20,3975.696
122397,2020-08-24 21,3572.208
122398,2020-08-24 22,3299.184


In [18]:
# row와 columns을 동시에 슬라이싱 하는 속성
# df.loc[인덱스, 컬럼명]
# df.iloc[무조건 숫자형태만 전달 가능]
df.loc[10:20, '전력사용량(kWh)']
# df.loc[관측치, 변수 기준 슬라이싱 코드 전달]

10    8116.200
11    8104.536
12    8088.984
13    8102.592
14    8088.336
15    8076.672
16    8032.608
17    8013.816
18    8029.368
19    8028.072
20    7994.376
Name: 전력사용량(kWh), dtype: float64

## 팬시인덱싱 전달
기본적인 인덱싱 방법에 추가로 조건에 따른 데이터 샘플링도 가능합니다.  
넘파이의 bool 타입데이터를 인덱스로 전달받는 방법으로 조건에 따른 데이터를 선별한다면 조금 더 고차원적인 데이터 선택이 가능합니다.

In [21]:
# 조건식을 인덱스로 받는 팬시인덱싱
# 건물 번호 기준으로 한 건물에 해당하는 관측치 선별
df.loc[df['num'] == 1]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2035,1,2020-08-24 19,8714.952,29.4,3.4,66.0,0.0,0.2,0.0,0.0
2036,1,2020-08-24 20,8740.224,28.7,1.9,69.0,0.0,0.0,0.0,0.0
2037,1,2020-08-24 21,8730.504,28.3,1.1,71.0,0.0,0.0,0.0,0.0
2038,1,2020-08-24 22,8725.968,28.3,2.4,72.0,0.0,0.0,0.0,0.0


In [24]:
# 전력 사용량 기준으로 15000kWh 이상 사용한 관측치 
df.loc[df['전력사용량(kWh)'] >= 15000]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
14459,8,2020-06-08 11,15177.411,27.9,1.2,53.0,0.0,1.0,1.0,1.0
14464,8,2020-06-08 16,15152.454,31.0,1.5,40.0,0.0,1.0,1.0,1.0
14482,8,2020-06-09 10,15139.800,26.7,0.9,50.0,0.0,1.0,1.0,1.0
14483,8,2020-06-09 11,15417.936,28.7,1.8,45.0,0.0,1.0,1.0,1.0
14485,8,2020-06-09 13,15177.420,30.7,1.4,36.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
16309,8,2020-08-24 13,17423.793,31.5,2.0,56.0,0.0,1.0,1.0,1.0
16310,8,2020-08-24 14,17458.137,31.4,1.9,55.0,0.0,1.0,1.0,1.0
16311,8,2020-08-24 15,17230.671,32.1,2.1,56.0,0.0,1.0,1.0,1.0
16312,8,2020-08-24 16,17300.313,32.3,1.6,54.0,0.0,1.0,1.0,1.0


In [27]:
# 해당시간 데이터보다 나중에 발생한 관측치 선택 
df.loc[df['date_time'] > '2020-07-31 23']

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
1464,1,2020-08-01 00,8623.584,25.0,3.4,90.0,1.0,0.0,0.0,0.0
1465,1,2020-08-01 01,8591.184,24.6,0.5,90.0,0.2,0.0,0.0,0.0
1466,1,2020-08-01 02,8582.112,24.6,0.5,91.0,0.0,0.0,0.0,0.0
1467,1,2020-08-01 03,8593.776,24.7,1.2,93.0,0.0,0.0,0.0,0.0
1468,1,2020-08-01 04,8575.632,24.7,0.8,94.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0


In [34]:
# or 연산 | 
# and 연산 &
# 고온다습한 기후에서 측정 된 관측치를 선별 기온 30이상, 습도 80이상
df.loc[(df['기온(°C)'] >= 30) & (df['습도(%)'] >= 80)]

cond1 = (df['기온(°C)'] >= 30)
cond2 = (df['습도(%)'] >= 80)
df.loc[cond1 & cond2]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
18064,9,2020-08-12 16,1308.312,30.0,1.2,83.0,0.0,0.2,0.0,1.0
28238,14,2020-08-11 14,2634.660,30.4,4.9,81.0,0.3,0.3,1.0,1.0
32152,16,2020-08-04 16,1909.008,30.0,2.4,81.0,0.0,0.2,1.0,1.0
32172,16,2020-08-05 12,2010.420,30.4,4.0,81.0,0.0,0.8,1.0,1.0
32362,16,2020-08-13 10,2016.252,30.4,1.7,83.0,0.0,0.4,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
116051,57,2020-08-15 11,2235.168,30.1,3.1,81.0,0.0,0.9,1.0,0.0
116075,57,2020-08-16 11,2255.040,30.6,3.0,81.0,0.0,1.0,1.0,0.0
116082,57,2020-08-16 18,2365.632,30.1,3.2,82.0,0.0,0.5,1.0,0.0
116107,57,2020-08-17 19,2464.992,30.3,0.9,82.0,0.0,0.3,1.0,0.0


In [38]:
# or조건 건물번호가 10, 20번인 데이터를 선별
cond1 = (df['num'] == 10)
cond2 = (df['num'] == 20)
df.loc[cond1 | cond2]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
18360,10,2020-06-01 00,417.960,18.3,2.5,94.0,0.0,0.0,1.0,0.0
18361,10,2020-06-01 01,407.592,18.8,2.3,89.0,0.0,0.0,1.0,0.0
18362,10,2020-06-01 02,404.028,19.0,3.2,86.0,0.0,0.0,1.0,0.0
18363,10,2020-06-01 03,397.548,19.0,4.1,85.0,0.0,0.0,1.0,0.0
18364,10,2020-06-01 04,397.224,18.9,3.9,87.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
40795,20,2020-08-24 19,2590.596,29.4,3.4,66.0,0.0,0.2,0.0,0.0
40796,20,2020-08-24 20,2592.216,28.7,1.9,69.0,0.0,0.0,0.0,0.0
40797,20,2020-08-24 21,2477.034,28.3,1.1,71.0,0.0,0.0,0.0,0.0
40798,20,2020-08-24 22,2199.258,28.3,2.4,72.0,0.0,0.0,0.0,0.0


In [41]:
# 조건식이 아닌 특정 값 기준으로 데이터를 찾을 때
# 필터링 샘플링 하실 때 조건식보다 조금 더 편하실 수 있음
df.loc[df['num'].isin([1, 3, 5, 7, 9])]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
18355,9,2020-08-24 19,1321.920,28.2,0.6,77.0,0.0,0.3,0.0,1.0
18356,9,2020-08-24 20,1299.240,27.0,1.1,84.0,0.0,0.0,0.0,1.0
18357,9,2020-08-24 21,1286.280,26.8,0.8,85.0,0.0,0.0,0.0,1.0
18358,9,2020-08-24 22,1295.676,25.1,0.8,92.0,0.0,0.0,0.0,1.0


In [44]:
# contains로 전달하는 문자열이 포함되면 True
df.loc[df['date_time'].str.contains('08-01 08')]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
1472,1,2020-08-01 08,8576.28,24.6,1.2,96.0,0.1,0.0,0.0,0.0
3512,2,2020-08-01 08,1243.836,26.1,3.3,86.0,0.0,0.3,1.0,0.0
5552,3,2020-08-01 08,3854.3688,26.0,3.0,94.0,0.0,0.0,1.0,1.0
7592,4,2020-08-01 08,569.808,25.9,0.6,92.0,0.0,0.0,1.0,1.0
9632,5,2020-08-01 08,2511.216,26.1,3.3,86.0,0.0,0.3,1.0,0.0
11672,6,2020-08-01 08,691.74,26.0,3.0,94.0,0.0,0.0,0.0,0.0
13712,7,2020-08-01 08,962.361,24.6,1.2,96.0,0.1,0.0,1.0,0.0
15752,8,2020-08-01 08,5774.688,25.9,0.4,93.0,0.0,0.1,1.0,1.0
17792,9,2020-08-01 08,1247.508,24.5,0.6,100.0,0.0,0.0,0.0,1.0
19832,10,2020-08-01 08,877.716,26.1,3.3,86.0,0.0,0.3,1.0,0.0


In [46]:
# 랜덤한 샘플의 수 혹은 비율에 따른 선별 방법
df.sample(frac=0.05)
# 숫자(n)를 전달하면 해당 갯수만큼 선별, frac=비율을 전달하면 전체 데이터 중 해당 비율만큼 선별

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
116875,58,2020-06-25 19,958.2300,22.2,2.3,83.0,0.0,0.0,1.0,0.0
57106,28,2020-08-24 10,2999.1600,29.5,3.7,76.0,0.0,1.0,1.0,0.0
52180,26,2020-07-20 04,2287.9125,25.3,1.7,95.0,0.1,0.0,0.0,1.0
51790,26,2020-07-03 22,2304.2880,18.6,0.2,96.0,1.7,0.0,0.0,1.0
97386,48,2020-08-02 18,413.2260,23.0,3.1,96.0,0.7,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
3978,2,2020-08-20 18,1475.1720,29.9,2.3,63.0,0.0,1.0,1.0,0.0
108474,54,2020-06-15 18,6487.7760,25.4,2.8,38.0,0.0,1.0,1.0,0.0
3935,2,2020-08-18 23,1105.8120,27.8,2.7,81.0,0.0,0.0,1.0,0.0
114984,57,2020-07-02 00,2811.4560,20.9,0.1,87.0,0.0,0.0,1.0,0.0


## 집계값 계산

In [55]:
# 전력사용량 기준 집계값 계산
# 평균, 합, 최대값, 최소값, 중앙값, 분산
print(df['전력사용량(kWh)'].mean())
print(df['전력사용량(kWh)'].max())
print(df['전력사용량(kWh)'].min())
print(df['전력사용량(kWh)'].median())
print(df['전력사용량(kWh)'].std()) # 표준편차
print(df['전력사용량(kWh)'].var()) # 분산
print(df['전력사용량(kWh)'].argmax()) # 최대값이 있는 인덱스 출력
print(df['전력사용량(kWh)'].argmin()) # 최소값이 있는 인덱스 출력

2324.830865868444
17739.225
0.0
1700.352
2058.999325845112
4239478.2238306245
16166
54684


In [56]:
df.iloc[[54684]]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
54684,27,2020-08-08 12,0.0,23.0,2.1,88.0,0.3,0.0,1.0,1.0


In [60]:
# 연속형(실수형) 에 적용 시 의미있는 결과
# 컬럼 값 기준 가장 큰, 작은 값을 가진 샘플 확인 nlargest
df['전력사용량(kWh)'].nlargest(10) # 내림차순 정렬
# 컬럼 값 기준으로 가장 작은 값 nsmallest
df['전력사용량(kWh)'].nsmallest(10)

54684      0.000
54685      0.000
54686      0.000
54687      0.000
54688      0.000
55044     85.320
6343     138.240
55018    146.880
6463     152.496
6319     152.928
Name: 전력사용량(kWh), dtype: float64

In [66]:
# 범주형(카테고리컬)
print(df['비전기냉방설비운영'].unique()) # 중복값을 제외한 고윳값만을 출력
print(df['비전기냉방설비운영'].nunique()) # 중복값을 제외한 고윳값 갯수 출력
df['비전기냉방설비운영'].value_counts() # 변수의 고윳값을 기준으로 데이터의 갯수를 카운트하고 내림차순 정렬

[0. 1.]
2


1.0    83640
0.0    38760
Name: 비전기냉방설비운영, dtype: int64

## 데이터 재구조화
기존의 데이터 샘플링과는 달리 기준점으로 생각할 수 있는 컬럼 값을 기준으로 새롭게 데이터 프레임을 생성하며  
평균, 합, 카운트 등을 통해 데이터를 다차원적으로 분석 할 수 있는 함수를 제공합니다.
> 기준 변수(컬럼)가 한개 일 경우
>> **`df`**.**`groupby('컬럼명')`**.**`agg(집계방법)`**  

> 기준 변수(컬럼)가 두개 이상일 경우  
>>  **`pd`**.**`pivot_table(data=데이터프레임명, index=기준컬럼1, columns=기준컬럼2, values=집계데이터, aggfunc=집계방법)`**  

>> 적용가능한 통계 함수

| 함수명 | 내용 |
|-|-|
| count | 갯수 |
| sum | 합 |
| mean | 평균 |
| median | 중앙값 |
| var, std | 분산, 표준편차 |
| min, max | 최소값, 최대값 |
| unique, nunique | 고윳값, 고윳값 갯수 |
| prod | 곱 |

### groupby

In [74]:
df.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [75]:
# groupby - 적용데이터가 카테고리컬 데이터가 좋다
df.groupby('num', as_index=False).mean(numeric_only=True)
df.groupby('num', as_index=False).agg({'전력사용량(kWh)':['mean', 'max'],
                                       '기온(°C)':['mean', 'median']})

Unnamed: 0_level_0,num,전력사용량(kWh),전력사용량(kWh),기온(°C),기온(°C)
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,median
0,1,8543.273488,8827.704,24.608578,24.6
1,2,1299.914365,2165.292,23.623333,23.4
2,3,3371.353699,3957.4575,24.241275,24.3
3,4,763.997353,2385.504,25.143627,24.8
4,5,2714.509694,3890.16,23.623333,23.4
5,6,1184.129868,3028.32,24.241275,24.3
6,7,1527.138356,2833.461,24.608578,24.6
7,8,8837.364073,17739.225,24.124363,24.1
8,9,1256.88488,1352.592,24.243284,23.9
9,10,1427.164069,3186.864,23.623333,23.4


### pivot_table

In [76]:
df.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [78]:
# pivot_table 그룹화 시킬 데이터가 카테고리컬 데이터이면 좋음
# 그룹화 시켜야 할 변수가 여러개일 때 - 인덱스, 컬럼 위치시킴
pd.pivot_table(data=df,
               index='비전기냉방설비운영',
               columns='태양광보유',
               values='전력사용량(kWh)',
               aggfunc='mean')

태양광보유,0.0,1.0
비전기냉방설비운영,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2077.030506,1582.298192
1.0,2101.99359,2949.726621


## 데이터로 부터 의미있는 정보 추출하기

In [85]:
# 건물번호 10번의 전력사용량 평균
df.loc[df['num'] == 10, '전력사용량(kWh)'].mean()

1427.1640686279413

In [89]:
# 평균 전력사용량이 높은 건물번호 순서대로 5개 나열
df.groupby('num').mean(numeric_only=True).nlargest(5, '전력사용량(kWh)')

Unnamed: 0_level_0,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8,8837.364073,24.124363,1.305735,79.162255,0.488922,0.199167,1.0,1.0
1,8543.273488,24.608578,2.34598,75.971569,0.493873,0.184608,0.0,0.0
38,7588.679824,24.608578,2.34598,75.971569,0.493873,0.184608,1.0,1.0
54,6839.836376,24.241275,1.937402,82.667157,0.540882,0.219657,1.0,0.0
31,5964.317576,24.608578,2.34598,75.971569,0.493873,0.184608,1.0,0.0


## 데이터프레임 병합
> 실제 분석업무를 진행하다보면 데이터가 여기저기 분산되어 있을 경우가 더 많습니다.  
조각난 데이터를 분석에 필요한 데이터셋으로 만들기 위해 데이터프레임 병합을 많이 사용합니다.  
두 개 이상의 데이터프레임을 병합 할 때 주로 사용하는 함수 2가지를 알아보겠습니다.    

### 데이터 병합에 사용가능한 key(병합할 기준이 되는 행 or 열)값이 있는경우
> **`pd`**.**`merge(베이스데이터프레임, 병합할데이터프레임)`**

>> 사용가능한 파라메터
>> - how : 'left', 'right', 'inner', 'outer'
>> - left_on : key값이 다를 경우 베이스데이터프레임의 key 설정
>> - right_on : key값이 다를 경우 병합데이터프레임의 key 설정    


In [90]:
# Korean and English scores for all twelve students.
# The dict's key order already determines the column order.
merge_df1 = pd.DataFrame(
    {
        '이름': ['원영', '사쿠라', '유리', '예나', '유진', '나코', '은비', '혜원', '히토미', '채원', '민주', '째욘'],
        '국어': [100, 70, 70, 70, 60, 90, 90, 70, 70, 80, 100, 100],
        '영어': [100, 90, 80, 50, 70, 100, 70, 90, 100, 100, 80, 100],
    }
)

# Japanese and math scores for seven of those students; '이름' is the column
# shared with merge_df1.
merge_df2 = pd.DataFrame(
    {
        '일어': [80, 100, 100, 90, 70, 50, 100],
        '수학': [90, 70, 100, 80, 70, 80, 90],
        '이름': ['원영', '사쿠라', '나코', '히토미', '예나', '은비', '째욘'],
    }
)

In [103]:
merge_df1

Unnamed: 0,이름,국어,영어
0,원영,100,100
1,사쿠라,70,90
2,유리,70,80
3,예나,70,50
4,유진,60,70
5,나코,90,100
6,은비,90,70
7,혜원,70,90
8,히토미,70,100
9,채원,80,100


In [105]:
# merge 테스트
df1 = pd.merge(merge_df1, merge_df2, how='left')
# 'inner' 양쪽 데이터프레임의 key값 기준 공통된 데이터가 있는 경우에만 데이터 병합 수행 and 조건으로 데이터 병합
# 'outer' 한쪽데이터프레임의 key값 기준으로 데이터가 존재하는 경우 모두 병합 수행 or조건으로 데이터 병합
# 'left' 먼저 전달하는 데이터프레임 기준으로 데이터프레임 병합
# 'right' 후에 전달하는 데이터프레임 기준으로 데이터프레임 병합

### 단순 데이터 연결
> **`pd`**.**`concat([베이스데이터프레임, 병합할데이터프레임], axis=0 or 1)`**  
현재 df에 저장되어있는 데이터에 추가로 데이터를 이어붙여보겠습니다.  
df1 이라는 변수에 이어붙일 데이터를 불러들여 병합을 진행해보겠습니다.  

In [108]:
df1 = pd.read_csv('./data/energy2.csv', encoding='cp949')
df1.head()

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,


In [110]:
df.tail()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0
122399,60,2020-08-24 23,3204.576,27.1,2.6,75.0,0.0,0.0,1.0,1.0


In [None]:
# concat병합은 데이터를 그대로 이어붙이기 때문에 변수 순서나 shape이 동일한지 확인 해야 합니다.
# 모두 참이어야 참

In [111]:
df.shape, df1.shape

((122400, 10), (10080, 9))

In [113]:
# 전력사용량 변수 동일 위치에 추가
df1.insert(2, '전력사용량(kWh)', 0) # 원본데이터에 바로 적용
df1.head()
# 데이터가 삽입될 인덱스 위치, 변수명, 삽입되는 데이터

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,0,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,0,,,,,,,
2,1,2020-08-25 02,0,,,,,,,
3,1,2020-08-25 03,0,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,0,,,,,,,


In [116]:
# 변수명 동일하게 저장
df1.rename({'강수량(mm, 6시간)':'강수량(mm)',
            '일조(hr, 3시간)':'일조(hr)'}, axis=1, inplace=True)
# axis=1 작업방향 설정 0, 1 0-관측치 기준 1-변수기준
# 뷰값을 확인한다 - 작업결과를 미리 확인하는 형태(원본값은 적용 X)
# inplace=True 뷰값으로 확인한 작업결과를 원본값에 적용하는 파라미터

In [117]:
# concat병합은 데이터를 그대로 이어붙이기 때문에 변수 순서나 shape이 동일한지 확인 해야 합니다.
# 모두 참이어야 참
df.shape, df1.shape

((122400, 10), (10080, 10))

In [122]:
(df.columns == df1.columns).all() # 모두 참이여야 참
# (df.columns == df1.columns).any() # 하나라도 참이면 참
# (df.columns == df1.columns).sum()

True

In [135]:
concat_df = pd.concat([df, df1], ignore_index=True)
# axis=0 행기준 병합(기본설정), axis=1 열기준 병합
# ignore_index=True

In [136]:
concat_df

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
132475,60,2020-08-31 19,0.000,,,,,,,
132476,60,2020-08-31 20,0.000,,,,,,,
132477,60,2020-08-31 21,0.000,27.9,4.1,68.0,,0.0,1.0,1.0
132478,60,2020-08-31 22,0.000,,,,,,,


## 인덱스 편집
> 방금 전 concat으로 병합한 데이터프레임의 이상한 점을 찾으셨나요?  
데이터 자체는 잘 붙였지만 인덱스가 꼬여있습니다.  
인덱스 조작은 데이터분석을 위해 필요한 인덱스를 설정하기 위해 필요합니다.

### 인덱스 초기화

In [130]:
# 기존에 엉켜있던 인덱스는 지우고 원본값을 변경하는 매개변수를 추가
concat_df.reset_index(drop=True, inplace=True)
# drop=True 기존 인덱스 버림

### 기존 변수값을 인덱스로 사용

In [132]:
concat_df.set_index('date_time')#, inplace=True)

Unnamed: 0_level_0,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-06-01 00,1,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
2020-06-01 01,1,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2020-06-01 02,1,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
2020-06-01 03,1,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
2020-06-01 04,1,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2020-08-31 19,60,0.000,,,,,,,
2020-08-31 20,60,0.000,,,,,,,
2020-08-31 21,60,0.000,27.9,4.1,68.0,,0.0,1.0,1.0
2020-08-31 22,60,0.000,,,,,,,


## 변수 편집
> 인덱스 조작과 마찬가지로 데이터프레임의 변수값을 변경해야 할 경우도 있습니다.  
데이터프레임은 변수단위 샘플링 및 인덱싱, 이름변경이 가능합니다.

### 변수명 변경

In [137]:
concat_df.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [140]:
# Run this right after loading the data.
# Strip the unit suffixes from the column names (the names stay in Korean).
concat_df.rename(
    columns={
        '전력사용량(kWh)': '전력사용량',
        '기온(°C)': '기온',
        '풍속(m/s)': '풍속',
        '습도(%)': '습도',
        '강수량(mm)': '강수량',
        '일조(hr)': '일조',
    },
    inplace=True,
)

In [141]:
# Write the renamed frame to disk; index=False keeps the row index out of the CSV
concat_df.to_csv('./data/concat_df.csv', index=False)

In [143]:
# Load the CSV written by the previous cell into df
df = pd.read_csv('./data/concat_df.csv')

### 파생변수 생성

In [147]:
# Derived column: the power-usage values multiplied by 1000
df['전력사용량2'] = df['전력사용량'].mul(1000)
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량2
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,8179056.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,8135640.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,8107128.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,8048808.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,8043624.0


In [150]:
# Derived column built from two existing columns: temperature minus wind speed
df['기온2'] = df['기온'].sub(df['풍속'])
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량2,기온2
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,8179056.0,15.1
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,8135640.0,14.8
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,8107128.0,14.3
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,8048808.0,13.9
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,8043624.0,13.7


In [152]:
# Boolean flag column: True where the temperature is 30 or higher
# (rows whose temperature is missing compare as False)
df['고온'] = df['기온'].ge(30)
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량2,기온2,고온
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,8179056.0,15.1,False
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,8135640.0,14.8,False
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,8107128.0,14.3,False
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,8048808.0,13.9,False
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,8043624.0,13.7,False


In [154]:
# Convert the Celsius temperature column to Fahrenheit (C * 9 / 5 + 32)
# and store the result in a new column named '화씨'
df['화씨'] = df['기온'].mul(9).div(5).add(32)
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량2,기온2,고온,화씨
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,8179056.0,15.1,False,63.68
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,8135640.0,14.8,False,63.86
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,8107128.0,14.3,False,63.5
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,8048808.0,13.9,False,62.78
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,8043624.0,13.7,False,62.6


### 변수삭제

In [155]:
# del removes the column from df directly; no reassignment is needed
del df['전력사용량2']
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,기온2,고온,화씨
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,15.1,False,63.68
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,14.8,False,63.86
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,14.3,False,63.5
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,13.9,False,62.78
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,13.7,False,62.6


In [156]:
# pop() removes the named column from df and returns it, so the removed
# data can be kept in a variable at the same time
pop_test = df.pop('화씨')
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,기온2,고온
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,15.1,False
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,14.8,False
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,14.3,False
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,13.9,False
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,13.7,False


In [158]:
# Put the popped column back; it is appended as the last column
df['화씨'] = pop_test
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,기온2,고온,화씨
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,15.1,False,63.68
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,14.8,False,63.86
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,14.3,False,63.5
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,13.9,False,62.78
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,13.7,False,62.6


In [160]:
# To remove several columns in one call, pass their names together as a list
df.drop(columns=['기온2', '고온', '화씨'], inplace=True)

### 변수선택

In [163]:
# Summarise each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132480 entries, 0 to 132479
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   num        132480 non-null  int64  
 1   date_time  132480 non-null  object 
 2   전력사용량      132480 non-null  float64
 3   기온         125760 non-null  float64
 4   풍속         125760 non-null  float64
 5   습도         125760 non-null  float64
 6   강수량        124080 non-null  float64
 7   일조         125760 non-null  float64
 8   비전기냉방설비운영  124696 non-null  float64
 9   태양광보유      124024 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 10.1+ MB


In [166]:
# Pick columns by data type: keep only the object-typed columns
df.select_dtypes(include=object)

Unnamed: 0,date_time
0,2020-06-01 00
1,2020-06-01 01
2,2020-06-01 02
3,2020-06-01 03
4,2020-06-01 04
...,...
132475,2020-08-31 19
132476,2020-08-31 20
132477,2020-08-31 21
132478,2020-08-31 22


In [168]:
# Preview the first rows of df
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [171]:
# Select columns by name: slice the column-name Index (everything from the
# third name onward) and use the resulting names to index df
df[df.columns[2:]]

Unnamed: 0,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유
0,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
132475,0.000,,,,,,,
132476,0.000,,,,,,,
132477,0.000,27.9,4.1,68.0,,0.0,1.0,1.0
132478,0.000,,,,,,,


In [172]:
# Loops: for (iterate over a fixed span) and while (repeat while a condition holds).
# A loop lets the computer carry out simple repetitive work.
# Use for when the span to walk is known (a data structure or a file);
# use while when repetition depends on a condition (`while True:` never stops
# on its own).
#
# Basic shape of a for loop:
#
#     for <loop variable> in <iterable>:
#         <code to run on every pass>
#
# - <iterable> is an ordered structure that is walked element by element:
#   a list, a Series, df.columns, a string, ...
# - <loop variable> receives the next element on every pass, so its value
#   changes as the loop advances.
# - The body is indented by four spaces; the indentation block marks which
#   code belongs to the loop.
#
# Runnable example of that shape:
for number in [1, 2, 3, 4]:
    print(number)

1
2
3
4


In [175]:
# The loop visits every element of the sequence 10, 20, 30, 40 in turn
# and prints ten times each one
for i in range(10, 50, 10):
    print(10 * i)

100
200
300
400


In [177]:
# The column names as an Index object; it can be iterated like a list
df.columns

Index(['num', 'date_time', '전력사용량', '기온', '풍속', '습도', '강수량', '일조', '비전기냉방설비운영',
       '태양광보유'],
      dtype='object')

In [179]:
# Loop example: print every column name with its number of distinct values
for col_nm, column in df.items():
    print(col_nm, column.nunique())

num 60
date_time 2208
전력사용량 52894
기온 245
풍속 159
습도 92
강수량 315
일조 31
비전기냉방설비운영 2
태양광보유 2


In [183]:
# Iterating a Series yields its values one at a time; this prints one line per row.
# NOTE(review): the recorded output is very long — the vectorized
# df['전력사용량'] * 1000 used in an earlier cell gives the same numbers without a Python loop.
for item in df['전력사용량']:
    print(item * 1000)

8179056.0
8135640.0
8107128.000000002
8048808.000000001
8043624.0
8010576.0
7978175.999999997
8019000.0
8020944.0
8083151.999999999
8116200.0
8104536.0
8088984.0
8102592.0
8088335.999999998
8076672.0
8032608.0
8013816.0
8029368.0
8028071.999999999
7994376.0
7974936.0
7972991.999999998
7945128.000000002
7920504.0
7890048.000000002
7868016.0
7847280.0
7799328.0
7801920.0
7784424.0
7836911.999999998
7873200.0
7914024.0
7973640.0
7989840.0
7935408.0
7923744.000000002
7966512.0
7996320.0
8053991.999999998
8061120.0
8059175.999999997
8061120.0
8066951.999999998
8111664.000000002
8140824.0
8120088.000000002
8114904.0
8094816.0
8090280.0
8091576.0
8057880.0
8038440.0
8019648.0
8094816.0
8151840.0
8201088.0
8261351.999999999
8278200.000000001
8319671.999999999
8363735.999999999
8458992.0
8483616.0
8506944.0
8530920.0
8655984.0
8461584.0
8344296.0
8322912.0
8271719.999999999
8255520.0
8254871.999999999
8230896.000000001
8245799.999999999
8257464.0
8240616.0
8308008.0
8262000.0
8296992.0
8328096.

### apply 함수로 컬럼에 함수 적용
> 익명 함수인 lambda 와 apply 함수를 사용하여 인자로 받는 모든 데이터에 함수를 적용  
커스텀 함수 적용도 가능

> apply 함수로 컬럼에 적용시키는 코드 구조  
 **`df['컬럼명']`** = **`df['컬럼명']`**.**`apply(함수명)`**

In [198]:
# 화씨 컬럼 추가


### 날짜 형식 데이터의 활용

In [186]:
# Convert the string-typed date_time column to a datetime type
df['date_time'] = pd.to_datetime(df['date_time'])

In [189]:
# Pick two single values with loc (row label, column name) and subtract them;
# the difference between two datetimes is a Timedelta
df.loc[5, 'date_time'] - df.loc[0, 'date_time']

Timedelta('0 days 05:00:00')

In [196]:
# Print the individual components of the first row's timestamp, one per line
first_ts = df.loc[0, 'date_time']
for part in ('year', 'month', 'day', 'hour', 'minute', 'second',
             'microsecond', 'dayofweek'):  # dayofweek: 0 = Monday ... 6 = Sunday
    print(getattr(first_ts, part))

2020
6
1
0
0
0
0
0


In [199]:
# Add calendar feature columns read from date_time through the .dt accessor.
# Each pair is (new column name, .dt attribute to read).
for new_col, dt_attr in (('hour', 'hour'),
                         ('weekday', 'dayofweek'),
                         ('day', 'day'),
                         ('month', 'month')):
    df[new_col] = getattr(df['date_time'].dt, dt_attr)
df.head()

Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,hour,weekday,day,month
0,1,2020-06-01 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,0,1,6
1,1,2020-06-01 01:00:00,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,0,1,6
2,1,2020-06-01 02:00:00,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,0,1,6
3,1,2020-06-01 03:00:00,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,0,1,6
4,1,2020-06-01 04:00:00,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4,0,1,6


In [200]:
# Save the frame with the added date features; index=False keeps the row index out of the CSV
df.to_csv('./data/final.csv', index=False)

### 데이터 범주화
간혹 연속형 데이터를 범주화(카테고리컬 데이터) 시켜야 할 경우가 있습니다.  
범주화를 위한 함수를 알아보겠습니다.

0          저온
1          저온
2          저온
3          저온
4          저온
         ... 
132475    NaN
132476    NaN
132477     고온
132478    NaN
132479    NaN
Name: 기온, Length: 132480, dtype: category
Categories (2, object): ['저온' < '고온']

## 결측치 처리
> 데이터 분석을 위해서는 데이터셋 내에 빈 값이 있는 경우 분석에 방해가 될 수 있는 여지가 많습니다.  
모든 결측치를 없애야 하는 것은 아니지만 되도록이면 결측치를 채우는 방법, 혹은 없애는 방법 등으로 결측치를 처리합니다.  
몇가지 예시를 살펴보면서 결측치 처리에 대해 알아봅시다.

In [232]:
# info() 함수는 결측치에 대한 정보도 보여줍니다.
# 컬럼별 isnull() 함수를 사용해도 무방합니다.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132480 entries, 0 to 132479
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   num        132480 non-null  int64         
 1   date_time  132480 non-null  datetime64[ns]
 2   전력사용량      122400 non-null  float64       
 3   기온         125760 non-null  float64       
 4   풍속         125760 non-null  float64       
 5   습도         125760 non-null  float64       
 6   강수량        124080 non-null  float64       
 7   일조         125760 non-null  float64       
 8   비전기냉방설비운영  124696 non-null  float64       
 9   태양광보유      124024 non-null  float64       
 10  전력사용량(Wh)  122400 non-null  float64       
dtypes: datetime64[ns](1), float64(9), int64(1)
memory usage: 11.1 MB


확인 결과 num, date_time 변수를 제외한 다른 변수에 결측치가 존재합니다.  
해당 컬럼의 결측치 샘플들을 살펴보고 결측치를 처리해 보겠습니다.

In [233]:
# 컬럼별 결측치 확인을 위한 isnull()함수 리턴값이 bool 형태로 반환되어 조건부 샘플링이 가능합니다.


Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량(Wh)
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
132475,False,False,True,True,True,True,True,True,True,True,True
132476,False,False,True,True,True,True,True,True,True,True,True
132477,False,False,True,False,False,False,True,False,False,False,True
132478,False,False,True,True,True,True,True,True,True,True,True


### 결측치 비율 계산

In [235]:
# 컬럼명을 순환하면서 결측치 비율 계산


num 0.0
date_time 0.0
전력사용량 0.08235294117647059
기온 0.054901960784313725
풍속 0.054901960784313725
습도 0.054901960784313725
강수량 0.06862745098039216
일조 0.054901960784313725
비전기냉방설비운영 0.06359477124183006
태양광보유 0.06908496732026144
전력사용량(Wh) 0.08235294117647059


In [239]:
# fillna() 함수로 NaN 값을 컬럼의 평균으로 채우기


0         17.60000
1         17.70000
2         17.50000
3         17.10000
4         17.00000
            ...   
132475    24.34667
132476    24.34667
132477    27.90000
132478    24.34667
132479    24.34667
Name: 기온, Length: 132480, dtype: float64

### 결측치 제거

In [240]:
# 마지막에 합니다.


Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량(Wh)
0,1,2020-06-01 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,8179056.0
1,1,2020-06-01 01:00:00,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,8135640.0
2,1,2020-06-01 02:00:00,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,8107128.0
3,1,2020-06-01 03:00:00,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,8048808.0
4,1,2020-06-01 04:00:00,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,8043624.0
...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19:00:00,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0,4114368.0
122396,60,2020-08-24 20:00:00,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0,3975696.0
122397,60,2020-08-24 21:00:00,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0,3572208.0
122398,60,2020-08-24 22:00:00,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0,3299184.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 122400 entries, 0 to 122399
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   num        122400 non-null  int64         
 1   date_time  122400 non-null  datetime64[ns]
 2   전력사용량      122400 non-null  float64       
 3   기온         122400 non-null  float64       
 4   풍속         122400 non-null  float64       
 5   습도         122400 non-null  float64       
 6   강수량        122400 non-null  float64       
 7   일조         122400 non-null  float64       
 8   비전기냉방설비운영  122400 non-null  float64       
 9   태양광보유      122400 non-null  float64       
 10  전력사용량(Wh)  122400 non-null  float64       
dtypes: datetime64[ns](1), float64(9), int64(1)
memory usage: 11.2 MB


이제 모든 컬럼에 결측치가 사라졌습니다. 이후 분석은 시각화를 통해 진행해보겠습니다.