# Pandas
> 데이터 과학자를 위해 테이블형태로 데이터를 다룰 수 있게 해주는 패키지 (python용 엑셀)  
numpy를 기반으로 만들어졌으며, 테이블형 데이터를 다룰 때는 numpy 대신 주로 사용  
일반인이 데이터분석을 접하기 쉽게 만들어준 결정적인 라이브러리  
pandas만으로도 충분히 데이터 분석이 가능할 정도로 고수준의 함수들을 내장  
앞으로 진행하는 데이터분석 과정에서 주로 사용하게 될 데이터구조  

## pandas 설치
> 콘솔창에서 실행 시  
**pip install pandas**
    
> 주피터 노트북으로 실행 시  
 **!pip install pandas**
    
> 아나콘다 환경으로 python 환경설정 시 기본적으로 설치가 되어있음

In [1]:
# Install pandas from inside the notebook; the leading "!" passes the line to the system shell.
!pip install pandas



In [3]:
# Runs the shell command `say` with the sentence below as its argument.
# NOTE(review): `say` is the macOS text-to-speech command; it is presumably unavailable on Windows/Linux.
!say 안녕하세요 수강생 여러분 파이썬 재미있나요

In [4]:
# !pip install pyttsx3
import pyttsx3  # text-to-speech package; uncomment the line above to install it first

# Create the speech engine, queue every sentence in order,
# then speak the whole queue in one blocking call.
engine = pyttsx3.init()
for sentence in (
    "수강생여러분.",
    "파이썬 공부하느라 고생하십니다.",
    "파이썬으로 이런것도 가능해요",
    "하지만 이해못해도 괜챦아요.",
    "왜냐하면 우리는 가져다 쓸꺼니까요. 찡긋",
):
    engine.say(sentence)
engine.runAndWait()

In [2]:
# 필요 모듈 import
import numpy as np # 벡터연산, 선형대수학 구현, 수리연산, 통계연산
import pandas as pd # pandas 라는 이름의 패키지를 불러들여라, pd 라는 이름으로 사용하겠다.
import matplotlib.pyplot as plt # 시각화의 numpy 패키지
import seaborn as sns # 분석가들이 가장 선호하는 시각화 패키지 
# 4개 필수 데이터분석용 패키지 100/100 관례적인 룰

## DataFrame 데이터 불러오기
> 엑셀에 익숙한 사용자를 위해 제작 된 테이블형태의 데이터 구조  
다양한 형태의 데이터를 받아 사용할 수 있으며 다양한 통계, 시각화 함수를 제공한다.  
실제 데이터를 불러들이고 값을 확인 해 보며 기본적인 pandas 사용법을 익혀보도록 하겠습니다.

### 데이터 불러오기
pandas는 다양한 데이터 파일 형태를 지원하며 주로 csv, xlsx, sql을 사용한다.
    
> **`read_csv()`**  
**`read_excel()`**  
**`read_sql()`**  
**`read_json()`**  

### csv 파일 로딩

In [12]:
pwd

'/Users/byun/0_lecture/08_samsung_preprocessing/2023_preprocessing_10'

In [3]:
# Load energy1.csv into a DataFrame and keep it in `df`
# (`df`, short for DataFrame, is the conventional variable name).
# The file is given as a string: folder path + file name + extension.
# encoding='cp949' tells pandas how to decode the file's bytes;
# 'utf-8' is the web-standard alternative.
df = pd.read_csv(
    './data/energy1.csv',
    encoding='cp949',
)

### excel 파일 로딩

In [13]:
# Read the Excel workbook into a DataFrame.
# If pandas reports a missing module, install the reader first: !pip install openpyxl
pd.read_excel('./data/energy1.xlsx', engine='openpyxl')
# sheet_name= : which sheet to load
# header=     : row number whose values become the column names; the data starts on the following row
# engine=     : reader library - 'openpyxl' for .xlsx, 'xlrd' for legacy .xls, 'pyxlsb' for binary .xlsb
#               ('xlsb' is not a valid engine name and makes read_excel raise ValueError)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0,0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0,0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0,0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0,0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1,1
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1,1
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1,1
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1,1


### json 파일 로딩

In [14]:
# JSON is a file format / data structure used to exchange data between computers;
# it is laid out like a Python dictionary.
pd.read_json('./data/005930.json')

Unnamed: 0,symbolCode,date,tradePrice,tradeTime,change,changePrice,changeRate,prevClosingPrice,exchangeCountry,openingPrice,highPrice,lowPrice,accTradePrice,accTradeVolume,periodTradePrice,periodTradeVolume,listedSharesCount
0,A005930,2023-02-03 15:30:19,63800,15:30:19,RISE,300,0.004724,63500,KOREA,63900,64000,63000,942509594000,14804617,942509594000,14804617,
1,A005930,2023-02-02 15:30:07,63500,15:30:07,RISE,1700,0.027508,61800,KOREA,63200,63900,62600,1474629229812,23285983,1474629229812,23285983,
2,A005930,2023-02-01 15:30:23,61800,15:30:23,RISE,800,0.013115,61000,KOREA,62600,62700,61000,1145781815984,18570133,1145781815984,18570133,
3,A005930,2023-01-31 15:30:07,61000,15:30:07,FALL,2300,-0.036335,63300,KOREA,63500,63700,61000,1835768640685,29746731,1835768640685,29746731,
4,A005930,2023-01-30 15:30:19,63300,15:30:19,FALL,1300,-0.020124,64600,KOREA,64900,64900,63100,1337025734920,20995234,1337025734920,20995234,
5,A005930,2023-01-27 15:30:08,64600,15:30:08,RISE,700,0.010955,63900,KOREA,64400,65000,63900,1212764792491,18760182,1212764792491,18760182,
6,A005930,2023-01-26 15:30:14,63900,15:30:14,RISE,500,0.007886,63400,KOREA,63800,63900,63300,846408637700,13278277,846408637700,13278277,
7,A005930,2023-01-25 15:30:24,63400,15:30:24,RISE,1600,0.02589,61800,KOREA,63500,63700,63000,1066200962700,16822710,1066200962700,16822710,
8,A005930,2023-01-20 15:30:22,61800,15:30:22,RISE,300,0.004878,61500,KOREA,62100,62300,61100,595372614900,9646327,595372614900,9646327,
9,A005930,2023-01-19 15:30:25,61500,15:30:25,RISE,1100,0.018212,60400,KOREA,60500,61500,60400,781937546636,12808490,781937546636,12808490,


### API를 활용하여 웹에서 수집한 데이터 로딩

In [15]:
import requests # HTTP client used to send the request
import json # JSON parsing

# Endpoint queried for symbol A005930 (page 1, 10 rows per page, per the query string).
url = 'https://finance.daum.net/api/quote/A005930/days?symbolCode=A005930&page=1&perPage=10&pagination=true'
# Headers sent with the request: the referring page and a browser-like user-agent.
info = {
    'referer': 'https://finance.daum.net/quotes/A005930?period=day',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
}

resp = requests.get(url, headers=info, timeout=10) # send the request; give up after 10 s instead of hanging forever
resp.raise_for_status() # fail loudly on an HTTP error instead of trying to parse an error page
data = json.loads(resp.text) # JSON text -> Python dict
pd.DataFrame(data['data']) # records stored under the 'data' key -> DataFrame

Unnamed: 0,symbolCode,date,tradePrice,tradeTime,change,changePrice,changeRate,prevClosingPrice,exchangeCountry,openingPrice,highPrice,lowPrice,accTradePrice,accTradeVolume,periodTradePrice,periodTradeVolume,listedSharesCount
0,A005930,2023-07-18 15:30:22,72000.0,15:30:22,FALL,1300.0,-0.017735,73300.0,KOREA,73200.0,73500.0,72000.0,814943721900,11248898,814943721900,11248898,
1,A005930,2023-07-17 15:30:05,73300.0,15:30:05,FALL,100.0,-0.001362,73400.0,KOREA,73200.0,73500.0,72800.0,736819227400,10060049,736819227400,10060049,
2,A005930,2023-07-14 15:30:20,73400.0,15:30:20,RISE,1500.0,0.020862,71900.0,KOREA,72500.0,73400.0,72400.0,1159932414471,15882519,1159932414471,15882519,
3,A005930,2023-07-13 15:30:22,71900.0,15:30:22,EVEN,0.0,0.0,71900.0,KOREA,72400.0,72600.0,71900.0,1041644744600,14417279,1041644744600,14417279,
4,A005930,2023-07-12 15:30:05,71900.0,15:30:05,RISE,400.0,0.005594,71500.0,KOREA,71200.0,72000.0,71100.0,742662492675,10375581,742662492675,10375581,
5,A005930,2023-07-11 15:30:04,71500.0,15:30:04,RISE,2000.0,0.028777,69500.0,KOREA,70200.0,71500.0,70100.0,863673766650,12177392,863673766650,12177392,
6,A005930,2023-07-10 15:30:14,69500.0,15:30:14,FALL,400.0,-0.005722,69900.0,KOREA,70000.0,70400.0,69200.0,816772079400,11713926,816772079400,11713926,
7,A005930,2023-07-07 15:30:13,69900.0,15:30:13,FALL,1700.0,-0.023743,71600.0,KOREA,71100.0,71400.0,69800.0,1215404338500,17308877,1215404338500,17308877,
8,A005930,2023-07-06 15:30:23,71600.0,15:30:23,FALL,400.0,-0.005556,72000.0,KOREA,71900.0,72400.0,71500.0,1061491980700,14777667,1061491980700,14777667,
9,A005930,2023-07-05 15:30:24,72000.0,15:30:24,FALL,1000.0,-0.013699,73000.0,KOREA,73000.0,73300.0,71900.0,889637363400,12310610,889637363400,12310610,


### 데이터베이스에서 쿼리를 사용한 데이터 로딩

In [None]:
# 참고! 실습은 하지 않습니다만 쿼리를 사용하여 데이터베이스로부터 데이터프레임을 만드는 것도 가능합니다.
# 데이터베이스로 부터 자료 읽기

# 필요한 모듈 추가 설치 - 각 데이터베이스 별로 다릅니다.
# !pip install pymysql

# sql 모듈 로드하기
# import pymysql
# mysql, mariadb, sqlite, postgresql, ms-sql, oracle, mongodb

# 접속하기
# 접속방법 또한 DB 종류에 따라 다릅니다.
# con = pymysql.connect(host='db서버주소', port=3306, user='id', passwd='pwd', db='dbname')

# query 만들기
# query = 'select * from samples'

# 자료 불러오기
# data = pd.read_sql(query, con=con)

## 데이터 저장하기
불러들인 혹은 작업을 마친 데이터프레임을 다양한 파일형태로 저장이 가능합니다.  
데이터분석 과정은 원본데이터를 되도록이면 유지하며 전처리를 진행하지만 주기적으로 백업은 진행하는 것이 좋습니다.  
> **`to_csv()`**  
**`to_excel()`**  

In [19]:
# Save the DataFrame to a CSV file
df.to_csv('./data/save_test.csv', index=False)
# index=False: do not write the row index to the file as an extra column

In [20]:
# Writing an Excel file takes a while.
# No encoding argument is passed: .xlsx stores text as Unicode, and the `encoding`
# parameter of to_excel was deprecated in pandas 1.5 and removed in pandas 2.0
# (passing it there raises TypeError).
df.to_excel('./data/save_test2.xlsx')

## 사용 데이터 간략 설명
> 한국에너지관리공단에서 제공한 전력사용량 데이터  
1시간 간격으로 수집 된 60개 건물들의 2020년 6월 1일 부터 2020년 8월 24일까지의 데이터  
건물정보와 기후정보를 활용한 전력사용량을 예측하기 위한 데이터셋  
대회홈페이지 : https://dacon.io/competitions/official/235736/overview/description
>> 각 변수(컬럼) 설명  
>>- num : 건물번호  
>>- date_time : 데이터가 수집 된 날짜, 시간  
>>- 전력사용량 : 수집 된 시점에 사용한 전력량  
>>- 비전기냉방설비운영 : 0-미운영, 1-운영  
>>- 태양광보유 : 0-미보유, 1-보유

In [21]:
# Display the DataFrame held in `df`
df

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0


## 데이터 살펴보기

In [22]:
# The first thing to do after loading data:
# get a feel for its structure and shape.
# Show the first 5 rows.
df.head()
# A single row of the data is also called a sample, event, record, instance, row, or observation.

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [24]:
# Show the last 10 rows (tail() with no argument would return the last 5)
df.tail(10)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
122390,60,2020-08-24 14,4269.456,29.5,3.1,59.0,0.0,1.0,1.0,1.0
122391,60,2020-08-24 15,4323.888,29.5,3.4,58.0,0.0,1.0,1.0,1.0
122392,60,2020-08-24 16,4294.08,29.4,3.1,59.0,0.0,1.0,1.0,1.0
122393,60,2020-08-24 17,4212.432,28.7,2.9,60.0,0.0,1.0,1.0,1.0
122394,60,2020-08-24 18,4189.104,28.5,2.2,66.0,0.0,1.0,1.0,1.0
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0
122399,60,2020-08-24 23,3204.576,27.1,2.6,75.0,0.0,0.0,1.0,1.0


In [25]:
# Count the number of rows in the data
len(df)

122400

In [4]:
# Check the shape of the data as (number of rows, number of columns)
df.shape # an attribute, not a method: it is read as a value, so there are no parentheses

(122400, 10)

In [None]:
# 컬럼, 피쳐 feature, 변수

In [5]:
# Print an overview of the data: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122400 entries, 0 to 122399
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   num         122400 non-null  int64  
 1   date_time   122400 non-null  object 
 2   전력사용량(kWh)  122400 non-null  float64
 3   기온(°C)      122400 non-null  float64
 4   풍속(m/s)     122400 non-null  float64
 5   습도(%)       122400 non-null  float64
 6   강수량(mm)     122400 non-null  float64
 7   일조(hr)      122400 non-null  float64
 8   비전기냉방설비운영   122400 non-null  float64
 9   태양광보유       122400 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 9.3+ MB


In [6]:
# Show the basic descriptive statistics of the data
df.describe()

Unnamed: 0,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
count,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0,122400.0
mean,30.5,2324.830866,24.251713,2.151641,80.169848,0.514989,0.213533,0.683333,0.483333
std,17.318173,2058.999326,3.407902,1.514475,15.525862,2.624505,0.370517,0.465178,0.499724
min,1.0,0.0,11.1,0.0,19.0,0.0,0.0,0.0,0.0
25%,15.75,1055.268,21.8,1.1,70.0,0.0,0.0,0.0,0.0
50%,30.5,1700.352,24.2,1.9,84.0,0.0,0.0,1.0,0.0
75%,45.25,2780.487,26.5,2.9,93.0,0.0,0.3,1.0,1.0
max,60.0,17739.225,36.3,20.1,100.0,81.5,1.0,1.0,1.0


In [7]:
# Column names (the variables of the dataset)
df.columns

Index(['num', 'date_time', '전력사용량(kWh)', '기온(°C)', '풍속(m/s)', '습도(%)',
       '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유'],
      dtype='object')

데이터셋을 살펴 본 결과 시간별로 관측 된 기후데이터와 전력사용량을 확인 할 수 있었습니다.  
각각의 관측치(샘플)에 대한 정보를 유추하고, 건물별로 수집 된 데이터의 특징을 볼 수 있었습니다.  
데이터의 크기, 사이즈, 기초통계량을 바탕으로 조금 더 디테일하게 데이터를 살펴보겠습니다.

## 데이터접근 (인덱싱, 슬라이싱, 샘플링)

In [14]:
df.iloc[0:10] # iloc is position-based: whatever goes inside [] must be integer positions
df.iloc[[0]] # a list of positions returns a DataFrame (one row here); df.iloc[0] would return a Series

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0


In [17]:
# Look at the data of the first sample (also called record, instance, or data point).
# .iloc[position] accesses rows by integer position; .values returns the row's values as an array.
df.iloc[0].values

array([1, '2020-06-01 00', 8179.056, 17.6, 2.5, 92.0, 0.8, 0.0, 0.0, 0.0],
      dtype=object)

In [18]:
# Rows at positions 10 through 20 (the slice end, 21, is excluded)
df.iloc[10:21]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
10,1,2020-06-01 10,8116.2,20.5,3.4,62.0,0.0,1.0,0.0,0.0
11,1,2020-06-01 11,8104.536,22.1,3.6,52.0,0.0,1.0,0.0,0.0
12,1,2020-06-01 12,8088.984,23.1,4.0,49.0,0.0,1.0,0.0,0.0
13,1,2020-06-01 13,8102.592,23.1,5.1,42.0,0.0,1.0,0.0,0.0
14,1,2020-06-01 14,8088.336,23.6,5.1,39.0,0.0,1.0,0.0,0.0
15,1,2020-06-01 15,8076.672,23.8,5.5,40.0,0.0,1.0,0.0,0.0
16,1,2020-06-01 16,8032.608,24.4,3.0,39.0,0.0,1.0,0.0,0.0
17,1,2020-06-01 17,8013.816,23.6,4.4,41.0,0.0,1.0,0.0,0.0
18,1,2020-06-01 18,8029.368,22.7,4.1,42.0,0.0,1.0,0.0,0.0
19,1,2020-06-01 19,8028.072,21.3,4.2,44.0,0.0,1.0,0.0,0.0


In [19]:
# Access several observations at once by passing a list of positions
df.iloc[[0, 10, 20]]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
10,1,2020-06-01 10,8116.2,20.5,3.4,62.0,0.0,1.0,0.0,0.0
20,1,2020-06-01 20,7994.376,19.8,2.5,51.0,0.0,0.4,0.0,0.0


In [20]:
# Access a single column by name (the result is a Series)
df['전력사용량(kWh)']

0         8179.056
1         8135.640
2         8107.128
3         8048.808
4         8043.624
            ...   
122395    4114.368
122396    3975.696
122397    3572.208
122398    3299.184
122399    3204.576
Name: 전력사용량(kWh), Length: 122400, dtype: float64

In [21]:
# Access several columns at once by passing a list of column names
df[['date_time', '전력사용량(kWh)']]

Unnamed: 0,date_time,전력사용량(kWh)
0,2020-06-01 00,8179.056
1,2020-06-01 01,8135.640
2,2020-06-01 02,8107.128
3,2020-06-01 03,8048.808
4,2020-06-01 04,8043.624
...,...,...
122395,2020-08-24 19,4114.368
122396,2020-08-24 20,3975.696
122397,2020-08-24 21,3572.208
122398,2020-08-24 22,3299.184


In [26]:
# Name of the first column
df.columns[0]

'num'

In [27]:
# Select rows and columns at the same time.
# df.loc[row labels, column names]: label-based, and the end of a label slice IS included
df.loc[10:20, ['date_time','전력사용량(kWh)']]
# df.iloc[row positions, column positions]: position-based, slice ends are excluded,
# so this picks the same rows and columns as the line above
df.iloc[10:21, 1:3]

Unnamed: 0,date_time,전력사용량(kWh)
10,2020-06-01 10,8116.2
11,2020-06-01 11,8104.536
12,2020-06-01 12,8088.984
13,2020-06-01 13,8102.592
14,2020-06-01 14,8088.336
15,2020-06-01 15,8076.672
16,2020-06-01 16,8032.608
17,2020-06-01 17,8013.816
18,2020-06-01 18,8029.368
19,2020-06-01 19,8028.072


In [51]:
# 컬럼명 인덱싱


'전력사용량(kWh)'

In [52]:
# 컬럼명 순환


num
date_time
전력사용량(kWh)
기온(°C)
풍속(m/s)
습도(%)
강수량(mm)
일조(hr)
비전기냉방설비운영
태양광보유


In [53]:
# 컬럼명을 순환하면서 각 컬럼의 고윳값 출력


num [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60]
date_time ['2020-06-01 00' '2020-06-01 01' '2020-06-01 02' ... '2020-08-24 21'
 '2020-08-24 22' '2020-08-24 23']
전력사용량(kWh) [8179.056 8135.64  8107.128 ... 4294.08  4212.432 3975.696]
기온(°C) [17.6 17.7 17.5 17.1 17.  16.9 16.7 17.8 19.3 20.5 22.1 23.1 23.6 23.8
 24.4 22.7 21.3 19.8 18.6 17.9 17.2 16.6 16.2 15.9 15.7 15.5 14.8 15.
 16.  17.3 16.3 20.4 20.9 21.7 21.2 20.6 20.3 19.9 19.6 19.  18.4 18.3
 18.2 18.1 19.7 20.8 22.  26.1 26.8 27.9 28.2 28.1 21.  20.1 20.  22.4
 23.9 25.  26.  26.4 25.4 24.3 23.2 22.8 21.4 19.5 18.9 18.7 22.3 25.1
 26.3 27.2 27.8 28.5 28.6 25.8 21.1 20.7 19.4 19.2 24.  25.6 29.3 29.4
 30.5 30.6 29.1 27.1 25.2 24.8 23.4 27.4 28.4 28.9 28.  25.7 24.5 23.
 21.6 22.5 25.5 29.8 31.2 31.6 30.7 22.9 27.7 29.5 31.1 31.9 32.6 32.7
 32.5 30.  26.6 24.6 24.9 29.  30.3 31.  28.7 23.7 22.6

## 팬시인덱싱 전달
기본적인 인덱싱 방법에 추가로 조건에 따른 데이터 샘플링도 가능합니다.  
넘파이의 bool 타입데이터를 인덱스로 전달받는 방법으로 조건에 따른 데이터를 선별한다면 조금 더 고차원적인 데이터 선택이 가능합니다.

In [32]:
# Boolean-mask indexing: a condition is passed as the row selector.
# Keep only the observations of building number 1, then average their power usage.
df.loc[df['num'] == 1, '전력사용량(kWh)'].mean()

8543.273488235294

In [35]:
# Observations whose power usage is 15000 kWh or more
df.loc[df['전력사용량(kWh)'] >= 15000]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
14459,8,2020-06-08 11,15177.411,27.9,1.2,53.0,0.0,1.0,1.0,1.0
14464,8,2020-06-08 16,15152.454,31.0,1.5,40.0,0.0,1.0,1.0,1.0
14482,8,2020-06-09 10,15139.800,26.7,0.9,50.0,0.0,1.0,1.0,1.0
14483,8,2020-06-09 11,15417.936,28.7,1.8,45.0,0.0,1.0,1.0,1.0
14485,8,2020-06-09 13,15177.420,30.7,1.4,36.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
16309,8,2020-08-24 13,17423.793,31.5,2.0,56.0,0.0,1.0,1.0,1.0
16310,8,2020-08-24 14,17458.137,31.4,1.9,55.0,0.0,1.0,1.0,1.0
16311,8,2020-08-24 15,17230.671,32.1,2.1,56.0,0.0,1.0,1.0,1.0
16312,8,2020-08-24 16,17300.313,32.3,1.6,54.0,0.0,1.0,1.0,1.0


In [38]:
# Observations recorded at or after the given time (>= includes the boundary itself).
# date_time holds strings (dtype object), so this is a string comparison; it works
# because the 'YYYY-MM-DD HH' format sorts in chronological order.
df.loc[df['date_time'] >= '2020-08-01 00']

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
1464,1,2020-08-01 00,8623.584,25.0,3.4,90.0,1.0,0.0,0.0,0.0
1465,1,2020-08-01 01,8591.184,24.6,0.5,90.0,0.2,0.0,0.0,0.0
1466,1,2020-08-01 02,8582.112,24.6,0.5,91.0,0.0,0.0,0.0,0.0
1467,1,2020-08-01 03,8593.776,24.7,1.2,93.0,0.0,0.0,0.0,0.0
1468,1,2020-08-01 04,8575.632,24.7,0.8,94.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0


In [40]:
# List the column names to pick from for the conditions below
df.columns

Index(['num', 'date_time', '전력사용량(kWh)', '기온(°C)', '풍속(m/s)', '습도(%)',
       '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유'],
      dtype='object')

In [44]:
# Combine conditions element-wise: | is "or", & is "and".
# Each condition must be wrapped in its own parentheses.
# Example with two conditions: hot and humid observations,
# i.e. temperature of 30 degrees or more AND humidity of 80% or more.
df.loc[(df['기온(°C)'] >= 30) & (df['습도(%)'] >= 80)]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
18064,9,2020-08-12 16,1308.312,30.0,1.2,83.0,0.0,0.2,0.0,1.0
28238,14,2020-08-11 14,2634.660,30.4,4.9,81.0,0.3,0.3,1.0,1.0
32152,16,2020-08-04 16,1909.008,30.0,2.4,81.0,0.0,0.2,1.0,1.0
32172,16,2020-08-05 12,2010.420,30.4,4.0,81.0,0.0,0.8,1.0,1.0
32362,16,2020-08-13 10,2016.252,30.4,1.7,83.0,0.0,0.4,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
116051,57,2020-08-15 11,2235.168,30.1,3.1,81.0,0.0,0.9,1.0,0.0
116075,57,2020-08-16 11,2255.040,30.6,3.0,81.0,0.0,1.0,1.0,0.0
116082,57,2020-08-16 18,2365.632,30.1,3.2,82.0,0.0,0.5,1.0,0.0
116107,57,2020-08-17 19,2464.992,30.3,0.9,82.0,0.0,0.3,1.0,0.0


In [50]:
# Filtering by a set of specific values instead of a chain of conditions.
# Chaining == with | works (first line) but grows with every extra value;
# isin() takes the whole list at once (second line, the one displayed).
# Building numbers 10, 20, 30, 40, 50, 60
df.loc[(df['num'] == 10) | (df['num'] == 20)]
df.loc[df['num'].isin([10, 20, 30, 40, 50, 60])]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
18360,10,2020-06-01 00,417.960,18.3,2.5,94.0,0.0,0.0,1.0,0.0
18361,10,2020-06-01 01,407.592,18.8,2.3,89.0,0.0,0.0,1.0,0.0
18362,10,2020-06-01 02,404.028,19.0,3.2,86.0,0.0,0.0,1.0,0.0
18363,10,2020-06-01 03,397.548,19.0,4.1,85.0,0.0,0.0,1.0,0.0
18364,10,2020-06-01 04,397.224,18.9,3.9,87.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0


In [54]:
# str.contains() is True for every row whose text contains the given substring
df.loc[df['date_time'].str.contains('08-15 00')]
# Handy for free-text fields, e.g. an e-commerce product name such as "오뚜기 콩으로만든 간장 5ml 4EA"

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
1800,1,2020-08-15 00,8718.192,26.5,2.8,92.0,0.0,0.0,0.0,0.0
3840,2,2020-08-15 00,1082.808,26.3,2.2,89.0,0.0,0.0,1.0,0.0
5880,3,2020-08-15 00,3460.444235,26.8,2.0,93.0,0.0,0.0,1.0,1.0
7920,4,2020-08-15 00,563.328,30.5,3.1,65.0,0.0,0.0,1.0,1.0
9960,5,2020-08-15 00,3512.16,26.3,2.2,89.0,0.0,0.0,1.0,0.0
12000,6,2020-08-15 00,609.12,26.8,2.0,93.0,0.0,0.0,0.0,0.0
14040,7,2020-08-15 00,1311.228,26.5,2.8,92.0,0.0,0.0,1.0,0.0
16080,8,2020-08-15 00,5555.304,26.9,1.8,84.0,0.0,0.0,1.0,1.0
18120,9,2020-08-15 00,1330.02,27.2,1.1,92.0,0.0,0.0,0.0,1.0
20160,10,2020-08-15 00,462.672,26.3,2.2,89.0,0.0,0.0,1.0,0.0


In [56]:
# Pick rows at random, either by count or by fraction (here 10% of the rows)
df.sample(frac=0.1)
# n= number of rows to draw, frac= fraction of rows to draw

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
19476,10,2020-07-17 12,2592.324,25.3,2.9,66.0,0.0,1.0,1.0,0.0
107542,53,2020-07-31 22,1204.308,23.8,0.0,97.0,0.0,0.0,1.0,0.0
68765,34,2020-07-31 05,655.776,23.3,0.1,87.0,0.0,0.0,1.0,1.0
64997,32,2020-08-13 05,2382.696,26.0,1.3,96.0,0.0,0.0,0.0,0.0
48821,24,2020-08-19 05,3125.520,23.0,1.2,100.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
11836,6,2020-08-08 04,594.000,23.9,2.2,90.0,0.0,0.0,0.0,0.0
53494,27,2020-06-19 22,704.970,22.0,3.0,81.0,0.0,0.0,1.0,1.0
22624,12,2020-06-08 16,2757.024,27.5,4.0,50.0,0.0,1.0,1.0,1.0
61422,31,2020-06-10 06,5808.240,22.5,1.4,61.0,0.0,0.0,1.0,0.0


## 집계값 계산

In [70]:
# Distinct values that appear in the column
df['태양광보유'].unique()

array([0., 1.])

In [71]:
# Aggregates of the power-usage column:
# mean, sum, max, min, standard deviation, variance
print(df['전력사용량(kWh)'].mean())
print(df['전력사용량(kWh)'].sum())
print(df['전력사용량(kWh)'].max())
print(df['전력사용량(kWh)'].min())
print(df['전력사용량(kWh)'].std())
print(df['전력사용량(kWh)'].var())
print(df['전력사용량(kWh)'].argmax()) # integer position of the maximum value
print(df['전력사용량(kWh)'].argmin()) # integer position of the minimum value
print(df['전력사용량(kWh)'].unique()) # unique values
print(df['전력사용량(kWh)'].nunique()) # number of unique values

2324.830865868444
284559297.98229754
17739.225
0.0
2058.999325845112
4239478.223830625
16166
54684
[8179.056 8135.64  8107.128 ... 4294.08  4212.432 3975.696]
52894


In [78]:
# Inspect five consecutive rows starting at position 54684
df.iloc[[54684, 54685, 54686, 54687, 54688]]

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
54684,27,2020-08-08 12,0.0,23.0,2.1,88.0,0.3,0.0,1.0,1.0
54685,27,2020-08-08 13,0.0,22.4,1.4,93.0,6.6,0.0,1.0,1.0
54686,27,2020-08-08 14,0.0,23.0,3.2,91.0,5.5,0.0,1.0,1.0
54687,27,2020-08-08 15,0.0,23.5,0.7,95.0,1.0,0.0,1.0,1.0
54688,27,2020-08-08 16,0.0,23.7,1.4,95.0,0.3,0.0,1.0,1.0


In [77]:
# Works best on columns with a continuous (real-valued) distribution.
# nlargest: the largest values in the column (5 when no count is given)
df['전력사용량(kWh)'].nlargest()
# nsmallest: the smallest values in the column (10 requested here; this is the result displayed)
df['전력사용량(kWh)'].nsmallest(10)

54684      0.000
54685      0.000
54686      0.000
54687      0.000
54688      0.000
55044     85.320
6343     138.240
55018    146.880
6463     152.496
6319     152.928
Name: 전력사용량(kWh), dtype: float64

In [80]:
# For categorical data: count the rows per distinct value, sorted in descending order of count
df['태양광보유'].value_counts()

0.0    63240
1.0    59160
Name: 태양광보유, dtype: int64

## 데이터 재구조화
기존의 데이터 샘플링과는 달리 기준점으로 생각할 수 있는 컬럼 값을 기준으로 새롭게 데이터 프레임을 생성하며  
평균, 합, 카운트 등을 통해 데이터를 다차원적으로 분석 할 수 있는 함수를 제공합니다.
> 기준 변수(컬럼)가 한개 일 경우
>> **`df`**.**`groupby('컬럼명')`**.**`agg(집계방법)`**  

> 기준 변수(컬럼)가 두개 이상일 경우  
>>  **`pd`**.**`pivot_table(data=데이터프레임명, index=기준컬럼1, columns=기준컬럼2, values=집계데이터, aggfunc=집계방법)`**  

>> 적용가능한 통계 함수

| 함수명 | 내용 |
|-|-|
| count | 갯수 |
| sum | 합 |
| mean | 평균 |
| median | 중앙값 |
| var, std | 분산, 표준편차 |
| min, max | 최소값, 최대값 |
| unique, nunique | 고윳값, 고윳값 갯수 |
| prod | 곱 |

In [104]:
### groupby: a variable with a categorical distribution is the recommended grouping key
# Select the columns before aggregating so the mean is computed only for the
# two columns of interest instead of for every numeric column.
df.groupby('num')[['기온(°C)','풍속(m/s)']].mean()
# Different aggregations per column, given as {column: [functions]}.
df.groupby('num', as_index=False).agg({'기온(°C)':['mean', 'max'],
                       '풍속(m/s)': ['mean', 'min', 'var', 'sum']})
# as_index=False: the grouping variable is not used as the index; it stays a regular
# column and the result gets a fresh integer index.

Unnamed: 0_level_0,num,기온(°C),기온(°C),풍속(m/s),풍속(m/s),풍속(m/s),풍속(m/s)
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,min,var,sum
0,1,24.608578,35.2,2.34598,0.0,1.234815,4785.8
1,2,23.623333,33.1,3.19799,0.0,3.770045,6523.9
2,3,24.241275,33.6,1.937402,0.0,1.490513,3952.3
3,4,25.143627,36.3,2.056324,0.0,1.57228,4194.9
4,5,23.623333,33.1,3.19799,0.0,3.770045,6523.9
5,6,24.241275,33.6,1.937402,0.0,1.490513,3952.3
6,7,24.608578,35.2,2.34598,0.0,1.234815,4785.8
7,8,24.124363,33.1,1.305735,0.0,0.581875,2663.7
8,9,24.243284,35.5,0.991127,0.0,0.542957,2021.9
9,10,23.623333,33.1,3.19799,0.0,3.770045,6523.9


In [105]:
# Re-check the first rows before building the pivot table
df.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [106]:
# pivot_table: aggregate with two grouping variables (categorical columns work best).
# Rows = 비전기냉방설비운영, columns = 태양광보유, cells = mean of 전력사용량(kWh).
df.pivot_table(
    values='전력사용량(kWh)',
    index='비전기냉방설비운영',
    columns='태양광보유',
    aggfunc='mean',
)

태양광보유,0.0,1.0
비전기냉방설비운영,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2077.030506,1582.298192
1.0,2101.99359,2949.726621


## 데이터로 부터 의미있는 정보 추출하기

In [81]:
# Column names available for the questions below
df.columns

Index(['num', 'date_time', '전력사용량(kWh)', '기온(°C)', '풍속(m/s)', '습도(%)',
       '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유'],
      dtype='object')

In [89]:
# Total power usage of building number 10:
# build the row mask first, then sum the power-usage column over those rows.
is_building_10 = df['num'] == 10
df.loc[is_building_10, '전력사용량(kWh)'].sum()

2911414.7000010004

In [93]:
# Mean power usage over the observations where the temperature is 30 degrees or higher
df.loc[df['기온(°C)'] >= 30, '전력사용량(kWh)'].mean()

2901.692666127581

In [110]:
# The 10 buildings with the highest mean power usage, highest first
# (per-building means of the numeric columns, then the top 10 by power usage)
df.groupby('num').mean(numeric_only=True).nlargest(10, '전력사용량(kWh)')

Unnamed: 0_level_0,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8,8837.364073,24.124363,1.305735,79.162255,0.488922,0.199167,1.0,1.0
1,8543.273488,24.608578,2.34598,75.971569,0.493873,0.184608,0.0,0.0
38,7588.679824,24.608578,2.34598,75.971569,0.493873,0.184608,1.0,1.0
54,6839.836376,24.241275,1.937402,82.667157,0.540882,0.219657,1.0,0.0
31,5964.317576,24.608578,2.34598,75.971569,0.493873,0.184608,1.0,0.0
30,5761.224314,23.853137,5.136127,92.832843,0.298578,0.236569,1.0,1.0
52,3779.359279,24.608578,2.34598,75.971569,0.493873,0.184608,1.0,1.0
42,3624.592027,23.323186,2.781814,80.238824,0.395833,0.246765,1.0,1.0
3,3371.353699,24.241275,1.937402,82.667157,0.540882,0.219657,1.0,1.0
60,3354.771964,23.323186,2.781814,80.238824,0.395833,0.246765,1.0,1.0


## 데이터프레임 병합
> 실제 분석업무를 진행하다보면 데이터가 여기저기 분산되어 있을 경우가 더 많습니다.  
조각난 데이터를 분석에 필요한 데이터셋으로 만들기 위해 데이터프레임 병합을 많이 사용합니다.  
두 개 이상의 데이터프레임을 병합할 때 주로 사용하는 함수 2가지를 알아보겠습니다.  

### 데이터 병합에 사용가능한 key(병합할 기준이 되는 행 or 열)값이 있는경우
> **`pd`**.**`merge(베이스데이터프레임, 병합할데이터프레임)`**

>> 사용가능한 파라메터
>> - how : 'left', 'right', 'inner', 'outer'
>> - left_on : key값이 다를 경우 베이스데이터프레임의 key 설정
>> - right_on : key값이 다를 경우 병합데이터프레임의 key 설정    


In [13]:
# Two small score tables that share the '이름' column, used to practise merging.

# One (이름, 국어, 영어) record per student.
merge_df1 = pd.DataFrame(
    [
        ('원영', 100, 100),
        ('사쿠라', 70, 90),
        ('유리', 70, 80),
        ('예나', 70, 50),
        ('유진', 60, 70),
        ('나코', 90, 100),
        ('은비', 90, 70),
        ('혜원', 70, 90),
        ('히토미', 70, 100),
        ('채원', 80, 100),
        ('민주', 100, 80),
        ('째욘', 100, 100),
    ],
    columns=['이름', '국어', '영어'],
)

# One (일어, 수학, 이름) record per student; only some of the students above appear here.
merge_df2 = pd.DataFrame(
    [
        (80, 90, '원영'),
        (100, 70, '사쿠라'),
        (100, 100, '나코'),
        (90, 80, '히토미'),
        (70, 70, '예나'),
        (50, 80, '은비'),
        (100, 90, '째욘'),
    ],
    columns=['일어', '수학', '이름'],
)

Unnamed: 0,일어,수학,name
0,80,90,원영
1,100,70,사쿠라
2,100,100,나코
3,90,80,히토미
4,70,70,예나
5,50,80,은비
6,100,90,째욘


In [16]:
# merge 테스트


Unnamed: 0,이름,국어,영어,일어,수학,name
0,원영,100,100,80,90,원영
1,사쿠라,70,90,100,70,사쿠라
2,나코,90,100,100,100,나코
3,히토미,70,100,90,80,히토미
4,예나,70,50,70,70,예나
5,은비,90,70,50,80,은비
6,째욘,100,100,100,90,째욘


### 단순 데이터 연결
> **`pd`**.**`concat([베이스데이터프레임, 병합할데이터프레임], axis=0 or 1)`**  
현재 df에 저장되어있는 데이터에 추가로 데이터를 이어붙여보겠습니다.  
df1 이라는 변수에 이어붙일 데이터를 불러들여 병합을 진행해보겠습니다.  

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,


In [141]:
# concat병합은 데이터를 그대로 이어붙이기 때문에 컬럼 순서나 shape이 동일한지 확인 해야 합니다.
# 모두 참이여야 참

ValueError: Lengths must match to compare

Index(['num', 'date_time', '전력사용량(kWh)', '기온(°C)', '풍속(m/s)', '습도(%)',
       '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유'],
      dtype='object')

Index(['num', 'date_time', '기온(°C)', '풍속(m/s)', '습도(%)', '강수량(mm, 6시간)',
       '일조(hr, 3시간)', '비전기냉방설비운영', '태양광보유'],
      dtype='object')

In [147]:
# 전력사용량 변수 동일 위치에 추가


In [148]:
# 컬럼명 동일하게 저장


In [150]:
# concat병합은 데이터를 그대로 이어붙이기 때문에 컬럼 순서나 shape이 동일한지 확인 해야 합니다.
# 모두 참이여야 참

True

In [151]:

# axis=0 행기준 병합(기본설정), axis=1 열기준 병합

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
10075,60,2020-08-31 19,,,,,,,,
10076,60,2020-08-31 20,,,,,,,,
10077,60,2020-08-31 21,,27.9,4.1,68.0,,0.0,1.0,1.0
10078,60,2020-08-31 22,,,,,,,,


## 인덱스 편집
> 방금 전 concat으로 병합한 데이터프레임의 이상한 점을 찾으셨나요?  
데이터 자체는 잘 붙였지만 인덱스가 꼬여있습니다.  
인덱스 조작은 데이터분석을 위해 필요한 인덱스를 설정하기 위해 필요합니다.

In [153]:
# 뷰를 본다


Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
10075,60,2020-08-31 19,,,,,,,,
10076,60,2020-08-31 20,,,,,,,,
10077,60,2020-08-31 21,,27.9,4.1,68.0,,0.0,1.0,1.0
10078,60,2020-08-31 22,,,,,,,,


In [155]:
# 인덱스 초기화
# 기존에 엉켜있던 인덱스는 지우고 원본값을 변경하는 매개변수를 추가


In [156]:
# 기존 컬럼값을 취해 index로 사용


Unnamed: 0_level_0,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-06-01 00,1,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
2020-06-01 01,1,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2020-06-01 02,1,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
2020-06-01 03,1,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
2020-06-01 04,1,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2020-08-31 19,60,,,,,,,,
2020-08-31 20,60,,,,,,,,
2020-08-31 21,60,,27.9,4.1,68.0,,0.0,1.0,1.0
2020-08-31 22,60,,,,,,,,


## 컬럼 편집
> 인덱스 조작과 마찬가지로 데이터프레임의 컬럼을 변경해야 할 경우도 있습니다.  
데이터프레임은 컬럼단위 샘플링 및 인덱싱, 이름변경이 가능합니다.

In [162]:
# 컬럼명 변경


Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
132475,60,2020-08-31 19,,,,,,,,
132476,60,2020-08-31 20,,,,,,,,
132477,60,2020-08-31 21,,27.9,4.1,68.0,,0.0,1.0,1.0
132478,60,2020-08-31 22,,,,,,,,


In [165]:
# 컬럼 생성


Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력x10,전력x20,전력x30
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,81790.56,163581.12,245371.68
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,81356.4,162712.8,244069.2
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,81071.28,162142.56,243213.84
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,80488.08,160976.16,241464.24
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,80436.24,160872.48,241308.72


In [168]:
# 컬럼삭제


0         245371.68
1         244069.20
2         243213.84
3         241464.24
4         241308.72
            ...    
132475          NaN
132476          NaN
132477          NaN
132478          NaN
132479          NaN
Name: 전력x30, Length: 132480, dtype: float64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132480 entries, 0 to 132479
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   num        132480 non-null  int64  
 1   date_time  132480 non-null  object 
 2   전력사용량      122400 non-null  float64
 3   기온         125760 non-null  float64
 4   풍속         125760 non-null  float64
 5   습도         125760 non-null  float64
 6   강수량        124080 non-null  float64
 7   일조         125760 non-null  float64
 8   비전기냉방설비운영  124696 non-null  float64
 9   태양광보유      124024 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 10.1+ MB


In [178]:
# 데이터타입에 따른 변수 선별


Unnamed: 0,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유
0,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
132475,,,,,,,,
132476,,,,,,,,
132477,,27.9,4.1,68.0,,0.0,1.0,1.0
132478,,,,,,,,


### apply 함수로 컬럼에 함수 적용
> 익명 함수인 lambda 와 apply 함수를 사용하여 인자로 받는 모든 데이터에 함수를 적용  
커스텀 함수 적용도 가능

> apply 함수로 컬럼에 적용시키는 코드 구조  
 **`df['컬럼명']`** = **`df['컬럼명']`**.**`apply(lambda x: func(x) if 조건문 else x)`**  
 **`df['컬럼명']`** = **`df['컬럼명']`**.**`apply(함수명)`**

In [198]:
# Wh 컬럼 추가


### 날짜 형식 데이터의 활용

In [227]:
# 문자형식 데이터를 날짜형식으로 형변환


In [231]:
# datetime 데이터 타입의 요일정보를 변수로 추가


### 데이터 범주화
간혹 연속형 데이터를 범주형 데이터(categorical data)로 범주화해야 할 경우가 있습니다.  
범주화를 위한 함수를 알아보겠습니다.

0          저온
1          저온
2          저온
3          저온
4          저온
         ... 
132475    NaN
132476    NaN
132477     고온
132478    NaN
132479    NaN
Name: 기온, Length: 132480, dtype: category
Categories (2, object): ['저온' < '고온']

## 결측치 처리
> 데이터셋 내에 빈 값(결측치)이 있으면 데이터 분석에 방해가 될 여지가 많습니다.  
모든 결측치를 없애야 하는 것은 아니지만, 되도록이면 결측치를 채우거나 제거하는 방법 등으로 결측치를 처리합니다.  
몇 가지 예시를 살펴보면서 결측치 처리에 대해 알아봅시다.

In [232]:
# info() 함수는 결측치에 대한 정보도 보여줍니다.
# 컬럼별 isnull() 함수를 사용해도 무방합니다.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132480 entries, 0 to 132479
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   num        132480 non-null  int64         
 1   date_time  132480 non-null  datetime64[ns]
 2   전력사용량      122400 non-null  float64       
 3   기온         125760 non-null  float64       
 4   풍속         125760 non-null  float64       
 5   습도         125760 non-null  float64       
 6   강수량        124080 non-null  float64       
 7   일조         125760 non-null  float64       
 8   비전기냉방설비운영  124696 non-null  float64       
 9   태양광보유      124024 non-null  float64       
 10  전력사용량(Wh)  122400 non-null  float64       
dtypes: datetime64[ns](1), float64(9), int64(1)
memory usage: 11.1 MB


확인 결과 num, date_time 변수를 제외한 다른 변수에 결측치가 존재합니다.  
해당 컬럼의 결측치 샘플들을 살펴보고 결측치를 처리해 보겠습니다.

In [233]:
# 컬럼별 결측치 확인을 위한 isnull() 함수는 결과를 bool 형태로 반환하므로 조건부 샘플링이 가능합니다.


Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량(Wh)
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
132475,False,False,True,True,True,True,True,True,True,True,True
132476,False,False,True,True,True,True,True,True,True,True,True
132477,False,False,True,False,False,False,True,False,False,False,True
132478,False,False,True,True,True,True,True,True,True,True,True


### 결측치 비율 계산

In [235]:
# 컬럼명을 순환하면서 결측치 비율 계산


num 0.0
date_time 0.0
전력사용량 0.08235294117647059
기온 0.054901960784313725
풍속 0.054901960784313725
습도 0.054901960784313725
강수량 0.06862745098039216
일조 0.054901960784313725
비전기냉방설비운영 0.06359477124183006
태양광보유 0.06908496732026144
전력사용량(Wh) 0.08235294117647059


In [239]:
# fillna() 함수로 NaN 값을 컬럼의 평균으로 채우기


0         17.60000
1         17.70000
2         17.50000
3         17.10000
4         17.00000
            ...   
132475    24.34667
132476    24.34667
132477    27.90000
132478    24.34667
132479    24.34667
Name: 기온, Length: 132480, dtype: float64

### 결측치 제거

In [240]:
# 결측치 제거는 마지막에 합니다.


Unnamed: 0,num,date_time,전력사용량,기온,풍속,습도,강수량,일조,비전기냉방설비운영,태양광보유,전력사용량(Wh)
0,1,2020-06-01 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,8179056.0
1,1,2020-06-01 01:00:00,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,8135640.0
2,1,2020-06-01 02:00:00,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,8107128.0
3,1,2020-06-01 03:00:00,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,8048808.0
4,1,2020-06-01 04:00:00,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,8043624.0
...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19:00:00,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0,4114368.0
122396,60,2020-08-24 20:00:00,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0,3975696.0
122397,60,2020-08-24 21:00:00,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0,3572208.0
122398,60,2020-08-24 22:00:00,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0,3299184.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 122400 entries, 0 to 122399
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   num        122400 non-null  int64         
 1   date_time  122400 non-null  datetime64[ns]
 2   전력사용량      122400 non-null  float64       
 3   기온         122400 non-null  float64       
 4   풍속         122400 non-null  float64       
 5   습도         122400 non-null  float64       
 6   강수량        122400 non-null  float64       
 7   일조         122400 non-null  float64       
 8   비전기냉방설비운영  122400 non-null  float64       
 9   태양광보유      122400 non-null  float64       
 10  전력사용량(Wh)  122400 non-null  float64       
dtypes: datetime64[ns](1), float64(9), int64(1)
memory usage: 11.2 MB


이제 모든 컬럼에서 결측치가 사라졌습니다. 이후 분석은 시각화를 통해 진행해보겠습니다.