# Pandas Review
- 데이터를 분석하거나, AI 작업의 전 단계로 파일을 전처리하는 등의 처리를 하는 데 필요한 파이썬 라이브러리
- 판다스는 그저 라이브러리일 뿐, 의사결정을 하거나 분석을 하는 것은 사람이다.
  데이터를 볼 수 있는 시각을 갖추는 것이 더 중요하다.
  (이클립스 버전이 아무리 좋아도 프로그래밍 능력이 없으면 소용없는 것과 마찬가지다.)
- 판다스의 데이터 구조는
  (1)Series(#파이썬의 list를 활용하여 만든다)
  (2)DataFrame(#파이썬의 dictionary를 활용하여 만든다.)
  이다.
- 요즘은 전처리 작업을 엑셀로 하는 일본 기업도 많습니다.

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Create a Series with pd.Series(data): values 0..4, explicit index labels 1, 3, 5, 7, 9, stored as float32.
pd.Series(list(range(5)), index=list(range(1,10,2)),dtype=np.float32)

1    0.0
3    1.0
5    2.0
7    3.0
9    4.0
dtype: float32

In [6]:
# Create a DataFrame with pd.DataFrame(data): a single column named 'name' and row labels 'a'..'e'.
pd.DataFrame([5,10,15,20,25], index=['a','b','c','d','e'],columns=['name'])

Unnamed: 0,name
a,5
b,10
c,15
d,20
e,25


In [8]:
# When a value is missing, mark it explicitly with np.nan; the result is stored as float64 (see output).
pd.Series([1,3,5,7, np.nan, 11,13])

0     1.0
1     3.0
2     5.0
3     7.0
4     NaN
5    11.0
6    13.0
dtype: float64

In [11]:
# Time series: build a DatetimeIndex of 6 consecutive days starting at 2019-01-01.
dates = pd.date_range('20190101',periods=6)
dates
# Aside: stock prices and weather are typical examples of time-series data.

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# np.random.randn(6,4) produces a 6x4 array (24 values) of random numbers drawn from the standard normal distribution;
# the rows are labelled with `dates` and the columns with 'A'..'D'.
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-03,-0.991184,0.502055,1.651753,-0.177969
2019-01-04,1.287825,-0.528829,-1.373616,0.201949
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-06,0.484919,1.588221,0.397736,1.103162


In [18]:
# Enum-like (categorical) data: used when the set of possible values is limited,
# e.g. sex (male/female) or day of the week (Mon..Sun). Column 'E' below uses pd.Categorical.

df2 = pd.DataFrame({
        'A' : 10.5,
        'B' : pd.Timestamp('20190101'),  # a date value
        'C' : pd.Series(5, index=list(range(0,4)),dtype=np.float32),
        'D' : np.array([3]*4, dtype=np.int32),
        'E' : pd.Categorical(['aaa','bbb','ccc','ddd']),
        'F' : 'Python'
})
df2
# There are 4 rows because the sized inputs ('C', 'D', 'E') each hold 4 entries;
# the scalar inputs ('A', 'B', 'F') are repeated on every row.
# Aside: Google Trends shows the relative interest in programming languages; market share can be checked on the TIOBE index.

Unnamed: 0,A,B,C,D,E,F
0,10.5,2019-01-01,5.0,3,aaa,Python
1,10.5,2019-01-01,5.0,3,bbb,Python
2,10.5,2019-01-01,5.0,3,ccc,Python
3,10.5,2019-01-01,5.0,3,ddd,Python


In [19]:
df2.dtypes  # Show the dtype of each column of df2.

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Pandas의 주요속성 및 메소드
- index
- columns
- values
- T
- .head(개수) / .tail(개수)
- .describe() : 데이터 프레임의 통계정보 출력
- .sort_index() : 행 이름(axis=0) 또는 열 이름(axis=1)을 기준으로 정렬
- .sort_values(by=칼럼명)

In [21]:
df.describe()  # Summary statistics per column: count, mean, std, min, quartiles and max.

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.42897,0.15332,0.13604,0.279093
std,0.946265,1.160014,1.011994,0.494741
min,-0.991184,-1.083182,-1.373616,-0.177969
25%,-0.212039,-0.812497,-0.305736,-0.095048
50%,0.773302,-0.013387,0.208296,0.153882
75%,1.146621,1.137045,0.501017,0.503189
max,1.287825,1.588221,1.651753,1.103162


In [22]:
df  # Display df again just to recall its current values (no other purpose).

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-03,-0.991184,0.502055,1.651753,-0.177969
2019-01-04,1.287825,-0.528829,-1.373616,0.201949
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-06,0.484919,1.588221,0.397736,1.103162


In [24]:
df.sort_values(by='A')  # Sort the rows by column 'A'; ascending order is the default.

Unnamed: 0,A,B,C,D
2019-01-03,-0.991184,0.502055,1.651753,-0.177969
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-06,0.484919,1.588221,0.397736,1.103162
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-04,1.287825,-0.528829,-1.373616,0.201949


In [26]:
# Reorder the columns (axis=1) by their values in the row labelled '2019-01-02';
# ascending=False sorts them in descending order.
df.sort_values(axis=1, by='2019-01-02', ascending=False)

Unnamed: 0,C,D,A,B
2019-01-01,-0.413933,-0.162002,1.061684,-0.907053
2019-01-02,0.535444,0.105815,-0.444358,-1.083182
2019-01-03,1.651753,-0.177969,-0.991184,0.502055
2019-01-04,-1.373616,0.201949,1.287825,-0.528829
2019-01-05,0.018856,0.603602,1.174933,1.348709
2019-01-06,0.397736,1.103162,0.484919,1.588221


# 데이터 selection

## 1)이름을 이용한 Selection

In [29]:
df

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-03,-0.991184,0.502055,1.651753,-0.177969
2019-01-04,1.287825,-0.528829,-1.373616,0.201949
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-06,0.484919,1.588221,0.397736,1.103162


In [31]:
df['A']  # Select column 'A' by name; the output shows its values, index and dtype.

2019-01-01    1.061684
2019-01-02   -0.444358
2019-01-03   -0.991184
2019-01-04    1.287825
2019-01-05    1.174933
2019-01-06    0.484919
Freq: D, Name: A, dtype: float64

In [33]:
type(df['A'])  # A single selected column is a pandas Series.

pandas.core.series.Series

## 2) Slicing을 이용한 선택

In [34]:
df[0:3]  # Positional row slice: rows 0, 1 and 2 (the end position is excluded).

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-03,-0.991184,0.502055,1.651753,-0.177969


In [36]:
# Row slice by index label from '2019-01-01' through '2019-01-03'; unlike positional slicing, the end label is included.
# The result is the same as df[0:3].
df['2019-01-01':'2019-01-03']

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-03,-0.991184,0.502055,1.651753,-0.177969


## 3).loc를 이용한 선택

In [38]:
df.loc[dates[0]]  # The row labelled dates[0] (2019-01-01), returned as a Series over the columns.

A    1.061684
B   -0.907053
C   -0.413933
D   -0.162002
Name: 2019-01-01 00:00:00, dtype: float64

In [40]:
df.loc[:, ['A', 'C']]  # All rows (`:`), but only columns 'A' and 'C'.

Unnamed: 0,A,C
2019-01-01,1.061684,-0.413933
2019-01-02,-0.444358,0.535444
2019-01-03,-0.991184,1.651753
2019-01-04,1.287825,-1.373616
2019-01-05,1.174933,0.018856
2019-01-06,0.484919,0.397736


In [41]:
# Rows labelled 2019-01-04 through 2019-01-06 (inclusive), columns 'A' and 'C' only.
df.loc['2019-01-04':'2019-01-06', ['A', 'C']]

Unnamed: 0,A,C
2019-01-04,1.287825,-1.373616
2019-01-05,1.174933,0.018856
2019-01-06,0.484919,0.397736


In [43]:
df.loc[dates[0], ['A', 'C']]  # Columns 'A' and 'C' of the single row labelled dates[0].

A    1.061684
C   -0.413933
Name: 2019-01-01 00:00:00, dtype: float64

## 4).iloc를 이용한 선택

In [50]:
df.iloc[[1,2,4],[0,2]]  # Select by integer position: rows 1, 2, 4 and columns 0, 2.

Unnamed: 0,A,C
2019-01-02,-0.444358,0.535444
2019-01-03,-0.991184,1.651753
2019-01-05,1.174933,0.018856


## 5).at를 이용한 선택

In [47]:
df.at[dates[0], 'A']  # .at reads a single scalar by row/column label; it does not accept lists as indexers.

1.0616844873809956

## 6) 조건을 이용한 선택 : 불리언 인덱싱(참인 값을 찾기위해 사용)

In [56]:
df

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-03,-0.991184,0.502055,1.651753,-0.177969
2019-01-04,1.287825,-0.528829,-1.373616,0.201949
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-06,0.484919,1.588221,0.397736,1.103162


In [52]:
df[df>0]  # Keep only the values greater than 0; every other cell is shown as NaN.

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,,,
2019-01-02,,,0.535444,0.105815
2019-01-03,,0.502055,1.651753,
2019-01-04,1.287825,,,0.201949
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-06,0.484919,1.588221,0.397736,1.103162


In [55]:
df[df.A > 1]  # Rows where the value in column 'A' is greater than 1.

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-04,1.287825,-0.528829,-1.373616,0.201949
2019-01-05,1.174933,1.348709,0.018856,0.603602


In [59]:
df[df.B > 1]  # Rows where the value in column 'B' is greater than 1.

Unnamed: 0,A,B,C,D
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-06,0.484919,1.588221,0.397736,1.103162


In [61]:
df2 = df.copy()  # .copy() makes a copy of df; note that this rebinds the name df2 used earlier.
df2

Unnamed: 0,A,B,C,D
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002
2019-01-02,-0.444358,-1.083182,0.535444,0.105815
2019-01-03,-0.991184,0.502055,1.651753,-0.177969
2019-01-04,1.287825,-0.528829,-1.373616,0.201949
2019-01-05,1.174933,1.348709,0.018856,0.603602
2019-01-06,0.484919,1.588221,0.397736,1.103162


In [63]:
# Add a new column 'E' holding one string label per row.
df2['E'] = ['One','Two','Three','One','Two','Three']

In [64]:
df2

Unnamed: 0,A,B,C,D,E
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002,One
2019-01-02,-0.444358,-1.083182,0.535444,0.105815,Two
2019-01-03,-0.991184,0.502055,1.651753,-0.177969,Three
2019-01-04,1.287825,-0.528829,-1.373616,0.201949,One
2019-01-05,1.174933,1.348709,0.018856,0.603602,Two
2019-01-06,0.484919,1.588221,0.397736,1.103162,Three


In [68]:
df2['E'].isin(['Two'])  # For each row, is the value in 'E' equal to 'Two'? Gives a boolean Series.

2019-01-01    False
2019-01-02     True
2019-01-03    False
2019-01-04    False
2019-01-05     True
2019-01-06    False
Freq: D, Name: E, dtype: bool

In [71]:
df2[df2['E'].isin(['One','Two'])]  # Keep only the rows whose 'E' value is 'One' or 'Two'.

Unnamed: 0,A,B,C,D,E
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002,One
2019-01-02,-0.444358,-1.083182,0.535444,0.105815,Two
2019-01-04,1.287825,-0.528829,-1.373616,0.201949,One
2019-01-05,1.174933,1.348709,0.018856,0.603602,Two


### Data 변경

In [79]:
# A Series indexed by 6 consecutive days starting at 2019-01-02.
mySeries = pd.Series([1,2,3,4,5,6],index=pd.date_range('20190102',periods=6))
mySeries

2019-01-02    1
2019-01-03    2
2019-01-04    3
2019-01-05    4
2019-01-06    5
2019-01-07    6
Freq: D, dtype: int64

In [82]:
# The assignment aligns on the index: 2019-01-01 has no entry in mySeries and becomes NaN,
# and 2019-01-07 does not appear because it is not in df2's index (see output).
df2['F'] = mySeries
df2 
# Real data sets can contain hundreds of thousands of cells and almost always include NaN (missing) values like this one.

Unnamed: 0,A,B,C,D,E,F
2019-01-01,1.061684,-0.907053,-0.413933,-0.162002,One,
2019-01-02,-0.444358,-1.083182,0.535444,0.105815,Two,1.0
2019-01-03,-0.991184,0.502055,1.651753,-0.177969,Three,2.0
2019-01-04,1.287825,-0.528829,-1.373616,0.201949,One,3.0
2019-01-05,1.174933,1.348709,0.018856,0.603602,Two,4.0
2019-01-06,0.484919,1.588221,0.397736,1.103162,Three,5.0


### 결측치 처리하기
- 결측치란 : 어떠한 이유든지 우리가 가진 데이터가 전부 측정되지 못하고 존재하지 않는 경우가 발생할 수 있다.
           이러한 데이터들을 결측치라고 한다.
- np.nan이라는 상수로 결측치를 나타낼 수 있다.
- fillna() : NaN이면 ( )내의 값으로 바꿔준다. .fillna(0)는 NaN을 0으로 채운다는 의미가 된다.  
- dropna() : 결측치를 버린다. 결측치를 0으로 채워도 결과에 영향을 주기 때문에, 아예 제거하고 싶을 때 사용한다.
- isna() : 결측치이냐? 라고 묻는 명령문.

In [90]:
df2.dropna(how='any')  # Drop every row that contains at least one missing value.

Unnamed: 0,A,B,C,D,E,F
2019-01-02,-0.444358,-1.083182,0.535444,0.105815,Two,1.0
2019-01-03,-0.991184,0.502055,1.651753,-0.177969,Three,2.0
2019-01-04,1.287825,-0.528829,-1.373616,0.201949,One,3.0
2019-01-05,1.174933,1.348709,0.018856,0.603602,Two,4.0
2019-01-06,0.484919,1.588221,0.397736,1.103162,Three,5.0


In [98]:
# Deliberately create more missing values for practice: reindex df to its first three
# dates and to a column 'F' that df does not have, so every cell in the result is NaN.
df1 = df.reindex(index=dates[0:3], columns=['F'])
df1

Unnamed: 0,F
2019-01-01,
2019-01-02,
2019-01-03,


In [99]:
# Optional (left disabled): df1.loc[dates[0]:dates[1], 'F'] = 1
df1.fillna(0)  # .fillna(0): replace NaN with 0 in the displayed result; df1 is not reassigned here.

Unnamed: 0,F
2019-01-01,0.0
2019-01-02,0.0
2019-01-03,0.0


In [100]:
# vlookup 함수 피벗테이블