# Python for Data Analysis - part5

##### Python의 numpy, pandas 등을 정리하였으며 파이썬 라이브러리를 활용한 데이터분석(2판)을 참고하여 작성하였습니다.
##### 해당 자료는 python 3.6 기반으로 작성되었습니다.

## 5. pandas 시작하기
#### - pandas는 numpy의 스타일을 많이 차용했지만 가장 큰 차이점은 pandas는 표 형식의 데이터나 다양한 형태의 데이터를 다루는 데 초점을 맞춤
### 5.1 pandas 자료구조 소개
#### 5.1.1 Series

In [8]:
import pandas as pd
import numpy as np

RULE = '--------------------------------------------'

# A Series created without an explicit index gets the default integer index.
obj = pd.Series([1, 2, 3, 4])
print(obj, RULE, sep='\n')

# .values holds the data and .index the labels (both are attributes, not methods).
for attribute in (obj.values, obj.index):
    print(attribute, RULE, sep='\n')

# Passing `index` gives every value an explicit label.
obj2 = pd.Series([1, 2, 3, 4], index=list('abcd'))
print(obj2, RULE, sep='\n')
print(obj2.index, RULE, sep='\n')

# A label reads or assigns a single value.
print(obj2['a'], RULE, sep='\n')
obj2['c'] = 7
print(obj2, RULE, sep='\n')

# A list of labels or a boolean mask selects several values; arithmetic and
# NumPy functions keep the index attached to the result.
for result in (obj2[['c', 'd', 'a']], obj2[obj2 > 2], obj2 * 2, np.exp(obj2)):
    print(result, RULE, sep='\n')

# A Series can be thought of as a fixed-length, ordered dict: `in` tests the labels.
print('b' in obj2, RULE, sep='\n')

0    1
1    2
2    3
3    4
dtype: int64
--------------------------------------------
[1 2 3 4]
--------------------------------------------
RangeIndex(start=0, stop=4, step=1)
--------------------------------------------
a    1
b    2
c    3
d    4
dtype: int64
--------------------------------------------
Index(['a', 'b', 'c', 'd'], dtype='object')
--------------------------------------------
1
--------------------------------------------
a    1
b    2
c    7
d    4
dtype: int64
--------------------------------------------
c    7
d    4
a    1
dtype: int64
--------------------------------------------
c    7
d    4
dtype: int64
--------------------------------------------
a     2
b     4
c    14
d     8
dtype: int64
--------------------------------------------
a       2.718282
b       7.389056
c    1096.633158
d      54.598150
dtype: float64
--------------------------------------------
True
--------------------------------------------


In [17]:
RULE = '--------------------------------------------'

# A dict becomes a Series whose index is the dict's keys.
sdata = {
    'ohio': 35000,
    'texas': 71000,
    'oregon': 16000,
    'utah': 5000,
}
obj3 = pd.Series(sdata)
print(obj3, RULE, sep='\n')

# With an explicit index the values are looked up by label; a label that is
# not a key of the dict ('california') ends up as NaN.
states = ['california', 'ohio', 'oregon', 'texas']
obj4 = pd.Series(sdata, index=states)
print(obj4, RULE, sep='\n')

# pd.isnull() / pd.notnull() flag missing / non-missing entries;
# the same check is available as a Series method.
for mask in (pd.isnull(obj4), pd.notnull(obj4), obj4.isnull()):
    print(mask, RULE, sep='\n')

# Arithmetic between Series aligns the operands on their index labels.
print(obj3 + obj4, RULE, sep='\n')

# Both the Series and its index carry a `name` attribute.
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4, RULE, sep='\n')

# The index can be replaced in place by assignment.
# NOTE(review): `obj` comes from an earlier cell and must hold four values.
obj.index = ['bob', 'steve', 'jeff', 'ryan']
print(obj, RULE, sep='\n')

ohio      35000
texas     71000
oregon    16000
utah       5000
dtype: int64
--------------------------------------------
california        NaN
ohio          35000.0
oregon        16000.0
texas         71000.0
dtype: float64
--------------------------------------------
california     True
ohio          False
oregon        False
texas         False
dtype: bool
--------------------------------------------
california    False
ohio           True
oregon         True
texas          True
dtype: bool
--------------------------------------------
california     True
ohio          False
oregon        False
texas         False
dtype: bool
--------------------------------------------
california         NaN
ohio           70000.0
oregon         32000.0
texas         142000.0
utah               NaN
dtype: float64
--------------------------------------------
state
california        NaN
ohio          35000.0
oregon        16000.0
texas         71000.0
Name: population, dtype: float64
-----------------

#### 5.1.2 DataFrame
#### - 색인의 모양이 같은 Series 객체의 집합

In [20]:
RULE = '--------------------------------------------'

# A dict of equal-length lists: every key becomes a column.
data = {
    'state': ['ohio', 'ohio', 'ohio', 'nevada', 'nevada', 'nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

# Whole frame, then head(), then a frame built with an explicit column order.
for view in (
    frame,
    frame.head(),
    pd.DataFrame(data, columns=['year', 'state', 'pop']),
):
    print(view, RULE, sep='\n')

    state  year  pop
0    ohio  2000  1.5
1    ohio  2001  1.7
2    ohio  2002  3.6
3  nevada  2001  2.4
4  nevada  2002  2.9
5  nevada  2003  3.2
--------------------------------------------
    state  year  pop
0    ohio  2000  1.5
1    ohio  2001  1.7
2    ohio  2002  3.6
3  nevada  2001  2.4
4  nevada  2002  2.9
--------------------------------------------
   year   state  pop
0  2000    ohio  1.5
1  2001    ohio  1.7
2  2002    ohio  3.6
3  2001  nevada  2.4
4  2002  nevada  2.9
5  2003  nevada  3.2
--------------------------------------------


In [24]:
RULE = '--------------------------------------------'

# 'debt' is requested as a column but is not supplied by `data`,
# so it is created and filled with missing values.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
print(frame2, RULE, sep='\n')
print(frame2.columns, RULE, sep='\n')

# Column access: dict-style key or attribute.
for column in (frame2['state'], frame2.year):
    print(column, RULE, sep='\n')

# Row access by index label.
print(frame2.loc['three'], RULE, sep='\n')

# A column can be overwritten with a scalar (broadcast to every row)
# or with an array of matching length.
for new_debt in (16.5, np.arange(6)):
    frame2['debt'] = new_debt
    print(frame2, RULE, sep='\n')

       year   state  pop debt
one    2000    ohio  1.5  NaN
two    2001    ohio  1.7  NaN
three  2002    ohio  3.6  NaN
four   2001  nevada  2.4  NaN
five   2002  nevada  2.9  NaN
six    2003  nevada  3.2  NaN
--------------------------------------------
Index(['year', 'state', 'pop', 'debt'], dtype='object')
--------------------------------------------
one        ohio
two        ohio
three      ohio
four     nevada
five     nevada
six      nevada
Name: state, dtype: object
--------------------------------------------
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
--------------------------------------------
year     2002
state    ohio
pop       3.6
debt      NaN
Name: three, dtype: object
--------------------------------------------
       year   state  pop  debt
one    2000    ohio  1.5  16.5
two    2001    ohio  1.7  16.5
three  2002    ohio  3.6  16.5
four   2001  nevada  2.4  16.5
five   2002  nevada  2.9  16.5
six    2

In [27]:
RULE = '--------------------------------------------'

# Assigning a Series to a column aligns it on the DataFrame's index labels.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2, RULE, sep='\n')

# Assigning to a key that does not exist yet creates a new column.
frame2['eastern'] = frame2['state'].eq('ohio')
print(frame2, RULE, sep='\n')

# del removes a column.
del frame2['eastern']
print(frame2.columns, RULE, sep='\n')

       year   state  pop  debt  eastern
one    2000    ohio  1.5   NaN     True
two    2001    ohio  1.7  -1.2     True
three  2002    ohio  3.6   NaN     True
four   2001  nevada  2.4  -1.5    False
five   2002  nevada  2.9  -1.7    False
six    2003  nevada  3.2   NaN    False
--------------------------------------------
       year   state  pop  debt  eastern
one    2000    ohio  1.5   NaN     True
two    2001    ohio  1.7  -1.2     True
three  2002    ohio  3.6   NaN     True
four   2001  nevada  2.4  -1.5    False
five   2002  nevada  2.9  -1.7    False
six    2003  nevada  3.2   NaN    False
--------------------------------------------
Index(['year', 'state', 'pop', 'debt'], dtype='object')
--------------------------------------------


#### - index를 통해 얻은 컬럼은 내부 데이터에 대한 뷰이다. 그렇기에 복사가 이루어지지 않고 원본데이터의 변경이 이루어진다. 
#### - 복사를 위해서는 copy 메서드를 활용해야 한다. 

In [31]:
RULE = '--------------------------------------------'

# Nested dict: outer keys become the columns, inner keys the row index.
pop = {
    'nevada': {2001: 2.4, 2002: 2.9},
    'ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)
print(frame3, RULE, sep='\n')

# .T is the transpose (rows and columns swapped).
print(frame3.T, RULE, sep='\n')

# An explicit index decides which rows appear.
print(pd.DataFrame(pop, index=[2001, 2002, 2003]), RULE, sep='\n')

# .values returns the contents as a two-dimensional array.
print(frame3.values, RULE, sep='\n')

      nevada  ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
--------------------------------------------
        2001  2002  2000
nevada   2.4   2.9   NaN
ohio     1.7   3.6   1.5
--------------------------------------------
      nevada  ohio
2001     2.4   1.7
2002     2.9   3.6
2003     NaN   NaN
--------------------------------------------
[[2.4 1.7]
 [2.9 3.6]
 [nan 1.5]]
--------------------------------------------


#### - dataframe 생성을 위한 입력 데이터 종류 : 2차원 ndarray, 배열/리스트/튜플의 사전, numpy 구조화 배열, series 사전, 사전의 사전

#### 5.1.3 색인 객체

In [37]:
RULE = '--------------------------------------------'

obj = pd.Series(range(3), index=list('abc'))
index = obj.index
for shown in (index, index[1:]):
    print(shown, RULE, sep='\n')

# Index objects are immutable: `index[1] = 'd'` raises a TypeError.

# An Index built once can be handed to a new Series.
labels = pd.Index(np.arange(3))
print(labels, RULE, sep='\n')

obj2 = pd.Series([1.5, -2.5, 0], index=labels)
print(obj2, RULE, sep='\n')

# Identity check: is the Series' index the very object that was passed in?
print(obj2.index is labels, RULE, sep='\n')

# A pandas Index may contain duplicate labels.
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
print(dup_labels, RULE, sep='\n')

Index(['a', 'b', 'c'], dtype='object')
--------------------------------------------
Index(['b', 'c'], dtype='object')
--------------------------------------------
Int64Index([0, 1, 2], dtype='int64')
--------------------------------------------
0    1.5
1   -2.5
2    0.0
dtype: float64
--------------------------------------------
True
--------------------------------------------
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')
--------------------------------------------


#### - index 메서드와 속성 : append(추가), difference(차집합), intersection(교집합), union(합집합), isin(존재 여부), delete(삭제), unique(유일한 값), is_monotonic(단조성 여부), is_unique(중복 색인 여부)

### 5.2 핵심 기능
#### 5.2.1 재색인

In [50]:
RULE = '--------------------------------------------'

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=list('dbac'))
print(obj, RULE, sep='\n')

# reindex builds a new object that follows the given labels;
# a label the original does not have ('e') is filled with NaN.
obj2 = obj.reindex(list('abcde'))
print(obj2, RULE, sep='\n')

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3, RULE, sep='\n')

# method='ffill' fills each missing entry with the value just before it.
print(obj3.reindex(range(6), method='ffill'), RULE, sep='\n')

# reindex can change rows, columns, or both; a bare sequence reindexes the rows.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acb'),
    columns=['ohio', 'texas', 'california'],
)
print(frame, RULE, sep='\n')

frame2 = frame.reindex(list('abcd'))
print(frame2, RULE, sep='\n')

# The `columns` keyword reindexes the columns instead.
states = ['texas', 'utah', 'california']
print(frame.reindex(columns=states), RULE, sep='\n')

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
--------------------------------------------
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
--------------------------------------------
0      blue
2    purple
4    yellow
dtype: object
--------------------------------------------
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
--------------------------------------------
   ohio  texas  california
a     0      1           2
c     3      4           5
b     6      7           8
--------------------------------------------
   ohio  texas  california
a   0.0    1.0         2.0
b   6.0    7.0         8.0
c   3.0    4.0         5.0
d   NaN    NaN         NaN
--------------------------------------------
   texas  utah  california
a      1   NaN           2
c      4   NaN           5
b      7   NaN           8
--------------------------------------------


#### 5.2.2 하나의 로우나 컬럼 삭제하기

In [55]:
RULE = '--------------------------------------------'

obj = pd.Series(np.arange(5.), index=list('abcde'))
print(obj, RULE, sep='\n')

# drop returns a new object without the given label(s); `obj` itself is unchanged.
new_obj = obj.drop('c')
print(new_obj, RULE, sep='\n')

# Several labels can be dropped at once by passing a list.
print(obj.drop(['d', 'c']), RULE, sep='\n')

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
--------------------------------------------
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
--------------------------------------------
a    0.0
b    1.0
e    4.0
dtype: float64
--------------------------------------------


In [57]:
RULE = '--------------------------------------------'

data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['ohio', 'colorado', 'utah', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
print(data, RULE, sep='\n')

# drop works on axis=0 by default, i.e. it removes rows.
print(data.drop(['colorado', 'ohio']), RULE, sep='\n')

# axis=1 (or axis='columns') removes columns instead.
for trimmed in (
    data.drop('two', axis=1),
    data.drop(['two', 'four'], axis='columns'),
):
    print(trimmed, RULE, sep='\n')

          one  two  three  four
ohio        0    1      2     3
colorado    4    5      6     7
utah        8    9     10    11
new york   12   13     14    15
--------------------------------------------
          one  two  three  four
utah        8    9     10    11
new york   12   13     14    15
--------------------------------------------
          one  three  four
ohio        0      2     3
colorado    4      6     7
utah        8     10    11
new york   12     14    15
--------------------------------------------
          one  three
ohio        0      2
colorado    4      6
utah        8     10
new york   12     14
--------------------------------------------


#### 5.2.3 색인하기, 선택하기 , 거르기

In [59]:
RULE = '--------------------------------------------'

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj, RULE, sep='\n')

# Selection by label.
print(obj['b'], RULE, sep='\n')

# Selection by integer position. On a Series with a non-integer index, plain
# `obj[1]` only works through a positional fallback that pandas has
# deprecated, so the position is requested explicitly with .iloc.
print(obj.iloc[1], RULE, sep='\n')

# An integer slice is positional and excludes the end point.
print(obj[2:4], RULE, sep='\n')

# A list of labels selects several values in the given order.
print(obj[['b', 'a', 'd']], RULE, sep='\n')

# A list of integer positions, again through .iloc rather than `obj[[1, 3]]`.
print(obj.iloc[[1, 3]], RULE, sep='\n')

# A boolean mask keeps the entries where the condition holds.
print(obj[obj < 2], RULE, sep='\n')

# Slicing with labels includes both the start and the end point.
print(obj['b':'c'], RULE, sep='\n')

# Assigning to a label slice modifies the Series in place.
obj['b':'c'] = 10
print(obj, RULE, sep='\n')

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
--------------------------------------------
1.0
--------------------------------------------
1.0
--------------------------------------------
c    2.0
d    3.0
dtype: float64
--------------------------------------------
b    1.0
a    0.0
d    3.0
dtype: float64
--------------------------------------------
b    1.0
d    3.0
dtype: float64
--------------------------------------------
a    0.0
b    1.0
dtype: float64
--------------------------------------------
b    1.0
c    2.0
dtype: float64
--------------------------------------------
a     0.0
b    10.0
c    10.0
d     3.0
dtype: float64
--------------------------------------------


In [61]:
RULE = '--------------------------------------------'

data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['ohio', 'colorado', 'utah', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
print(data, RULE, sep='\n')

# In order: one column by name, several columns by a list of names,
# a row slice, and rows filtered by a boolean condition on a column.
for selection in (
    data['two'],
    data[['three', 'one']],
    data[:2],
    data[data['three'] > 5],
):
    print(selection, RULE, sep='\n')

          one  two  three  four
ohio        0    1      2     3
colorado    4    5      6     7
utah        8    9     10    11
new york   12   13     14    15
--------------------------------------------
ohio         1
colorado     5
utah         9
new york    13
Name: two, dtype: int64
--------------------------------------------
          three  one
ohio          2    0
colorado      6    4
utah         10    8
new york     14   12
--------------------------------------------
          one  two  three  four
ohio        0    1      2     3
colorado    4    5      6     7
--------------------------------------------
          one  two  three  four
colorado    4    5      6     7
utah        8    9     10    11
new york   12   13     14    15
--------------------------------------------


In [64]:
RULE = '--------------------------------------------'

# loc selects by label, iloc by integer position.
for selection in (
    data.loc['colorado', ['two', 'three']],  # one row label, two column labels
    data.iloc[2, [3, 0, 1]],                 # one row position, three column positions
    data.iloc[2],                            # a whole row by position
    data.iloc[[1, 2], [3, 0, 1]],            # several rows and columns by position
    data.loc[:'utah', 'two'],                # label slice up to and including 'utah'
    data.iloc[:, :3][data.three > 5],        # positional column slice, then a boolean row filter
):
    print(selection, RULE, sep='\n')

two      5
three    6
Name: colorado, dtype: int64
--------------------------------------------
four    11
one      8
two      9
Name: utah, dtype: int64
--------------------------------------------
one       8
two       9
three    10
four     11
Name: utah, dtype: int64
--------------------------------------------
          four  one  two
colorado     7    4    5
utah        11    8    9
--------------------------------------------
ohio        1
colorado    5
utah        9
Name: two, dtype: int64
--------------------------------------------
          one  two  three
colorado    4    5      6
utah        8    9     10
new york   12   13     14
--------------------------------------------


#### 5.2.4 정수 색인

In [66]:
RULE = '--------------------------------------------'

# With an integer index a bare integer is treated as a label, so ser[-1]
# would be ambiguous; a non-integer index has no such ambiguity.
ser = pd.Series(np.arange(3.))
print(ser, RULE, sep='\n')

ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
# Last element by position. Plain `ser2[-1]` depends on a positional fallback
# that pandas has deprecated, so .iloc states the intent explicitly.
print(ser2.iloc[-1], RULE, sep='\n')

# Plain slicing is positional (end excluded) ...
print(ser[:1], RULE, sep='\n')

# ... .loc slices by label (end included) ...
print(ser.loc[:1], RULE, sep='\n')

# ... and .iloc slices by position (end excluded).
print(ser.iloc[:1], RULE, sep='\n')

0    0.0
1    1.0
2    2.0
dtype: float64
--------------------------------------------
2.0
--------------------------------------------
0    0.0
dtype: float64
--------------------------------------------
0    0.0
1    1.0
dtype: float64
--------------------------------------------
0    0.0
dtype: float64
--------------------------------------------


#### 5.2.5 산술 연산과 데이터 정렬

In [68]:
RULE = '--------------------------------------------'

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))

# Adding aligns on the labels; a label found in only one operand gives NaN.
for operand in (s1, s2, s1 + s2):
    print(operand, RULE, sep='\n')

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
--------------------------------------------
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
--------------------------------------------
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
--------------------------------------------


In [70]:
RULE = '--------------------------------------------'

df1 = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    columns=list('bcd'),
    index=['ohio', 'texas', 'colorado'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    columns=list('bed'),
    index=['utah', 'ohio', 'texas', 'oregon'],
)

# Addition aligns on both rows and columns; any cell that does not exist
# in both frames becomes NaN.
for table in (df1, df2, df1 + df2):
    print(table, RULE, sep='\n')

            b    c    d
ohio      0.0  1.0  2.0
texas     3.0  4.0  5.0
colorado  6.0  7.0  8.0
--------------------------------------------
          b     e     d
utah    0.0   1.0   2.0
ohio    3.0   4.0   5.0
texas   6.0   7.0   8.0
oregon  9.0  10.0  11.0
--------------------------------------------
            b   c     d   e
colorado  NaN NaN   NaN NaN
ohio      3.0 NaN   7.0 NaN
oregon    NaN NaN   NaN NaN
texas     9.0 NaN  13.0 NaN
utah      NaN NaN   NaN NaN
--------------------------------------------


In [75]:
# 산술 연산 메서드에 채워 넣을 값 지정하기
df1 = pd.DataFrame(np.arange(12.).reshape(3,4), columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape(4,5), columns = list('abced'))
df2.loc[1, 'b'] = np.nan

print(df1)
print('--------------------------------------------')

print(df2)
print('--------------------------------------------')

print(df1 + df2)
print('--------------------------------------------')

# 존재하지 않는 축의 값을 특정 값으로 지정
print(df1.add(df2, fill_value = 0))
print('--------------------------------------------')

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
--------------------------------------------
      a     b     c     e     d
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
--------------------------------------------
      a     b     c     d   e
0   0.0   2.0   4.0   7.0 NaN
1   9.0   NaN  13.0  16.0 NaN
2  18.0  20.0  22.0  25.0 NaN
3   NaN   NaN   NaN   NaN NaN
--------------------------------------------
      a     b     c     d     e
0   0.0   2.0   4.0   7.0   3.0
1   9.0   5.0  13.0  16.0   8.0
2  18.0  20.0  22.0  25.0  13.0
3  15.0  16.0  17.0  19.0  18.0
--------------------------------------------


In [76]:
# DataFrame과 Series 간의 연산
arr = np.arange(12.).reshape((3,4))
print(arr)
print('--------------------------------------------')

print(arr[0])
print('--------------------------------------------')

# row에 대해 계산 수행 - 브로드캐스팅
print(arr - arr[0])
print('--------------------------------------------')

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
--------------------------------------------
[0. 1. 2. 3.]
--------------------------------------------
[[0. 0. 0. 0.]
 [4. 4. 4. 4.]
 [8. 8. 8. 8.]]
--------------------------------------------


In [81]:
RULE = '--------------------------------------------'

frame = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    columns=list('bde'),
    index=['utah', 'ohio', 'texas', 'oregon'],
)
series = frame.iloc[0]

# By default the Series index is matched against the frame's columns
# and the operation is repeated for every row.
for shown in (frame, series, frame - series):
    print(shown, RULE, sep='\n')

# Labels that are missing on either side ('d' and 'f') come out as NaN columns.
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
print(frame + series2, RULE, sep='\n')

series3 = frame['d']
for shown in (frame, series3):
    print(shown, RULE, sep='\n')

# axis='index' (or axis=0) matches the Series against the row index instead,
# so the column 'd' is subtracted from every column.
print(frame.sub(series3, axis='index'), RULE, sep='\n')

          b     d     e
utah    0.0   1.0   2.0
ohio    3.0   4.0   5.0
texas   6.0   7.0   8.0
oregon  9.0  10.0  11.0
--------------------------------------------
b    0.0
d    1.0
e    2.0
Name: utah, dtype: float64
--------------------------------------------
          b    d    e
utah    0.0  0.0  0.0
ohio    3.0  3.0  3.0
texas   6.0  6.0  6.0
oregon  9.0  9.0  9.0
--------------------------------------------
          b   d     e   f
utah    0.0 NaN   3.0 NaN
ohio    3.0 NaN   6.0 NaN
texas   6.0 NaN   9.0 NaN
oregon  9.0 NaN  12.0 NaN
--------------------------------------------
          b     d     e
utah    0.0   1.0   2.0
ohio    3.0   4.0   5.0
texas   6.0   7.0   8.0
oregon  9.0  10.0  11.0
--------------------------------------------
utah       1.0
ohio       4.0
texas      7.0
oregon    10.0
Name: d, dtype: float64
--------------------------------------------
          b    d    e
utah   -1.0  0.0  1.0
ohio   -1.0  0.0  1.0
texas  -1.0  0.0  1.0
oregon -1.0  0.0  1.0
--

#### 5.2.6 함수 적용과 매핑

In [90]:
RULE = '--------------------------------------------'

# Random demo data: the printed numbers differ from run to run.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['utah', 'ohio', 'texas', 'oregon'],
)
print(frame, RULE, sep='\n')

# NumPy element-wise functions work directly on a DataFrame.
print(np.abs(frame), RULE, sep='\n')


def spread(x):
    """Return the difference between the largest and smallest value of x."""
    return x.max() - x.min()


# apply calls the function once per column by default ...
print(frame.apply(spread), RULE, sep='\n')

# ... and once per row with axis='columns'.
print(frame.apply(spread, axis='columns'), RULE, sep='\n')


def f(x):
    """Return the minimum and maximum of x as a Series labelled 'min'/'max'."""
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


# A function that returns a Series produces one result row per returned label.
print(frame.apply(f), RULE, sep='\n')


def two_decimals(x):
    """Format a number as a string with two decimal places."""
    return f'{x:.2f}'


# Apply a function to every element of the DataFrame. The book uses
# DataFrame.applymap for this, which newer pandas deprecates in favour of
# DataFrame.map; mapping column by column gives the same result on any version.
print(frame.apply(lambda column: column.map(two_decimals)), RULE, sep='\n')

# Series.map applies a function to every element of a Series.
print(frame['e'].map(two_decimals), RULE, sep='\n')

# sort_values orders the rows by the values in the given columns ('d', then 'b').
print(frame.sort_values(by=['d', 'b']), RULE, sep='\n')

# rank assigns each value its position in sorted order; ties share the
# average rank by default.
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank(), RULE, sep='\n')

# rank(method='first') breaks ties by order of appearance in the data.
print(obj.rank(method='first'), RULE, sep='\n')

# rank(ascending=False) ranks in descending order; method='max' gives tied
# values the highest rank of their group.
print(obj.rank(ascending=False, method='max'), RULE, sep='\n')

               b         d         e
utah    0.243939  1.704975 -0.015645
ohio   -0.974838 -0.704917 -0.589675
texas  -1.169416 -2.184195  1.681422
oregon  0.514460 -0.565314  0.139569
--------------------------------------------
               b         d         e
utah    0.243939  1.704975  0.015645
ohio    0.974838  0.704917  0.589675
texas   1.169416  2.184195  1.681422
oregon  0.514460  0.565314  0.139569
--------------------------------------------
b    1.683877
d    3.889170
e    2.271098
dtype: float64
--------------------------------------------
utah      1.720621
ohio      0.385163
texas     3.865617
oregon    1.079775
dtype: float64
--------------------------------------------
            b         d         e
min -1.169416 -2.184195 -0.589675
max  0.514460  1.704975  1.681422
--------------------------------------------
            b      d      e
utah     0.24   1.70  -0.02
ohio    -0.97  -0.70  -0.59
texas   -1.17  -2.18   1.68
oregon   0.51  -0.57   0.14
---------------

#### rank의 동률 처리 메서드 - average(default, 순위 평균값), min(그룹 내 가장 낮은 순위), max(그룹 내 가장 높은 순위), first(데이터 내 위치 순서), dense(min과 같지만 그룹 사이의 순위가 동률 개수만큼이 아니라 항상 1씩 증가)

#### 5.2.8 중복 색인

In [92]:
RULE = '--------------------------------------------'

obj = pd.Series(range(5), index=list('aabbc'))

# is_unique reports whether every label occurs only once. A label that
# occurs several times ('a') selects a Series; a label that occurs once
# ('c') selects a scalar.
for shown in (obj, obj.index.is_unique, obj['a'], obj['c']):
    print(shown, RULE, sep='\n')

a    0
a    1
b    2
b    3
c    4
dtype: int64
--------------------------------------------
False
--------------------------------------------
a    0
a    1
dtype: int64
--------------------------------------------
4
--------------------------------------------


In [93]:
RULE = '--------------------------------------------'

# Random values under a row index with repeated labels.
df = pd.DataFrame(np.random.randn(4, 3), index=list('aabb'))

# Selecting a repeated label with .loc returns every matching row.
for shown in (df, df.loc['b']):
    print(shown, RULE, sep='\n')

          0         1         2
a -0.115120  1.609206 -0.054442
a  1.061061  0.997557  0.613985
b  0.347132  0.749468  1.617523
b  0.886115 -0.831242 -0.380927
--------------------------------------------
          0         1         2
b  0.347132  0.749468  1.617523
b  0.886115 -0.831242 -0.380927
--------------------------------------------


### 5.3 기술 통계 계산과 요약
#### - pandas의 수학, 통계 메서드는 로우나 컬럼에서 단일 값을 구하는 축소(요약) 통계
#### - pandas 메서드는 처음부터 누락된 데이터를 제외하도록 설계

In [96]:
RULE = '--------------------------------------------'

df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)
print(df, RULE, sep='\n')

# sum() adds up each column, giving one value per column.
print(df.sum(), RULE, sep='\n')

# axis='columns' (or axis=1) adds across the columns, giving one value per row.
print(df.sum(axis='columns'), RULE, sep='\n')

# skipna=False stops NaN from being skipped, so a row containing NaN yields NaN.
print(df.mean(axis='columns', skipna=False), RULE, sep='\n')

# idxmax returns the index label of the maximum (idxmin: of the minimum);
# cumsum is the running total; describe prints several summary statistics at once.
for summary in (df.idxmax(), df.cumsum(), df.describe()):
    print(summary, RULE, sep='\n')

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
--------------------------------------------
one    9.25
two   -5.80
dtype: float64
--------------------------------------------
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
--------------------------------------------
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
--------------------------------------------
one    b
two    d
dtype: object
--------------------------------------------
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
--------------------------------------------
            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
--------------------------------------------


#### 요약 통계 메서드 - count(na 제외한 개수), describe(요약 통계), min(최소), max(최대), argmin(최소값의 정수 위치), argmax(최대값의 정수 위치), idxmin(최소값의 index), idxmax(최대값의 index), quantile(분위수), sum(합), mean(평균), median(중앙값), mad(평균값에서 평균절대편차 계산), prod(곱), var(표본분산), std(표본표준편차), skew(왜도), kurt(첨도), cumsum(누적합), cummin(누적 최소값), cummax(누적 최대값), cumprod(누적곱), diff(1차 산술차, 차분), pct_change(퍼센트 변화율, 수익률)

#### 5.3.1 상관관계와 공분산

In [102]:
# !pip install pandas-datareader

In [104]:
import pandas_datareader.data as web

# Fetch one table per ticker through pandas-datareader (requires network access).
TICKERS = ('AAPL', 'IBM', 'MSFT', 'GOOG')
all_data = {symbol: web.get_data_yahoo(symbol) for symbol in TICKERS}

# Collect one column from every ticker's table into a single frame per field.
price = pd.DataFrame(
    {symbol: history['Adj Close'] for symbol, history in all_data.items()}
)
volume = pd.DataFrame(
    {symbol: history['Volume'] for symbol, history in all_data.items()}
)

In [109]:
RULE = '--------------------------------------------'

# Percent change of each price column from one row to the next.
returns = price.pct_change()
print(returns.tail(), RULE, sep='\n')

# Correlation, then covariance, between two individual columns.
print(returns['MSFT'].corr(returns['IBM']), RULE, sep='\n')
print(returns['MSFT'].cov(returns['IBM']), RULE, sep='\n')

# Full correlation and covariance matrices over all columns.
for matrix in (returns.corr(), returns.cov()):
    print(matrix, RULE, sep='\n')

# corrwith: correlation of every column with another Series,
# or with the same-named columns of another DataFrame.
for other in (returns['IBM'], volume):
    print(returns.corrwith(other), RULE, sep='\n')

                AAPL       IBM      MSFT      GOOG
Date                                              
2021-05-05  0.001956 -0.003636 -0.005327  0.001058
2021-05-06  0.012802  0.022036  0.013227  0.010442
2021-05-07  0.005327 -0.008993  0.010932  0.007282
2021-05-10 -0.025805  0.004881 -0.020914 -0.023775
2021-05-11 -0.026960 -0.016556 -0.012576 -0.024671
--------------------------------------------
0.5309554727588484
--------------------------------------------
0.00015044052667522646
--------------------------------------------
          AAPL       IBM      MSFT      GOOG
AAPL  1.000000  0.446719  0.722818  0.658396
IBM   0.446719  1.000000  0.530955  0.493972
MSFT  0.722818  0.530955  1.000000  0.771015
GOOG  0.658396  0.493972  0.771015  1.000000
--------------------------------------------
          AAPL       IBM      MSFT      GOOG
AAPL  0.000364  0.000140  0.000238  0.000211
IBM   0.000140  0.000269  0.000150  0.000136
MSFT  0.000238  0.000150  0.000299  0.000224
GOOG  0.000211  

#### 5.3.2 유일값, 값 세기, 멤버십

In [110]:
RULE = '--------------------------------------------'

obj = pd.Series(list('cadaabbcc'))

# unique returns the distinct values.
uniques = obj.unique()
print(uniques, RULE, sep='\n')

# value_counts counts how often each value occurs.
print(obj.value_counts(), RULE, sep='\n')

# isin builds a boolean mask marking the entries contained in the given
# list; the mask can then be used to filter the Series.
mask = obj.isin(['b', 'c'])
for shown in (mask, obj[mask]):
    print(shown, RULE, sep='\n')

['c' 'a' 'd' 'b']
--------------------------------------------
c    3
a    3
b    2
d    1
dtype: int64
--------------------------------------------
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
--------------------------------------------
0    c
5    b
6    b
7    c
8    c
dtype: object
--------------------------------------------


#### 유일값, 값 세기, 멤버십 메서드 - isin(포함 여부 반환), match(유일값을 담고 있는 배열에서의 정수 색인 계산), unique(유일값 반환), value_counts(빈도 계산)