# Pandas
## Series
- 데이터 테이블: 정형화된 데이터의 모습, 인덱스, feature 갖고 있음
- Series: 일련의 객체를 담을 수 있는 1차원 배열, array, index, dtype 속성을 통해 확인 가능

In [77]:
import pandas as pd
personal_info = pd.Series(index=["name", "gender", "age"])

- name 속성
- 색인 교체, 데이터 추가는 dictionary와 비슷하게 가능

## DataFrame 생성

In [78]:
from pandas import Series, DataFrame

In [79]:
# 동일한 길이의 리스트가 value로 담긴 딕셔너리 -> DataFrame으로 생성 가능
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada", ],
        "year": [2000, 2001, 2002, 2003, 2004, 2005],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

# data를 데이터프레임으로 변환
frame = DataFrame(data)

In [80]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2003,2.4
4,Nevada,2004,2.9
5,Nevada,2005,3.2


## head(), tail()

- head(): 상위 5개 인덱스의 데이터를 보여줌
- tail(): 하위 5개 인덱스의 데이터를 보여줌

In [81]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2003,2.4
4,Nevada,2004,2.9


In [82]:
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2003,2.4
4,Nevada,2004,2.9
5,Nevada,2005,3.2


## Feature 선택적으로 생성

In [83]:
DataFrame(data, columns=["year", "state"])

Unnamed: 0,year,state
0,2000,Ohio
1,2001,Ohio
2,2002,Ohio
3,2003,Nevada
4,2004,Nevada
5,2005,Nevada


## 없는 Feature 선택 시 결측치로 생성

In [84]:
# data 딕셔너리에 debt 키는 없기 때문에 결측치(NaN)로 생성됨
frame2 = DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2003,Nevada,2.4,
4,2004,Nevada,2.9,
5,2005,Nevada,3.2,


## Feature 확인, Feature별 데이터 호출

In [85]:
# index를 마음대로 변경 가능, 하지만 함부로 바꾸면 data가 망가지기 때문에 최대한 하지 않는 것이 좋음
# 각 샘플 호출
frame2.index

RangeIndex(start=0, stop=6, step=1)

In [86]:
# 각 feature 이름
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [87]:
# DataFrame에서 특정 column 호출 -> Series 객체
frame2["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [88]:
frame2.year

0    2000
1    2001
2    2002
3    2003
4    2004
5    2005
Name: year, dtype: int64

## 특정 Feature 데이터는 Series 객체

In [89]:
type(frame2)

pandas.core.frame.DataFrame

In [90]:
type(frame2["year"])

pandas.core.series.Series

## Feature 수정

In [91]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2003,2.4
4,Nevada,2004,2.9
5,Nevada,2005,3.2


In [92]:
frame["debt"] = 16.5
frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,16.5
1,Ohio,2001,1.7,16.5
2,Ohio,2002,3.6,16.5
3,Nevada,2003,2.4,16.5
4,Nevada,2004,2.9,16.5
5,Nevada,2005,3.2,16.5


In [93]:
frame["state"] == "Ohio"

0     True
1     True
2     True
3    False
4    False
5    False
Name: state, dtype: bool

In [94]:
frame["eastern"] = frame["state"] == "Ohio"
frame

Unnamed: 0,state,year,pop,debt,eastern
0,Ohio,2000,1.5,16.5,True
1,Ohio,2001,1.7,16.5,True
2,Ohio,2002,3.6,16.5,True
3,Nevada,2003,2.4,16.5,False
4,Nevada,2004,2.9,16.5,False
5,Nevada,2005,3.2,16.5,False


In [95]:
val = Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
# val = Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
frame["debt"] = val # 해당 index가 없기 때문에 debt의 값이 결측치로 처리
frame

Unnamed: 0,state,year,pop,debt,eastern
0,Ohio,2000,1.5,,True
1,Ohio,2001,1.7,,True
2,Ohio,2002,3.6,,True
3,Nevada,2003,2.4,,False
4,Nevada,2004,2.9,,False
5,Nevada,2005,3.2,,False


## Feature 삭제

In [96]:
del frame["eastern"]
frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,
3,Nevada,2003,2.4,
4,Nevada,2004,2.9,
5,Nevada,2005,3.2,


In [97]:
frame.pop("debt")

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: debt, dtype: float64

In [98]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2003,2.4
4,Nevada,2004,2.9
5,Nevada,2005,3.2


# 중첩 딕셔너리 활용 DataFrame 생성
## 동일한 길이로 value가 이루어진 딕셔너리 -> DataFrame
### 결측치 X

In [99]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada", ],
        "year": [2000, 2001, 2002, 2003, 2004, 2005],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

DataFrame(data)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2003,2.4
4,Nevada,2004,2.9
5,Nevada,2005,3.2


## 딕셔너리 리스트 -> DataFrame
### 같은 길이의 리스트를 가질 필요가 없음
- 각 딕셔너리가 샘플이 됨
- 키의 합집합이 feature 이름(결측치 생성 가능성)

In [100]:
data1 = {"state": "Ohio",
         "year": 2000,
         "pop": 1.5}
data2 = {"state": "Mass",
         "year": 2000,
         "GDP": 1.8}
data3 = {"year": 2011,
         "pop": 1.1}

In [101]:
DataFrame([data1, data2, data3])

Unnamed: 0,state,year,pop,GDP
0,Ohio,2000,1.5,
1,Mass,2000,,1.8
2,,2011,1.1,


## 딕셔너리들의 딕셔너리 -> DataFrame
### index 이름 지정 가능
- 바깥쪽 키가 feature 이름
- 안쪽 키의 합집합이 index(결측치 생성 가능성)

In [102]:
# 외부 딕셔너리의 키가 feature 이름
# 내부 딕셔너리에서 키의 합집합이 index가 됨
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}

In [103]:
frame3 = DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


## name 속성
- index name과 feature(columns) name 속성

In [104]:
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [105]:
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

## Reindex(재색인)
### 기존 색인-feature 관계 유지, 새롭게 색인
### 순서나 원하는 데이터만 뽑아서 reindex

In [106]:
obj = Series([4.5, 7.2, -5.3, -3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c   -3.6
dtype: float64

In [107]:
obj.index = ["a", "b", "c", "d"]
obj     # 원래 index-feature 사이의 관계가 깨짐(index 새로 assign)

a    4.5
b    7.2
c   -5.3
d   -3.6
dtype: float64

### reindex는 기존 index-feature 사이 관계 유지

In [108]:
obj2 = obj.reindex(["a", "c", "d", "b", "e"])
obj2    # 각각의 index에 매칭되어 있던 값이 유지

a    4.5
c   -5.3
d   -3.6
b    7.2
e    NaN
dtype: float64

In [109]:
obj3 = Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [110]:
import numpy as np
obj3.reindex(np.arange(6))

0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

In [111]:
# method로 결측치 처리
# ffill: 결측치 발생 시 이전의 값으로 채움
obj3.reindex(np.arange(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [112]:
# bfill: 결측치 발생 시 이후의 값으로 채움
obj3.reindex(np.arange(6), method="bfill")

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

In [113]:
# fill_value: 해당 값을 원하는 값으로 채울 수 있음
obj3.reindex(np.arange(6), fill_value=777)

0      blue
1       777
2    purple
3       777
4    yellow
5       777
dtype: object

## DataFrame에서의 재색인

In [114]:
frame = DataFrame(np.arange(9).reshape(3, 3),
                  index=["a", "c", "d"],
                  columns=["Ohio", "Texas", "California"])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [115]:
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [116]:
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)   # index에서 reindex 할지, columns에서 reindex 할지 정의해야 함

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


### axis = "columns" OR axis = "index"로 reindex 방향 적용 가능(1 OR 0으로도 가능)

In [117]:
frame.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [118]:
frame.reindex(states, axis=1)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


- name, gender, age를 index로 하는 personal_info 이름의 Series가 있을 때, reindex를 활용하여, 그 순서를 gender,name,age로 바꾸세요

- frame의 index를 a와 d만 남겨놓으세요

In [119]:
personal_info = Series(["Min Kyoung", "F", 24], index=["name", "gender", "age"])
personal_info

name      Min Kyoung
gender             F
age               24
dtype: object

In [120]:
personal_info_re = personal_info.reindex(["gender", "name", "age"])
personal_info_re

gender             F
name      Min Kyoung
age               24
dtype: object

In [121]:
frame_re = frame.reindex(["a", "d"], axis=0)
frame_re

Unnamed: 0,Ohio,Texas,California
a,0,1,2
d,6,7,8


## 행/열 삭제(drop)
- del이나 pop과 다르게 DataFrame이나 Series를 변경하지 않음

In [122]:
obj = Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [123]:
obj.drop(index=["a", "b"])

c    2.0
d    3.0
e    4.0
dtype: float64

## 색인, 선택, 필터(loc, iloc)
### Series
#### numpy 배열처럼 색인 가능
- 1차원 numpy 배열처럼 취급
- boolean 배열 활용 -> 색인, 팬시 색인 적용 가능
- index 이름, index 순서로도 색인 가능 -> 혼란 발생 가능성

In [124]:
obj = Series(np.arange(4.), index=["a", "b", "c", "d"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [125]:
# With string labels on the index, an integer inside [] is taken as a position.
# NOTE(review): recent pandas releases warn on this positional fallback; obj.iloc[0] is the explicit form — confirm against the installed version.
obj[0]

  obj[0]  # Series는 순서로 index 접근 가능


0.0

In [126]:
obj["a"]

0.0

In [127]:
obj[["d", "b"]]

d    3.0
b    1.0
dtype: float64

In [128]:
obj[[3, 1]]

  obj[[3, 1]]


d    3.0
b    1.0
dtype: float64

In [129]:
obj[obj < 2]    # 각각의 요소에 조건부 연산 적용

a    0.0
b    1.0
dtype: float64

- 정수 위치(position) 색인과 정수 레이블(label) 색인 사이의 혼란 가능성

In [130]:
obj = Series(np.arange(4.), index=["a", 2, 1, 0])

In [131]:
obj[0]

3.0

In [132]:
obj[3]

KeyError: 3

### loc과 iloc

In [133]:
obj

a    0.0
2    1.0
1    2.0
0    3.0
dtype: float64

In [134]:
obj.loc

<pandas.core.indexing._LocIndexer at 0x1d20bab1400>

In [135]:
obj.iloc

<pandas.core.indexing._iLocIndexer at 0x1d20ba62f80>

In [136]:
obj.loc["a"], obj.loc[2], obj.loc[1], obj.loc[0]    # index label만 고려

(0.0, 1.0, 2.0, 3.0)

In [137]:
obj.iloc[0], obj.iloc[1], obj.iloc[2], obj.iloc[3]  # iloc에서 i가 아마 integer

(0.0, 1.0, 2.0, 3.0)

### DataFrame
- columns 이름을 통해 색인 가능

In [138]:
data = DataFrame(np.arange(16).reshape(4, 4),
                 index=["Ohio", "Colorado", "Utah", "New York"],
                 columns=["one", "two", "three", "four"])

In [139]:
data.loc["Colorado"]

one      4
two      5
three    6
four     7
Name: Colorado, dtype: int32

In [140]:
data.loc[["Colorado", "New York"]]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15


In [141]:
data.loc["Colorado", ["two", "three"]]

two      5
three    6
Name: Colorado, dtype: int32

In [142]:
data.loc[["Colorado"], ["two", "three"]]

Unnamed: 0,two,three
Colorado,5,6


In [143]:
data.iloc[1:2]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7


In [144]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [145]:
data.iloc[:, 1]

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [146]:
data.iloc[[2, 1]]   # 팬시 색인 적용 가능

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,4,5,6,7


In [147]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### 레이블의 슬라이싱 -> 끝 레이블도 포함

In [148]:
data.loc[:"Utah", "two"]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [149]:
data.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [150]:
data.loc[data.three > 5]    # boolean 객체를 포함한 인덱싱

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [151]:
data.iloc[data.three > 5]   # iloc에서는 boolean 객체를 포함한 인덱싱 적용 불가

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

- 아래 두 Series가 있을 때, 각각에 대해서 각각 loc과 iloc을 써서 [3,2,1]을 value로 갖는 Series를 반환받으시오.

obj1 = Series([1, 2, 3], index=[2, 0, 1])

obj2 = Series([1, 2, 3], index=["c", "b", "a"])

- data 데이터프레임에 대해서, columns의 three가 7보다 큰 주에 대해서만 one과 two를 호출하시오 (loc을 사용)

In [152]:
obj1 = Series([1, 2, 3], index=[2, 0, 1])
obj2 = Series([1, 2, 3], index=["c", "b", "a"])

In [153]:
obj1

2    1
0    2
1    3
dtype: int64

In [154]:
obj1.loc[[1, 0, 2]]

1    3
0    2
2    1
dtype: int64

In [155]:
obj1.iloc[[2, 1, 0]]

1    3
0    2
2    1
dtype: int64

In [156]:
obj2

c    1
b    2
a    3
dtype: int64

In [157]:
obj2.loc[["a", "b", "c"]]

a    3
b    2
c    1
dtype: int64

In [158]:
obj2.iloc[[2, 1, 0]]

a    3
b    2
c    1
dtype: int64

## 산술 연산과 데이터 정렬

In [159]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1],
            index=["a", "c", "e", "f", "g"])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [160]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [161]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [162]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list("bcd"),
                   index=["Ohio", "Texas", "Colorado"])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"),
                   index=["Utah", "Ohio", "Texas", "Oregon"])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [163]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [164]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [165]:
df1 = DataFrame({"A": [1, 2]})
df2 = DataFrame({"B": [3, 4]})
df1

Unnamed: 0,A
0,1
1,2


In [166]:
df2

Unnamed: 0,B
0,3
1,4


In [167]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,


In [168]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list("abcd"))
df2 = DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list("abcde"))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [169]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [170]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [171]:
df1.add(df2, fill_value=0)  # fill_value: 결측치가 있는 데이터와 연산할 때 결측치를 바꿔줄 숫자

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [172]:
df1.add(df2, fill_value=100)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,104.0
1,9.0,11.0,13.0,15.0,109.0
2,18.0,20.0,22.0,24.0,114.0
3,115.0,116.0,117.0,118.0,119.0


In [173]:
df1.rsub(df2, fill_value=100)   # sub과 부호 반전

Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,-96.0
1,1.0,1.0,1.0,1.0,-91.0
2,2.0,2.0,2.0,2.0,-86.0
3,-85.0,-84.0,-83.0,-82.0,-81.0


## DataFrame과 Series 간 연산
numpy 배열의 연산을 따름

In [174]:
frame = DataFrame(np.arange(12.).reshape(4, 3),
                  columns=list("bde"),
                  index=["Utah", "Ohio", "Texas", "Oregon"])
series = frame.iloc[0]

In [175]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [176]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [177]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [178]:
series2 = Series(np.arange(3), index=["b", "e", "f"])
series2

b    0
e    1
f    2
dtype: int32

In [179]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [180]:
series3 = frame["d"]
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [181]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [182]:
frame.sub(series, axis="index")

Unnamed: 0,b,d,e
Ohio,,,
Oregon,,,
Texas,,,
Utah,,,
b,,,
d,,,
e,,,


In [183]:
frame.sub(series, axis="columns")

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [184]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [185]:
frame.sub(series3, axis="columns")  # axis="columns": series3's labels are matched against frame's column labels

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [186]:
frame.sub(series3, axis="index")    # axis="index": series3's labels are matched against frame's row labels

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


## 함수 적용과 맵핑
### universal 함수가 적용될 수 있음
### apply < 중요 (행 또는 열을 따라 적용)

In [187]:
frame = DataFrame(np.random.standard_normal((4, 3)),
                  columns=list("bde"),
                  index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,0.949946,-1.220438,0.807797
Ohio,0.034462,1.294066,0.74607
Texas,0.671847,0.106821,0.116829
Oregon,-0.319899,-2.075331,0.121328


In [188]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.949946,1.220438,0.807797
Ohio,0.034462,1.294066,0.74607
Texas,0.671847,0.106821,0.116829
Oregon,0.319899,2.075331,0.121328


In [189]:
def f1(x):
    """Return the spread of x: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

frame.apply(f1)

b    1.269845
d    3.369397
e    0.690968
dtype: float64

In [190]:
frame.apply(lambda x: x.max() - x.min())

b    1.269845
d    3.369397
e    0.690968
dtype: float64

In [191]:
frame.apply(f1, axis="index")   # 기본 방향: index

b    1.269845
d    3.369397
e    0.690968
dtype: float64

In [192]:
frame.apply(f1, axis="columns") # column 방향을 따라 함수 적용

Utah      2.170384
Ohio      1.259605
Texas     0.565026
Oregon    2.196659
dtype: float64

In [193]:
frame

Unnamed: 0,b,d,e
Utah,0.949946,-1.220438,0.807797
Ohio,0.034462,1.294066,0.74607
Texas,0.671847,0.106821,0.116829
Oregon,-0.319899,-2.075331,0.121328


In [194]:
def f2(x):
    """Return a Series holding x's minimum and maximum under the labels "min" and "max"."""
    labels = ["min", "max"]
    # The minimum is evaluated before the maximum, in the same order as the labels.
    values = [x.min(), x.max()]
    return Series(values, index=labels)
type(frame.apply(f2))

pandas.core.frame.DataFrame

In [195]:
frame.apply(f2) # 기본 axis: index

Unnamed: 0,b,d,e
min,-0.319899,-2.075331,0.116829
max,0.949946,1.294066,0.807797


### applymap: DataFrame의 개별 요소에 대해 동일한 함수 적용할 때 사용
### map: Series의 개별 요소에 대해 동일한 함수를 적용할 때 사용

In [196]:
s1 = frame["b"]
s1

Utah      0.949946
Ohio      0.034462
Texas     0.671847
Oregon   -0.319899
Name: b, dtype: float64

In [197]:
s1.map(lambda x: 2*x)   # map: Series의 개별 요소에 대해 동일한 함수 적용

Utah      1.899892
Ohio      0.068924
Texas     1.343695
Oregon   -0.639798
Name: b, dtype: float64

In [198]:
frame

Unnamed: 0,b,d,e
Utah,0.949946,-1.220438,0.807797
Ohio,0.034462,1.294066,0.74607
Texas,0.671847,0.106821,0.116829
Oregon,-0.319899,-2.075331,0.121328


In [199]:
# applymap: apply the same function to every individual element of the DataFrame.
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favor of DataFrame.map — confirm against the installed version.
frame.applymap(lambda x: 2*x)

  frame.applymap(lambda x: 2*x)   # applymap: DataFrame의 개별 요소에 대해 동일한 함수 적용


Unnamed: 0,b,d,e
Utah,1.899892,-2.440875,1.615594
Ohio,0.068924,2.588133,1.49214
Texas,1.343695,0.213643,0.233657
Oregon,-0.639798,-4.150662,0.242656


frame의 각 주 별로, b*d*e를 계산하여 반환하시오 (apply활용)

In [200]:
frame.apply(lambda x: x.b * x.d * x.e, axis='columns')

Utah     -0.936519
Ohio      0.033272
Texas     0.008385
Oregon    0.080549
dtype: float64

## 정렬과 순위

In [201]:
obj = Series([4, 7, -3, 2])

In [202]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [205]:
obj = Series([4, np.nan, 7, -3, 2])

In [207]:
obj.sort_values(na_position='first')    # 기본은 last

1    NaN
3   -3.0
4    2.0
0    4.0
2    7.0
dtype: float64

In [209]:
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})

In [210]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [211]:
frame.sort_values(["a"])

Unnamed: 0,b,a
0,4,0
2,-3,0
1,7,1
3,2,1


In [212]:
frame.sort_values(["b"])    # the column(s) to sort by must be passed in

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


### rank 메서드는 순위를 알려주는 메서드

In [213]:
obj = Series([7, -5, 7, 4, 2, 0, 4])

In [214]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

## 넘파이 기반 함수/메서드 제공

In [215]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"],
                  columns=["one", "two"])

In [216]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [217]:
df.sum()    # 기본: 세로 방향(axis='index')

one    9.25
two   -5.80
dtype: float64

In [218]:
df.sum(axis='columns')  # 가로 방향(axis='columns'), 결측치는 0으로 취급

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [219]:
df.sum(axis='index', skipna=False)  # 결측치를 결측치로 표시(NaN)

one   NaN
two   NaN
dtype: float64

In [220]:
df.mean(axis='columns')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [221]:
df.describe()   # 각각의 feature 별 통계치 반환

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## 상관관계와 공분산

In [226]:
# Load the price and volume tables from pickle files at hard-coded local paths.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
# NOTE(review): these absolute paths are machine-specific; the cell will not run elsewhere without the files in place.
price = pd.read_pickle("E:/학교/4-2/빅데이터분석/4주차/yahoo_price.pkl")
volume = pd.read_pickle("E:/학교/4-2/빅데이터분석/4주차/yahoo_volume.pkl")

In [227]:
price

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571
...,...,...,...,...
2016-10-17,117.550003,779.960022,154.770004,57.220001
2016-10-18,117.470001,795.260010,150.720001,57.660000
2016-10-19,117.120003,801.500000,151.259995,57.529999
2016-10-20,117.059998,796.969971,151.520004,57.250000


In [228]:
volume

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400
...,...,...,...,...
2016-10-17,23624900,1089500,5890400,23830000
2016-10-18,24553500,1995600,12770600,19149500
2016-10-19,20034600,116600,4632900,22878400
2016-10-20,24125800,1734200,4023100,49455600


In [229]:
returns = price.pct_change()    # 변화를 퍼센트로 나타내는 메서드

In [230]:
returns

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,,,,
2010-01-05,0.001729,-0.004404,-0.012080,0.000323
2010-01-06,-0.015906,-0.025209,-0.006496,-0.006137
2010-01-07,-0.001849,-0.023280,-0.003462,-0.010400
2010-01-08,0.006648,0.013331,0.010035,0.006897
...,...,...,...,...
2016-10-17,-0.000680,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.007690
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867


In [231]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [233]:
returns["MSFT"].corr(returns["IBM"])    # 인수와의 상관관계(correlation)를 나타내는 메서드

0.4997636114415114

In [234]:
returns["MSFT"].cov(returns["IBM"]) # 인수와의 공분산(covariance)을 나타내는 메서드

8.870655479703545e-05

In [235]:
# 데이터 프레임에 corr, cov 적용
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [236]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


# reindex, loc, iloc 중요!!
예제 < 이거 시험에 나올 수도 있다~
# 함수 적용, 매핑(apply, applymap)