# Getting Started with pandas

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

In [5]:
class display(object):
    """Show several objects together, each labelled with its source expression.

    Every positional argument is a *string* containing a Python expression
    (for example ``'df'`` or ``'df.T'``). The expressions are evaluated only
    when a representation is requested, against the globals of the module
    (or notebook) in which this class is defined.

    NOTE(review): the expressions are run with ``eval``; pass only strings
    you wrote yourself, never text that comes from outside the notebook.
    NOTE(review): the class name looks like it matches IPython's own
    ``display`` helper and would shadow it in the notebook - confirm this is
    intended.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Keep the expression strings; evaluation is deferred to repr time.
        self.args = args

    def _evaluate(self, expr):
        """Evaluate one expression string against the defining module's globals.

        Passing ``globals()`` explicitly keeps local names of this class
        (loop variables and the like) from shadowing a user variable of the
        same name.
        """
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one floating ``<div>`` per expression, joined by newlines.

        Objects that provide ``_repr_html_`` are rendered with it; any other
        object (or one whose ``_repr_html_`` returns ``None``) falls back to
        its escaped ``repr`` inside a ``<pre>`` element.
        """
        from html import escape  # local import: the block stays self-contained

        blocks = []
        for expr in self.args:
            value = self._evaluate(expr)
            to_html = getattr(value, '_repr_html_', None)
            body = to_html() if callable(to_html) else None
            if body is None:
                body = '<pre>{0}</pre>'.format(escape(repr(value)))
            # The label is escaped so expressions containing < or & stay valid HTML.
            blocks.append(self.template.format(escape(expr), body))
        return '\n'.join(blocks)

    def __repr__(self):
        """Return ``expression`` + newline + ``repr(value)`` for each argument,
        separated by blank lines."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)

## Introduction to pandas Data Structures

### Series

In [6]:
# Series 생성
obj = pd.Series([4, 7, -5, 3], index=['a','b','c','d'])
obj

a    4
b    7
c   -5
d    3
dtype: int64

In [7]:
# Series values 와 index 조회
obj.values, obj.index

(array([ 4,  7, -5,  3], dtype=int64),
 Index(['a', 'b', 'c', 'd'], dtype='object'))

In [8]:
# Null 조회
obj.isnull(), obj.notnull()

(a    False
 b    False
 c    False
 d    False
 dtype: bool,
 a    True
 b    True
 c    True
 d    True
 dtype: bool)

In [9]:
# Series의 name 과 index name 설정
obj.name = 'test name'
obj.index.name = 'ind'
obj

ind
a    4
b    7
c   -5
d    3
Name: test name, dtype: int64

### DataFrame

In [10]:
# Dict를이용한 DataFrame 생성
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [11]:
df.head(), df.tail()

(    state  year  pop
 0    Ohio  2000  1.5
 1    Ohio  2001  1.7
 2    Ohio  2002  3.6
 3  Nevada  2001  2.4
 4  Nevada  2002  2.9,
     state  year  pop
 1    Ohio  2001  1.7
 2    Ohio  2002  3.6
 3  Nevada  2001  2.4
 4  Nevada  2002  2.9
 5  Nevada  2003  3.2)

In [12]:
# Dict data를 이용하여 DataFrame을 만들고 컬럼의 순서를 재조정
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [13]:
# Dict 및 List를 이용한 DataFrame 생성
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

df2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [14]:
# DataFrame의 column 삭제
# df.copy() 를 명확히 알고 사용하여야 한다
df2['eastern'] = df2.state == 'Ohio'
df2_1 = df2.copy()                   # copy() 사용 여부에 따라 결과값이 달라짐
del df2_1['eastern']
display('df2', 'df2_1')


Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,,False

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [15]:
df2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,,False


In [16]:
# Dict를 이용해 컬럼 및 인덱스를 포함하여 DataFrame 생성
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [17]:
# DataFrame의 index와 columns 조회
df2.index, df2.columns

(Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object'),
 Index(['year', 'state', 'pop', 'debt', 'eastern'], dtype='object'))

In [18]:
# DataFrame의 index 이름과 columns 이름 설정
df3.index.name = 'year'
df3.columns.name = 'state'
df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [19]:
# DataFrame 행과 열 전환
display("df2", "df2.T")

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,,False

Unnamed: 0,one,two,three,four,five,six
year,2000,2001,2002,2001,2002,2003
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,,,,,,
eastern,True,True,True,False,False,False


In [20]:
# Four ways to pull data out of a DataFrame:
#   df2['state']      - a column, by key
#   df2.year          - a column, by attribute
#   df2.loc['three']  - a row, by index label
#   df2.iloc[0]       - a row, by integer position
df2['state'], df2.year, df2.loc['three'], df2.iloc[0]

(one        Ohio
 two        Ohio
 three      Ohio
 four     Nevada
 five     Nevada
 six      Nevada
 Name: state, dtype: object,
 one      2000
 two      2001
 three    2002
 four     2001
 five     2002
 six      2003
 Name: year, dtype: int64,
 year       2002
 state      Ohio
 pop         3.6
 debt        NaN
 eastern    True
 Name: three, dtype: object,
 year       2000
 state      Ohio
 pop         1.5
 debt        NaN
 eastern    True
 Name: one, dtype: object)

### Index Object

In [21]:
# Take the Index object of a Series and slice it like a sequence.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index[1:]

Index(['b', 'c'], dtype='object')

In [22]:
# pd.Index()를 이용한 인덱스 Object 생성
labels = pd.Index(np.arange(3))
print(labels)
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
print(obj2)
obj2.index is labels

Int64Index([0, 1, 2], dtype='int64')
0    1.5
1   -2.5
2    0.0
dtype: float64


True

In [23]:
# index, columns 활용
print('Ohio' in df3.columns)
print(2003 in df3.index)
df3

True
False


state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [24]:
# index의 중복 허용
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

### ReIndexing

In [25]:
# Series ReIndexing - index 순서 재조정, 없는경우 NaN
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [26]:
# Series.reindex() with method='ffill': a label that is missing from the
# original index takes the value of the preceding label.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3.reindex(range(6), method='ffill')

0      blue
2    purple
4    yellow
dtype: object


0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [27]:
# DataFrame reindex() - index 재조정
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
df2 = df.reindex(['a', 'b', 'c', 'd'])
display('df', 'df2')

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [28]:
# DataFrame reindex() - columns 재조정
states = ['Texas', 'Utah', 'California']
df.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


### Dropping Entries from an Axis

In [29]:
# Series Data Drop
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj.drop(['e', 'd'])

a    0.0
b    1.0
dtype: float64

In [30]:
# DataFrame 행(row, index) 삭제
df = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

df1 = df.drop(['Colorado', 'Ohio'])
display('df', 'df1')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [31]:
# DataFrame 열(columns) 삭제
df1 = df.drop('two', axis=1)
display('df', 'df1')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [32]:
df1 = df.drop(['two', 'four'], axis='columns')
display('df', 'df1')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [33]:
# Change the original DataFrame directly with drop(..., inplace=True).
df1 = df.copy()
df.drop('one', axis=1)                  # result is not assigned, so df keeps column 'one'
df1.drop('one', axis=1, inplace=True)   # df1 itself loses column 'one'
display('df', 'df1')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


### Indexing, Selection, and Filtering

In [34]:
# Selecting from a Series: by label, by position, by slice, by list, by mask.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'])               # single label
print(obj.iloc[1])            # single integer position (explicit, instead of obj[1])
print(obj[2:4])               # slice
print(obj[['b', 'a', 'd']])   # list of labels
print(obj.iloc[[1, 3]])       # list of integer positions (instead of obj[[1, 3]])
print(obj[obj < 2])           # boolean mask

1.0
1.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64


In [35]:
# DataFrame 조회
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print(data)
print('~'*40)
print(data['two'])
print('~'*40)
print(data[['three', 'one']])
print('~'*40)
print(data[0:2])
print('~'*40)
print(data[data['three'] > 5])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


### Selection with loc and iloc

In [36]:
print(data)
print('~'*40)
print(data.loc['Colorado', ['two', 'three']])
print('~'*40)
print(data.iloc[2, [3, 0, 1]])
print('~'*40)
print(data.iloc[2])
print('~'*40)
print(data.iloc[[1, 2], [3, 0, 1]])


          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
two      5
three    6
Name: Colorado, dtype: int32
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
four    11
one      8
two      9
Name: Utah, dtype: int32
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
one       8
two       9
three    10
four     11
Name: Utah, dtype: int32
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          four  one  two
Colorado     7    4    5
Utah        11    8    9


In [37]:
print(data.loc[:'Utah', 'two'])
print('~'*40)
print(data.iloc[:, :3][data.three > 5])

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          one  two  three
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14


### Integer Indexes

In [38]:
# With an integer index, plain [] is ambiguous between labels and positions.
# Compare the three outputs: ser[:1] gives one row, the same as iloc[:1],
# while loc[:1] gives two rows because a label slice includes its end label.
ser = pd.Series(np.arange(3.))
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

print(ser[:1])
print('~'*40)
print(ser.loc[:1])   # label-based indexing
print('~'*40)
print(ser.iloc[:1])  # integer-position indexing

0    0.0
dtype: float64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0    0.0
1    1.0
dtype: float64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0    0.0
dtype: float64


### Arithmetic and Data Alignment(산술연산과 데이터 정렬)

In [39]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print('~'*40)
print(s2)
print('~'*40)
print(s1+s2)


a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64


In [40]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df3 = df1 + df2
display('df1', 'df2', 'df3')

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [41]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df3 = df1 - df2
display('df1','df2','df3')

Unnamed: 0,A
0,1
1,2

Unnamed: 0,B
0,3
1,4

Unnamed: 0,A,B
0,,
1,,


### Arithmetic methods with fill values : 산술 연산 메서드에 채워 넣을 값 지정하기

In [42]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
display('df1','df2')

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [43]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [44]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [45]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [46]:
print(df1)
df1.rdiv(1)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0


Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [47]:
print(df1)
print('~'*40)
print(df1.sub(1))
print('~'*40)
print(df1.rsub(1))

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     a    b    c     d
0 -1.0  0.0  1.0   2.0
1  3.0  4.0  5.0   6.0
2  7.0  8.0  9.0  10.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     a    b    c     d
0  1.0  0.0 -1.0  -2.0
1 -3.0 -4.0 -5.0  -6.0
2 -7.0 -8.0 -9.0 -10.0


In [48]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


### Operations between DataFrame and Series

In [61]:
# Index가 동일한 경우 산술 연산
df = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = df.iloc[0]
df - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [62]:
df

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [63]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [55]:
# Index가 다른 경우 산술 연산
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
df + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [57]:
# Take column 'd' as a Series; it is the operand of the df.sub() calls further down.
series3 = df['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [58]:
df

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [59]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [64]:

df.sub(series3)

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [65]:
# The axis argument is easy to get backwards: axis='index' matches the
# Series' labels against the row index, so series3 is subtracted from every column.
df.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### Function Application and Mapping

In [67]:
df = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df

Unnamed: 0,b,d,e
Utah,0.274992,0.228913,1.352917
Ohio,0.886429,-2.001637,-0.371843
Texas,1.669025,-0.43857,-0.539741
Oregon,0.476985,3.248944,-1.021228


In [68]:
# np.abs : 절대값
np.abs(df)

Unnamed: 0,b,d,e
Utah,0.274992,0.228913,1.352917
Ohio,0.886429,2.001637,0.371843
Texas,1.669025,0.43857,0.539741
Oregon,0.476985,3.248944,1.021228


In [69]:
f = lambda x: x.max() - x.min()
df.apply(f)

b    1.394034
d    5.250581
e    2.374144
dtype: float64

In [70]:
df.apply(f, axis='columns')

Utah      1.124004
Ohio      2.888067
Texas     2.208767
Oregon    4.270171
dtype: float64

In [79]:
def f(x):
    """Return the smallest and largest value of ``x`` as a two-element
    Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    labels = ['min', 'max']
    return pd.Series([lowest, highest], index=labels)

df_1 = df.apply(f)
df_2 = df.apply(f, axis=1)
display('df','df_1','df_2')

Unnamed: 0,b,d,e
Utah,0.274992,0.228913,1.352917
Ohio,0.886429,-2.001637,-0.371843
Texas,1.669025,-0.43857,-0.539741
Oregon,0.476985,3.248944,-1.021228

Unnamed: 0,b,d,e
min,0.274992,-2.001637,-1.021228
max,1.669025,3.248944,1.352917

Unnamed: 0,min,max
Utah,0.228913,1.352917
Ohio,-2.001637,0.886429
Texas,-0.539741,1.669025
Oregon,-1.021228,3.248944


In [74]:
df

Unnamed: 0,b,d,e
Utah,0.274992,0.228913,1.352917
Ohio,0.886429,-2.001637,-0.371843
Texas,1.669025,-0.43857,-0.539741
Oregon,0.476985,3.248944,-1.021228


In [80]:
# Element-wise function application:
# applymap() is used on a DataFrame, map() on a Series.
# NOTE(review): the name `format` shadows Python's built-in format() from
# here on; a later cell refers to this lambda by the same name, so rename
# both together if this is cleaned up.
format = lambda x: '%.2f' % x
df_1 = df.applymap(format)
display('df','df_1')

Unnamed: 0,b,d,e
Utah,0.274992,0.228913,1.352917
Ohio,0.886429,-2.001637,-0.371843
Texas,1.669025,-0.43857,-0.539741
Oregon,0.476985,3.248944,-1.021228

Unnamed: 0,b,d,e
Utah,0.27,0.23,1.35
Ohio,0.89,-2.0,-0.37
Texas,1.67,-0.44,-0.54
Oregon,0.48,3.25,-1.02


In [76]:
df['e'].map(format)

Utah       1.35
Ohio      -0.37
Texas     -0.54
Oregon    -1.02
Name: e, dtype: object

### Sorting and Ranking

In [81]:
# sort_index() - index를 기준으로 sort
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [85]:
df = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
df_1 = df.sort_index()
df_2 = df.sort_index(axis=1)
df_3 = df.sort_index(axis=1, ascending=False)
display('df','df_1','df_2','df_3')

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [86]:
# Series - sort_values()
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [89]:
df = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
df_1 = df.sort_values(by='b')
df_2 = df.sort_values(by=['a', 'b'])
display('df','df_1','df_2')



Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


### Ranking
**rank()** - ***특정변수를 기준으로 순위를 구함***

#### 동점 처리 방법(tie-breaking methods)
1. 평균(method='average'), default
2. 최소값(method='min')
3. 최대값(method='max')
4. 첫번째값(method='first')
5. 조밀하게(method='dense')

In [94]:
# rank() - 특정변수를 기준으로 순위를 구함
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [93]:
obj.rank(method='min')

0    6.0
1    1.0
2    6.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64

In [96]:
# 높은 값일 수록 1순위 - rank()-ascending
obj.rank(ascending=False, method='min')

0    1.0
1    7.0
2    1.0
3    3.0
4    5.0
5    6.0
6    3.0
dtype: float64

In [99]:
df = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
df_1 = df.rank(method='first')
df_2 = df.rank(method='first', axis='columns')
display('df','df_1','df_2')

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5

Unnamed: 0,b,a,c
0,3.0,1.0,2.0
1,4.0,3.0,3.0
2,1.0,2.0,4.0
3,2.0,4.0,1.0

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### Axis Indexes with Duplicate Labels

In [102]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique)
print('~'*40)
print(obj['a'])
print('~'*40)
print(obj['c'])

False
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
a    0
a    1
dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4


In [104]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.loc['b']

Unnamed: 0,0,1,2
b,-1.541996,-0.970736,-1.30703
b,0.28635,0.377984,-0.753887


## Summarizing and Computing Descriptive Statistics

In [117]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
print(df)
print('~'*40)
print(df.sum())
print('~'*40)
print(df.sum(axis='columns'))
print('~'*40)

# 최대값을 가지는 index 출력
# Series는 index, DataFrame은 index, columns
print(df.idxmax())
print('~'*40)
print(df.cumsum())

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
one    9.25
two   -5.80
dtype: float64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
one    b
two    d
dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8


In [119]:
## Correlation and Covariance

In [123]:
# Load the price and volume data from pickle files in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that come from a trusted source.
price = pd.read_pickle('yahoo_price.pkl')
volume = pd.read_pickle('yahoo_volume.pkl')

In [129]:
# Compute returns:
# return = (current price - previous price) / previous price
# pct_change() yields this as a fraction; it is not multiplied by 100.
returns = price.pct_change()

In [134]:
# Series.corr() - correlation between two Series.
# The method argument accepts:
#   'pearson'
#   'kendall'
#   'spearman'
returns['MSFT'].corr(returns['IBM'])

0.4997636114415114

In [132]:
returns['MSFT'].cov(returns['IBM'])

8.870655479703546e-05

In [141]:
# Pairwise correlation matrix of every column of the returns table.
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [135]:
# 공분산(Covariance)
# 2개의 확률변수의 상관정도를 나타내는 값
returns['MSFT'].cov(returns['IBM'])


8.870655479703546e-05

In [142]:
# 지정한 변수와 모든 변수간 상관계수
returns.corrwith(returns.IBM)

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [143]:
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

In [145]:
returns

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,,,,
2010-01-05,0.001729,-0.004404,-0.012080,0.000323
2010-01-06,-0.015906,-0.025209,-0.006496,-0.006137
2010-01-07,-0.001849,-0.023280,-0.003462,-0.010400
2010-01-08,0.006648,0.013331,0.010035,0.006897
...,...,...,...,...
2016-10-17,-0.000680,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.007690
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867


In [144]:
volume

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400
...,...,...,...,...
2016-10-17,23624900,1089500,5890400,23830000
2016-10-18,24553500,1995600,12770600,19149500
2016-10-19,20034600,116600,4632900,22878400
2016-10-20,24125800,1734200,4023100,49455600


### Unique Values, Value Counts, and Membership

In [147]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
print(obj.value_counts())

['c' 'a' 'd' 'b']
a    3
c    3
b    2
d    1
dtype: int64


In [150]:
# Keep only the entries whose value is 'b' or 'c'.
mask = obj.isin(['b', 'c'])
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [151]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [154]:
# DataFrame 에서 value_counts()
df = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
# Count each value per column. Series.value_counts is passed instead of the
# top-level pd.value_counts helper, which newer pandas releases deprecate.
# Values absent from a column come back as NaN and are replaced by 0.
df_1 = df.apply(pd.Series.value_counts).fillna(0)
display('df','df_1')                     

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
