In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [103]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [104]:
dates = pd.date_range('20130101', periods=6)
dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [105]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df


Unnamed: 0,A,B,C,D
2013-01-01,-1.906958,-0.723114,-0.96995,1.134474
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-03,1.126284,0.544805,-1.23135,0.717632
2013-01-04,0.405992,0.232228,0.477219,-0.187534
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425
2013-01-06,-0.144996,0.489903,0.687375,-0.127812


In [106]:
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)

df2


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [107]:
df2.dtypes


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [108]:
dir(df2)


['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce_

In [109]:
df.head()


Unnamed: 0,A,B,C,D
2013-01-01,-1.906958,-0.723114,-0.96995,1.134474
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-03,1.126284,0.544805,-1.23135,0.717632
2013-01-04,0.405992,0.232228,0.477219,-0.187534
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425


In [110]:
df.tail()


Unnamed: 0,A,B,C,D
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-03,1.126284,0.544805,-1.23135,0.717632
2013-01-04,0.405992,0.232228,0.477219,-0.187534
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425
2013-01-06,-0.144996,0.489903,0.687375,-0.127812


In [111]:
df.tail(3)


Unnamed: 0,A,B,C,D
2013-01-04,0.405992,0.232228,0.477219,-0.187534
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425
2013-01-06,-0.144996,0.489903,0.687375,-0.127812


In [112]:
df.index


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [113]:
df.columns


Index(['A', 'B', 'C', 'D'], dtype='object')

In [114]:
df.values


array([[-1.90695825, -0.7231139 , -0.96994995,  1.13447401],
       [ 2.27950526, -0.6970124 ,  1.70187178, -1.11628086],
       [ 1.12628405,  0.54480462, -1.23135   ,  0.71763173],
       [ 0.40599205,  0.23222802,  0.47721941, -0.18753363],
       [ 0.06071633, -0.94456198, -0.32863077, -0.01342486],
       [-0.14499611,  0.48990285,  0.68737506, -0.12781196]])

In [115]:
df.describe()


Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.303424,-0.182959,0.056089,0.067842
std,1.395585,0.676878,1.108399,0.78386
min,-1.906958,-0.944562,-1.23135,-1.116281
25%,-0.093568,-0.716589,-0.80962,-0.172603
50%,0.233354,-0.232392,0.074294,-0.070618
75%,0.946211,0.425484,0.634836,0.534868
max,2.279505,0.544805,1.701872,1.134474


In [116]:
df.T


Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.906958,2.279505,1.126284,0.405992,0.060716,-0.144996
B,-0.723114,-0.697012,0.544805,0.232228,-0.944562,0.489903
C,-0.96995,1.701872,-1.23135,0.477219,-0.328631,0.687375
D,1.134474,-1.116281,0.717632,-0.187534,-0.013425,-0.127812


In [117]:
df.sort_index(axis=1, ascending=False)


Unnamed: 0,D,C,B,A
2013-01-01,1.134474,-0.96995,-0.723114,-1.906958
2013-01-02,-1.116281,1.701872,-0.697012,2.279505
2013-01-03,0.717632,-1.23135,0.544805,1.126284
2013-01-04,-0.187534,0.477219,0.232228,0.405992
2013-01-05,-0.013425,-0.328631,-0.944562,0.060716
2013-01-06,-0.127812,0.687375,0.489903,-0.144996


In [118]:
df.sort_index(axis=1)


Unnamed: 0,A,B,C,D
2013-01-01,-1.906958,-0.723114,-0.96995,1.134474
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-03,1.126284,0.544805,-1.23135,0.717632
2013-01-04,0.405992,0.232228,0.477219,-0.187534
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425
2013-01-06,-0.144996,0.489903,0.687375,-0.127812


In [119]:
df.sort_values(by='B', ascending=False)


Unnamed: 0,A,B,C,D
2013-01-03,1.126284,0.544805,-1.23135,0.717632
2013-01-06,-0.144996,0.489903,0.687375,-0.127812
2013-01-04,0.405992,0.232228,0.477219,-0.187534
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-01,-1.906958,-0.723114,-0.96995,1.134474
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425


In [120]:
# 3. 데이터 선택하기 (Selection)
df['A']


2013-01-01   -1.906958
2013-01-02    2.279505
2013-01-03    1.126284
2013-01-04    0.405992
2013-01-05    0.060716
2013-01-06   -0.144996
Freq: D, Name: A, dtype: float64

In [121]:
df.A


2013-01-01   -1.906958
2013-01-02    2.279505
2013-01-03    1.126284
2013-01-04    0.405992
2013-01-05    0.060716
2013-01-06   -0.144996
Freq: D, Name: A, dtype: float64

In [122]:
type(df['A'])


pandas.core.series.Series

In [123]:
df[0:3]


Unnamed: 0,A,B,C,D
2013-01-01,-1.906958,-0.723114,-0.96995,1.134474
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-03,1.126284,0.544805,-1.23135,0.717632


In [124]:
df['20130102':'20130104']


Unnamed: 0,A,B,C,D
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-03,1.126284,0.544805,-1.23135,0.717632
2013-01-04,0.405992,0.232228,0.477219,-0.187534


In [125]:
df.loc[dates[0]]


A   -1.906958
B   -0.723114
C   -0.969950
D    1.134474
Name: 2013-01-01 00:00:00, dtype: float64

In [126]:
df.loc[dates[1]]


A    2.279505
B   -0.697012
C    1.701872
D   -1.116281
Name: 2013-01-02 00:00:00, dtype: float64

In [127]:
df.loc['20130101']
df.loc['2013-01-01']


A   -1.906958
B   -0.723114
C   -0.969950
D    1.134474
Name: 2013-01-01 00:00:00, dtype: float64

In [128]:
df.loc[:, ['A', 'B']]



Unnamed: 0,A,B
2013-01-01,-1.906958,-0.723114
2013-01-02,2.279505,-0.697012
2013-01-03,1.126284,0.544805
2013-01-04,0.405992,0.232228
2013-01-05,0.060716,-0.944562
2013-01-06,-0.144996,0.489903


In [129]:
df.loc['20130102':'20130104', ['A', 'B']]


Unnamed: 0,A,B
2013-01-02,2.279505,-0.697012
2013-01-03,1.126284,0.544805
2013-01-04,0.405992,0.232228


In [130]:
df.loc['20130102':'20130104', ['A', 'B']]


Unnamed: 0,A,B
2013-01-02,2.279505,-0.697012
2013-01-03,1.126284,0.544805
2013-01-04,0.405992,0.232228


In [131]:
df.loc[dates[0], ['A', 'B']]


A   -1.906958
B   -0.723114
Name: 2013-01-01 00:00:00, dtype: float64

In [132]:
df.loc[dates[0], 'A']


-1.906958254768449

In [133]:
df.at[dates[0], 'A']


-1.906958254768449

In [134]:
df.iloc[3]


A    0.405992
B    0.232228
C    0.477219
D   -0.187534
Name: 2013-01-04 00:00:00, dtype: float64

In [135]:
df.iloc[3:5, 0:2]


Unnamed: 0,A,B
2013-01-04,0.405992,0.232228
2013-01-05,0.060716,-0.944562


In [136]:
df.iloc[[1, 2, 4], [0, 2]]


Unnamed: 0,A,C
2013-01-02,2.279505,1.701872
2013-01-03,1.126284,-1.23135
2013-01-05,0.060716,-0.328631


In [137]:
df.iloc[1:3, :]


Unnamed: 0,A,B,C,D
2013-01-02,2.279505,-0.697012,1.701872,-1.116281
2013-01-03,1.126284,0.544805,-1.23135,0.717632


In [138]:
df.iloc[:, 1:3]


Unnamed: 0,B,C
2013-01-01,-0.723114,-0.96995
2013-01-02,-0.697012,1.701872
2013-01-03,0.544805,-1.23135
2013-01-04,0.232228,0.477219
2013-01-05,-0.944562,-0.328631
2013-01-06,0.489903,0.687375


In [139]:
df.iloc[1, 1]


-0.6970124020925197

In [140]:
df.iat[1, 1]


-0.6970124020925197

In [141]:
# 조건을 사용하여 선택하기
print(df[df.A > 0], '\n')
print(df)


                   A         B         C         D
2013-01-02  2.279505 -0.697012  1.701872 -1.116281
2013-01-03  1.126284  0.544805 -1.231350  0.717632
2013-01-04  0.405992  0.232228  0.477219 -0.187534
2013-01-05  0.060716 -0.944562 -0.328631 -0.013425 

                   A         B         C         D
2013-01-01 -1.906958 -0.723114 -0.969950  1.134474
2013-01-02  2.279505 -0.697012  1.701872 -1.116281
2013-01-03  1.126284  0.544805 -1.231350  0.717632
2013-01-04  0.405992  0.232228  0.477219 -0.187534
2013-01-05  0.060716 -0.944562 -0.328631 -0.013425
2013-01-06 -0.144996  0.489903  0.687375 -0.127812


In [142]:
df[df > 0]


Unnamed: 0,A,B,C,D
2013-01-01,,,,1.134474
2013-01-02,2.279505,,1.701872,
2013-01-03,1.126284,0.544805,,0.717632
2013-01-04,0.405992,0.232228,0.477219,
2013-01-05,0.060716,,,
2013-01-06,,0.489903,0.687375,


In [143]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']


In [144]:
df2


Unnamed: 0,A,B,C,D,E
2013-01-01,-1.906958,-0.723114,-0.96995,1.134474,one
2013-01-02,2.279505,-0.697012,1.701872,-1.116281,one
2013-01-03,1.126284,0.544805,-1.23135,0.717632,two
2013-01-04,0.405992,0.232228,0.477219,-0.187534,three
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425,four
2013-01-06,-0.144996,0.489903,0.687375,-0.127812,three


In [145]:
df[df2['E'].isin(['two', 'four'])]


Unnamed: 0,A,B,C,D
2013-01-03,1.126284,0.544805,-1.23135,0.717632
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425


### 데이터 변경하기

In [146]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))


In [147]:
s1


2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [148]:
df['F'] = s1
df


Unnamed: 0,A,B,C,D,F
2013-01-01,-1.906958,-0.723114,-0.96995,1.134474,
2013-01-02,2.279505,-0.697012,1.701872,-1.116281,1.0
2013-01-03,1.126284,0.544805,-1.23135,0.717632,2.0
2013-01-04,0.405992,0.232228,0.477219,-0.187534,3.0
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425,4.0
2013-01-06,-0.144996,0.489903,0.687375,-0.127812,5.0


In [149]:
df.at[dates[0], 'A'] = 0


In [150]:
df.iat[0, 1] = 0


In [151]:
df.iat[0, 1] = 0


In [152]:
df


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.96995,1.134474,
2013-01-02,2.279505,-0.697012,1.701872,-1.116281,1.0
2013-01-03,1.126284,0.544805,-1.23135,0.717632,2.0
2013-01-04,0.405992,0.232228,0.477219,-0.187534,3.0
2013-01-05,0.060716,-0.944562,-0.328631,-0.013425,4.0
2013-01-06,-0.144996,0.489903,0.687375,-0.127812,5.0


In [153]:
df.loc[:, 'D'] = np.array([5] * len(df))


In [154]:
df


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.96995,5,
2013-01-02,2.279505,-0.697012,1.701872,5,1.0
2013-01-03,1.126284,0.544805,-1.23135,5,2.0
2013-01-04,0.405992,0.232228,0.477219,5,3.0
2013-01-05,0.060716,-0.944562,-0.328631,5,4.0
2013-01-06,-0.144996,0.489903,0.687375,5,5.0


In [155]:
df2 = df.copy()

df2[df2 > 0] = -df2


In [156]:
df2


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.96995,-5,
2013-01-02,-2.279505,-0.697012,-1.701872,-5,-1.0
2013-01-03,-1.126284,-0.544805,-1.23135,-5,-2.0
2013-01-04,-0.405992,-0.232228,-0.477219,-5,-3.0
2013-01-05,-0.060716,-0.944562,-0.328631,-5,-4.0
2013-01-06,-0.144996,-0.489903,-0.687375,-5,-5.0


### 4. 결측치

In [157]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0] : dates[1], 'E'] = 1
df1


Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.96995,5,,1.0
2013-01-02,2.279505,-0.697012,1.701872,5,1.0,1.0
2013-01-03,1.126284,0.544805,-1.23135,5,2.0,
2013-01-04,0.405992,0.232228,0.477219,5,3.0,


In [158]:
df1.dropna(how='any')


Unnamed: 0,A,B,C,D,F,E
2013-01-02,2.279505,-0.697012,1.701872,5,1.0,1.0


In [159]:
df1.fillna(value=5)


Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.96995,5,5.0,1.0
2013-01-02,2.279505,-0.697012,1.701872,5,1.0,1.0
2013-01-03,1.126284,0.544805,-1.23135,5,2.0,5.0
2013-01-04,0.405992,0.232228,0.477219,5,3.0,5.0


In [160]:
pd.isna(df1)


Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### 5. 연산 (Operations)

#### 통계적 지표들 (Stats)

In [161]:
df.mean()


A    0.621250
B   -0.062440
C    0.056089
D    5.000000
F    3.000000
dtype: float64

In [162]:
df.mean(1)


2013-01-01    1.007513
2013-01-02    1.856873
2013-01-03    1.487948
2013-01-04    1.823088
2013-01-05    1.557505
2013-01-06    2.206456
Freq: D, dtype: float64

In [163]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s


2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

#### 히스토그램 구하기 (Histogramming)

In [164]:
s = pd.Series(np.random.randint(0, 7, size=10))
s


0    4
1    3
2    2
3    1
4    4
5    2
6    4
7    2
8    6
9    4
dtype: int32

In [165]:
s.value_counts()


4    4
2    3
3    1
1    1
6    1
dtype: int64

#### 문자열 관련 메소드들 (String methods)

In [166]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str)
print(s.str.lower())


<pandas.core.strings.accessor.StringMethods object at 0x0000016C4232B430>
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object


### 6. 데이터 합치기 (Merge)

In [167]:
df = pd.DataFrame(np.random.randn(10, 4))
df


Unnamed: 0,0,1,2,3
0,0.724558,-1.114779,0.242927,-0.727041
1,-0.37073,0.316864,0.344352,-0.80446
2,1.269911,0.467031,1.813737,0.102701
3,-1.321904,-0.230324,-0.27484,0.14369
4,1.425006,-0.28499,-1.502049,2.526815
5,1.043601,0.259106,-1.971403,0.832804
6,0.553948,0.143673,0.667194,-0.208343
7,0.56106,-0.107693,0.443634,0.516496
8,-0.759906,-1.670684,-2.260646,-0.577327
9,-0.580848,1.992063,-0.662478,0.033656


In [168]:
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)


Unnamed: 0,0,1,2,3
0,0.724558,-1.114779,0.242927,-0.727041
1,-0.37073,0.316864,0.344352,-0.80446
2,1.269911,0.467031,1.813737,0.102701
3,-1.321904,-0.230324,-0.27484,0.14369
4,1.425006,-0.28499,-1.502049,2.526815
5,1.043601,0.259106,-1.971403,0.832804
6,0.553948,0.143673,0.667194,-0.208343
7,0.56106,-0.107693,0.443634,0.516496
8,-0.759906,-1.670684,-2.260646,-0.577327
9,-0.580848,1.992063,-0.662478,0.033656


In [169]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
print("==>> left: ", left)
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print("==>> right: ", right)
merged = pd.merge(left, right, on='key')
print("==>> merged: ", merged)



==>> left:     key  lval
0  foo     1
1  foo     2
==>> right:     key  rval
0  foo     4
1  foo     5
==>> merged:     key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5


In [170]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
print("==>> left: ", left)
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print("==>> right: ", right)

merged = pd.merge(left, right, on='key')
print("==>> merged: ", merged)



==>> left:     key  lval
0  foo     1
1  bar     2
==>> right:     key  rval
0  foo     4
1  bar     5
==>> merged:     key  lval  rval
0  foo     1     4
1  bar     2     5


In [171]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
print("==>> df: ", df)


==>> df:            A         B         C         D
0 -0.648228 -1.195253  1.715294  0.506749
1  0.219319  0.705364  0.153599  0.177535
2 -0.729061  1.519603  1.656057 -0.076663
3 -0.193819 -0.703292 -0.568850 -1.939374
4 -0.402652 -0.079004  0.476056 -1.707474
5  1.228183 -0.369170  1.341627  1.059213
6 -1.524865  0.105088  0.228375 -1.110861
7 -1.034970  2.261357  0.504570  1.194774


In [172]:
s = df.iloc[3]
print(df.append(s, ignore_index=True))


          A         B         C         D
0 -0.648228 -1.195253  1.715294  0.506749
1  0.219319  0.705364  0.153599  0.177535
2 -0.729061  1.519603  1.656057 -0.076663
3 -0.193819 -0.703292 -0.568850 -1.939374
4 -0.402652 -0.079004  0.476056 -1.707474
5  1.228183 -0.369170  1.341627  1.059213
6 -1.524865  0.105088  0.228375 -1.110861
7 -1.034970  2.261357  0.504570  1.194774
8 -0.193819 -0.703292 -0.568850 -1.939374


  print(df.append(s, ignore_index=True))


### 7. 묶기 (Grouping)

In [173]:
df = pd.DataFrame(
    {
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)
df


Unnamed: 0,A,B,C,D
0,foo,one,-0.199816,0.779842
1,bar,one,-0.143248,1.959185
2,foo,two,1.580175,-0.072732
3,bar,three,-1.171849,-0.300586
4,foo,two,0.070494,0.239233
5,bar,two,2.884979,0.082291
6,foo,one,-0.0049,-1.083739
7,foo,three,2.290871,0.383976


In [174]:
df.groupby('A').sum()


Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.569882,1.740891
foo,3.736824,0.246579


In [175]:
df.groupby('A').mean()


Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.523294,0.580297
foo,0.747365,0.049316


In [176]:
df.groupby(['A', 'B']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.143248,1.959185
bar,three,-1.171849,-0.300586
bar,two,2.884979,0.082291
foo,one,-0.204716,-0.303898
foo,three,2.290871,0.383976
foo,two,1.650669,0.166501


In [177]:
tuples = list(
    zip(
        *[
            ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
            ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
        ]
    )
)
tuples


[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [178]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
# df = pd.DataFrame()



MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [179]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.197393,1.727133
bar,two,-1.066066,0.739803
baz,one,-0.643634,-0.664397
baz,two,-0.22407,-3.294098
foo,one,0.233283,1.126402
foo,two,0.010661,-0.212347
qux,one,0.817762,0.603818
qux,two,0.881503,-0.415921


In [180]:
df2 = df[:4]
df2


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.197393,1.727133
bar,two,-1.066066,0.739803
baz,one,-0.643634,-0.664397
baz,two,-0.22407,-3.294098


In [181]:
stacked = df2.stack()
stacked


first  second   
bar    one     A    1.197393
               B    1.727133
       two     A   -1.066066
               B    0.739803
baz    one     A   -0.643634
               B   -0.664397
       two     A   -0.224070
               B   -3.294098
dtype: float64

In [182]:
stacked.unstack()


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.197393,1.727133
bar,two,-1.066066,0.739803
baz,one,-0.643634,-0.664397
baz,two,-0.22407,-3.294098


In [183]:
stacked.unstack(0)


Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,1.197393,-0.643634
one,B,1.727133,-0.664397
two,A,-1.066066,-0.22407
two,B,0.739803,-3.294098


In [184]:
stacked.unstack(1)


Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,1.197393,-1.066066
bar,B,1.727133,0.739803
baz,A,-0.643634,-0.22407
baz,B,-0.664397,-3.294098


In [185]:
df = pd.DataFrame(
    {
        'A': ['one', 'one', 'two', 'three'] * 3,
        'B': ['A', 'B', 'C'] * 4,
        'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
        'D': np.random.randn(12),
        'E': np.random.randn(12),
    }
)
df


Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.679443,0.302792
1,one,B,foo,0.764042,-0.542827
2,two,C,foo,0.897647,-0.608202
3,three,A,bar,0.425432,-1.351913
4,one,B,bar,-0.458687,0.527873
5,one,C,bar,-1.829993,-0.191744
6,two,A,foo,-2.047694,-0.232949
7,three,B,foo,0.261047,0.107347
8,one,C,foo,-0.642159,1.35781
9,one,A,bar,1.427632,-0.079193


In [186]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])


Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,1.427632,0.679443
one,B,-0.458687,0.764042
one,C,-1.829993,-0.642159
three,A,0.425432,
three,B,,0.261047
three,C,0.711927,
two,A,,-2.047694
two,B,0.015311,
two,C,,0.897647


In [197]:
rng = pd.date_range('1/1/2012', periods=3000, freq='3Min')
rng


DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:03:00',
               '2012-01-01 00:06:00', '2012-01-01 00:09:00',
               '2012-01-01 00:12:00', '2012-01-01 00:15:00',
               '2012-01-01 00:18:00', '2012-01-01 00:21:00',
               '2012-01-01 00:24:00', '2012-01-01 00:27:00',
               ...
               '2012-01-07 05:30:00', '2012-01-07 05:33:00',
               '2012-01-07 05:36:00', '2012-01-07 05:39:00',
               '2012-01-07 05:42:00', '2012-01-07 05:45:00',
               '2012-01-07 05:48:00', '2012-01-07 05:51:00',
               '2012-01-07 05:54:00', '2012-01-07 05:57:00'],
              dtype='datetime64[ns]', length=3000, freq='3T')

In [198]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts


2012-01-01 00:00:00    330
2012-01-01 00:03:00    446
2012-01-01 00:06:00    328
2012-01-01 00:09:00    194
2012-01-01 00:12:00    214
                      ... 
2012-01-07 05:45:00      3
2012-01-07 05:48:00    380
2012-01-07 05:51:00    442
2012-01-07 05:54:00    276
2012-01-07 05:57:00    312
Freq: 3T, Length: 3000, dtype: int32

In [200]:
ts.resample('5s').sum()


2012-01-01 00:00:00    330
2012-01-01 00:00:05      0
2012-01-01 00:00:10      0
2012-01-01 00:00:15      0
2012-01-01 00:00:20      0
                      ... 
2012-01-07 05:56:40      0
2012-01-07 05:56:45      0
2012-01-07 05:56:50      0
2012-01-07 05:56:55      0
2012-01-07 05:57:00    312
Freq: 5S, Length: 107965, dtype: int32

In [202]:
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [203]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts

2012-03-06    0.472473
2012-03-07   -0.020105
2012-03-08    1.073533
2012-03-09    0.031576
2012-03-10    0.607293
Freq: D, dtype: float64

In [204]:
ts_utc = ts.tz_localize('UTC')
ts_utc

2012-03-06 00:00:00+00:00    0.472473
2012-03-07 00:00:00+00:00   -0.020105
2012-03-08 00:00:00+00:00    1.073533
2012-03-09 00:00:00+00:00    0.031576
2012-03-10 00:00:00+00:00    0.607293
Freq: D, dtype: float64

In [205]:
ts_utc.tz_convert('US/Eastern')

2012-03-05 19:00:00-05:00    0.472473
2012-03-06 19:00:00-05:00   -0.020105
2012-03-07 19:00:00-05:00    1.073533
2012-03-08 19:00:00-05:00    0.031576
2012-03-09 19:00:00-05:00    0.607293
Freq: D, dtype: float64

In [207]:
ts_utc.tz_convert('Asia/Seoul')

2012-03-06 09:00:00+09:00    0.472473
2012-03-07 09:00:00+09:00   -0.020105
2012-03-08 09:00:00+09:00    1.073533
2012-03-09 09:00:00+09:00    0.031576
2012-03-10 09:00:00+09:00    0.607293
Freq: D, dtype: float64

In [208]:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2012-01-31   -2.242412
2012-02-29   -0.477666
2012-03-31    0.095573
2012-04-30    1.047991
2012-05-31    0.005671
Freq: M, dtype: float64

In [210]:
ps = ts.to_period()
ps

2012-01   -2.242412
2012-02   -0.477666
2012-03    0.095573
2012-04    1.047991
2012-05    0.005671
Freq: M, dtype: float64

In [211]:
ps.to_timestamp()

2012-01-01   -2.242412
2012-02-01   -0.477666
2012-03-01    0.095573
2012-04-01    1.047991
2012-05-01    0.005671
Freq: MS, dtype: float64

In [212]:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)

ts

1990Q1    2.111204
1990Q2    0.333668
1990Q3   -1.610682
1990Q4    0.322020
1991Q1   -0.479109
1991Q2    1.271205
1991Q3    1.705348
1991Q4    0.233936
1992Q1    1.575578
1992Q2   -0.115850
1992Q3    1.208315
1992Q4    0.649365
1993Q1   -1.851510
1993Q2    1.424925
1993Q3    1.530641
1993Q4   -0.663783
1994Q1    0.161020
1994Q2    0.077564
1994Q3    1.368833
1994Q4    0.366306
1995Q1   -0.736995
1995Q2   -0.501622
1995Q3    1.765845
1995Q4    0.840456
1996Q1   -1.050143
1996Q2   -0.300163
1996Q3   -0.339951
1996Q4    1.145300
1997Q1   -0.991000
1997Q2    0.724223
1997Q3    1.581702
1997Q4    0.450711
1998Q1    0.023221
1998Q2    1.684005
1998Q3   -1.210415
1998Q4    1.892968
1999Q1   -0.852853
1999Q2   -0.039411
1999Q3   -1.835320
1999Q4   -2.757426
2000Q1   -0.461153
2000Q2   -1.599991
2000Q3   -0.101040
2000Q4   -2.292759
Freq: Q-NOV, dtype: float64

In [215]:
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts

1990-03-01 09:00    2.111204
1990-06-01 09:00    0.333668
1990-09-01 09:00   -1.610682
1990-12-01 09:00    0.322020
1991-03-01 09:00   -0.479109
1991-06-01 09:00    1.271205
1991-09-01 09:00    1.705348
1991-12-01 09:00    0.233936
1992-03-01 09:00    1.575578
1992-06-01 09:00   -0.115850
1992-09-01 09:00    1.208315
1992-12-01 09:00    0.649365
1993-03-01 09:00   -1.851510
1993-06-01 09:00    1.424925
1993-09-01 09:00    1.530641
1993-12-01 09:00   -0.663783
1994-03-01 09:00    0.161020
1994-06-01 09:00    0.077564
1994-09-01 09:00    1.368833
1994-12-01 09:00    0.366306
1995-03-01 09:00   -0.736995
1995-06-01 09:00   -0.501622
1995-09-01 09:00    1.765845
1995-12-01 09:00    0.840456
1996-03-01 09:00   -1.050143
1996-06-01 09:00   -0.300163
1996-09-01 09:00   -0.339951
1996-12-01 09:00    1.145300
1997-03-01 09:00   -0.991000
1997-06-01 09:00    0.724223
1997-09-01 09:00    1.581702
1997-12-01 09:00    0.450711
1998-03-01 09:00    0.023221
1998-06-01 09:00    1.684005
1998-09-01 09:

In [216]:
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   'raw_grade': ['a', 'b', 'b', 'a', 'a', 'e']})

In [220]:
df['grade'] = df['raw_grade'].astype("category")

df['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [221]:
df['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: raw_grade, dtype: object

In [224]:
df['grade'].cat.categories = ['very good', 'good', 'very bad']
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [225]:
df['grade'] = df['grade'].cat.set_categories(['very bad', 'bad', 'medium', 'good', 'very good'])

df['grade']

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

In [226]:
df.sort_values(by='grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [227]:
df.groupby('grade').size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

### 11. 그래프로 표현하기 (Plotting)

### 12. 데이터 입/출력 (Getting Data In/Out)