In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range('20130101', periods=6)
dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df


Unnamed: 0,A,B,C,D
2013-01-01,-0.799248,-0.411216,-1.19398,0.027767
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165
2013-01-04,1.25614,0.067496,1.088985,0.2678
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009
2013-01-06,0.595772,0.05426,-1.391269,0.334984


In [5]:
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)

df2


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
df2.dtypes


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
dir(df2)


['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce_

In [8]:
df.head()


Unnamed: 0,A,B,C,D
2013-01-01,-0.799248,-0.411216,-1.19398,0.027767
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165
2013-01-04,1.25614,0.067496,1.088985,0.2678
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009


In [9]:
df.tail()


Unnamed: 0,A,B,C,D
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165
2013-01-04,1.25614,0.067496,1.088985,0.2678
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009
2013-01-06,0.595772,0.05426,-1.391269,0.334984


In [10]:
df.tail(3)


Unnamed: 0,A,B,C,D
2013-01-04,1.25614,0.067496,1.088985,0.2678
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009
2013-01-06,0.595772,0.05426,-1.391269,0.334984


In [11]:
df.index


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df.columns


Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
df.values


array([[-0.79924819, -0.41121644, -1.19397952,  0.02776736],
       [ 0.09395819, -0.48951404, -0.46842075, -0.93105659],
       [ 0.19076153, -2.06507823, -0.32574281, -0.42416525],
       [ 1.25613951,  0.06749593,  1.08898471,  0.26780047],
       [-0.26964215, -1.06046323, -0.47060691, -1.99300935],
       [ 0.59577203,  0.05426007, -1.39126908,  0.3349841 ]])

In [14]:
df.describe()


Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.177957,-0.650753,-0.460172,-0.452947
std,0.707042,0.80802,0.874477,0.891359
min,-0.799248,-2.065078,-1.391269,-1.993009
25%,-0.178742,-0.917726,-1.013136,-0.804334
50%,0.14236,-0.450365,-0.469514,-0.198199
75%,0.494519,-0.062109,-0.361412,0.207792
max,1.25614,0.067496,1.088985,0.334984


In [15]:
df.T


Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.799248,0.093958,0.190762,1.25614,-0.269642,0.595772
B,-0.411216,-0.489514,-2.065078,0.067496,-1.060463,0.05426
C,-1.19398,-0.468421,-0.325743,1.088985,-0.470607,-1.391269
D,0.027767,-0.931057,-0.424165,0.2678,-1.993009,0.334984


In [16]:
df.sort_index(axis=1, ascending=False)


Unnamed: 0,D,C,B,A
2013-01-01,0.027767,-1.19398,-0.411216,-0.799248
2013-01-02,-0.931057,-0.468421,-0.489514,0.093958
2013-01-03,-0.424165,-0.325743,-2.065078,0.190762
2013-01-04,0.2678,1.088985,0.067496,1.25614
2013-01-05,-1.993009,-0.470607,-1.060463,-0.269642
2013-01-06,0.334984,-1.391269,0.05426,0.595772


In [17]:
df.sort_index(axis=1)


Unnamed: 0,A,B,C,D
2013-01-01,-0.799248,-0.411216,-1.19398,0.027767
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165
2013-01-04,1.25614,0.067496,1.088985,0.2678
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009
2013-01-06,0.595772,0.05426,-1.391269,0.334984


In [18]:
df.sort_values(by='B', ascending=False)


Unnamed: 0,A,B,C,D
2013-01-04,1.25614,0.067496,1.088985,0.2678
2013-01-06,0.595772,0.05426,-1.391269,0.334984
2013-01-01,-0.799248,-0.411216,-1.19398,0.027767
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165


In [19]:
# 3. 데이터 선택하기 (Selection)
df['A']


2013-01-01   -0.799248
2013-01-02    0.093958
2013-01-03    0.190762
2013-01-04    1.256140
2013-01-05   -0.269642
2013-01-06    0.595772
Freq: D, Name: A, dtype: float64

In [20]:
df.A


2013-01-01   -0.799248
2013-01-02    0.093958
2013-01-03    0.190762
2013-01-04    1.256140
2013-01-05   -0.269642
2013-01-06    0.595772
Freq: D, Name: A, dtype: float64

In [21]:
type(df['A'])


pandas.core.series.Series

In [22]:
df[0:3]


Unnamed: 0,A,B,C,D
2013-01-01,-0.799248,-0.411216,-1.19398,0.027767
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165


In [23]:
df['20130102':'20130104']


Unnamed: 0,A,B,C,D
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165
2013-01-04,1.25614,0.067496,1.088985,0.2678


In [24]:
df.loc[dates[0]]


A   -0.799248
B   -0.411216
C   -1.193980
D    0.027767
Name: 2013-01-01 00:00:00, dtype: float64

In [25]:
df.loc[dates[1]]


A    0.093958
B   -0.489514
C   -0.468421
D   -0.931057
Name: 2013-01-02 00:00:00, dtype: float64

In [26]:
df.loc['20130101']
df.loc['2013-01-01']


A   -0.799248
B   -0.411216
C   -1.193980
D    0.027767
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
df.loc[:, ['A', 'B']]



Unnamed: 0,A,B
2013-01-01,-0.799248,-0.411216
2013-01-02,0.093958,-0.489514
2013-01-03,0.190762,-2.065078
2013-01-04,1.25614,0.067496
2013-01-05,-0.269642,-1.060463
2013-01-06,0.595772,0.05426


In [28]:
df.loc['20130102':'20130104', ['A', 'B']]


Unnamed: 0,A,B
2013-01-02,0.093958,-0.489514
2013-01-03,0.190762,-2.065078
2013-01-04,1.25614,0.067496


In [29]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.093958,-0.489514
2013-01-03,0.190762,-2.065078
2013-01-04,1.25614,0.067496


In [30]:
df.loc[dates[0], ['A', 'B']]

A   -0.799248
B   -0.411216
Name: 2013-01-01 00:00:00, dtype: float64

In [31]:
df.loc[dates[0], 'A']

-0.7992481903149864

In [32]:
df.at[dates[0], 'A']

-0.7992481903149864

In [33]:
df.iloc[3]

A    1.256140
B    0.067496
C    1.088985
D    0.267800
Name: 2013-01-04 00:00:00, dtype: float64

In [34]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.25614,0.067496
2013-01-05,-0.269642,-1.060463


In [35]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.093958,-0.468421
2013-01-03,0.190762,-0.325743
2013-01-05,-0.269642,-0.470607


In [36]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165


In [37]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.411216,-1.19398
2013-01-02,-0.489514,-0.468421
2013-01-03,-2.065078,-0.325743
2013-01-04,0.067496,1.088985
2013-01-05,-1.060463,-0.470607
2013-01-06,0.05426,-1.391269


In [38]:
df.iloc[1, 1]

-0.489514041827999

In [39]:
df.iat[1, 1]

-0.489514041827999

In [40]:
# 조건을 사용하여 선택하기
print(df[df.A > 0], '\n')
print(df)

                   A         B         C         D
2013-01-02  0.093958 -0.489514 -0.468421 -0.931057
2013-01-03  0.190762 -2.065078 -0.325743 -0.424165
2013-01-04  1.256140  0.067496  1.088985  0.267800
2013-01-06  0.595772  0.054260 -1.391269  0.334984 

                   A         B         C         D
2013-01-01 -0.799248 -0.411216 -1.193980  0.027767
2013-01-02  0.093958 -0.489514 -0.468421 -0.931057
2013-01-03  0.190762 -2.065078 -0.325743 -0.424165
2013-01-04  1.256140  0.067496  1.088985  0.267800
2013-01-05 -0.269642 -1.060463 -0.470607 -1.993009
2013-01-06  0.595772  0.054260 -1.391269  0.334984


In [41]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,0.027767
2013-01-02,0.093958,,,
2013-01-03,0.190762,,,
2013-01-04,1.25614,0.067496,1.088985,0.2678
2013-01-05,,,,
2013-01-06,0.595772,0.05426,,0.334984


In [42]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [43]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.799248,-0.411216,-1.19398,0.027767,one
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057,one
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165,two
2013-01-04,1.25614,0.067496,1.088985,0.2678,three
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009,four
2013-01-06,0.595772,0.05426,-1.391269,0.334984,three


In [44]:
df[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009


### 데이터 변경하기

In [45]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))

In [46]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [47]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.799248,-0.411216,-1.19398,0.027767,
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057,1.0
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165,2.0
2013-01-04,1.25614,0.067496,1.088985,0.2678,3.0
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009,4.0
2013-01-06,0.595772,0.05426,-1.391269,0.334984,5.0


In [48]:
df.at[dates[0], 'A'] = 0

In [49]:
df.iat[0, 1] = 0

In [50]:
df.iat[0, 1] = 0

In [51]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.19398,0.027767,
2013-01-02,0.093958,-0.489514,-0.468421,-0.931057,1.0
2013-01-03,0.190762,-2.065078,-0.325743,-0.424165,2.0
2013-01-04,1.25614,0.067496,1.088985,0.2678,3.0
2013-01-05,-0.269642,-1.060463,-0.470607,-1.993009,4.0
2013-01-06,0.595772,0.05426,-1.391269,0.334984,5.0


In [52]:
df.loc[:, 'D'] = np.array([5] * len(df))

In [53]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.19398,5,
2013-01-02,0.093958,-0.489514,-0.468421,5,1.0
2013-01-03,0.190762,-2.065078,-0.325743,5,2.0
2013-01-04,1.25614,0.067496,1.088985,5,3.0
2013-01-05,-0.269642,-1.060463,-0.470607,5,4.0
2013-01-06,0.595772,0.05426,-1.391269,5,5.0


In [54]:
df2 = df.copy()

df2[df2 > 0] = -df2

In [55]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.19398,-5,
2013-01-02,-0.093958,-0.489514,-0.468421,-5,-1.0
2013-01-03,-0.190762,-2.065078,-0.325743,-5,-2.0
2013-01-04,-1.25614,-0.067496,-1.088985,-5,-3.0
2013-01-05,-0.269642,-1.060463,-0.470607,-5,-4.0
2013-01-06,-0.595772,-0.05426,-1.391269,-5,-5.0


### 4. 결측치

In [59]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-1.19398,5,,1.0
2013-01-02,0.093958,-0.489514,-0.468421,5,1.0,1.0
2013-01-03,0.190762,-2.065078,-0.325743,5,2.0,
2013-01-04,1.25614,0.067496,1.088985,5,3.0,


In [61]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.093958,-0.489514,-0.468421,5,1.0,1.0


In [62]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-1.19398,5,5.0,1.0
2013-01-02,0.093958,-0.489514,-0.468421,5,1.0,1.0
2013-01-03,0.190762,-2.065078,-0.325743,5,2.0,5.0
2013-01-04,1.25614,0.067496,1.088985,5,3.0,5.0


In [63]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### 5. 연산 (Operations)

#### 통계적 지표들 (Stats)

In [64]:
df.mean()

A    0.311165
B   -0.582217
C   -0.460172
D    5.000000
F    3.000000
dtype: float64

In [65]:
df.mean(1)

2013-01-01    0.951505
2013-01-02    1.027205
2013-01-03    0.959988
2013-01-04    2.082524
2013-01-05    1.439858
2013-01-06    1.851753
Freq: D, dtype: float64

In [72]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64