In [1]:
import numpy as np
import pandas as pd

### pandas의 버전 확인
참고: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

In [2]:
pd.__version__

'0.24.1'

## 튜토리얼

In [3]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

## date_range를 사용하여 날짜 생성

In [4]:
dates = pd.date_range('20180304', periods=20)
dates

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 임의의 배열(array)를생성

In [5]:
np.random.randn(6,4)

array([[-0.08969354,  0.95464256,  1.09938295,  1.65288768],
       [ 1.502855  ,  1.83702481, -1.14558312,  0.77058645],
       [ 0.29768374, -1.93340621,  0.18459824,  1.11554366],
       [-0.80437345, -1.22392399, -0.46431223,  0.53637923],
       [-0.35298449,  0.23601645,  0.04723933, -0.7361121 ],
       [-0.34738489,  0.34425056, -0.07188937,  0.09444385]])

### 임의로 생성한 배열을 데이터 프레임으로 생성

In [6]:
pd.DataFrame(np.random.randn(6, 4), index=['인덱스0', '인덱스1', '인덱스2','인덱스3', '인덱스4', '인덱스5'])

Unnamed: 0,0,1,2,3
인덱스0,-0.301929,0.543898,0.941315,-0.773378
인덱스1,-1.325607,-0.633871,-0.031207,0.620107
인덱스2,-0.609513,-0.236568,-0.723987,0.190791
인덱스3,-1.544463,0.536337,0.67246,-0.6168
인덱스4,1.082916,-0.860039,0.553683,1.37589
인덱스5,0.426813,-0.248675,-0.650023,0.498465


## 만약 위에서 생성한 dates를 index로 넣게 된다면?
행이 맞지 않기 때문에 valueError: Shoape of passed values is (4, 6) indices imply (4, 20)오류가 발생

### 꼭 values의 길이를 일치해야 에러없이 데이터 테이블이 생성된다
#### 다른 사람들도 보기 편하도록 데이터 테이블을 만들면 아래와 같이 shape, head를 넣어주자

In [7]:
df = pd.DataFrame(np.random.randn(20, 4), index=dates, columns=['A','B','C','D'])
print(df.shape)
df.head()

(20, 4)


Unnamed: 0,A,B,C,D
2018-03-04,0.615132,-0.024019,0.040619,-0.078311
2018-03-05,0.76146,-0.834782,0.685667,0.340592
2018-03-06,-0.893394,-1.164413,-0.192959,-0.088627
2018-03-07,1.149938,0.104214,-0.8408,0.540714
2018-03-08,0.810439,0.120844,0.745565,-0.38789


### 직렬로 변환할 수 있는 object를 전달하여 DataFrame 생성하기

In [8]:
df2 = pd.DataFrame({
    'A':1.,
    'B':pd.Timestamp('20130102'),
    'C':pd.Series(1, index=list(range(4)),dtype='float32'),
    'D':np.array([3] * 4, dtype='int32'),
    'E':pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'
})
print(df2.shape)
df2

(4, 6)


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## EB2: Viewing Data
> Data의 값을 찾고 해당 데이터의 변경을 해본다

### index 값 찾기

In [10]:
df.index

DatetimeIndex(['2018-03-04', '2018-03-05', '2018-03-06', '2018-03-07',
               '2018-03-08', '2018-03-09', '2018-03-10', '2018-03-11',
               '2018-03-12', '2018-03-13', '2018-03-14', '2018-03-15',
               '2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21', '2018-03-22', '2018-03-23'],
              dtype='datetime64[ns]', freq='D')

### 데이터의 columns의 값 찾기/변경하기

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### DataFrame의 요약 정보를 알아보기

In [12]:
df.describe()

Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,0.270519,-0.007646,-0.031914,-0.198732
std,0.883725,0.846515,0.829337,0.728843
min,-1.439195,-1.591521,-1.954977,-1.353141
25%,-0.370402,-0.630605,-0.440869,-0.680069
50%,0.409147,0.023231,0.055756,-0.306167
75%,0.773705,0.474484,0.550104,0.366193
max,2.532447,1.835723,1.258518,1.729566


### index와 column을 바꿔주는 .T

In [13]:
df.T

Unnamed: 0,2018-03-04 00:00:00,2018-03-05 00:00:00,2018-03-06 00:00:00,2018-03-07 00:00:00,2018-03-08 00:00:00,2018-03-09 00:00:00,2018-03-10 00:00:00,2018-03-11 00:00:00,2018-03-12 00:00:00,2018-03-13 00:00:00,2018-03-14 00:00:00,2018-03-15 00:00:00,2018-03-16 00:00:00,2018-03-17 00:00:00,2018-03-18 00:00:00,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,2018-03-22 00:00:00,2018-03-23 00:00:00
A,0.615132,0.76146,-0.893394,1.149938,0.810439,0.116675,-0.526721,-1.439195,-0.367912,-0.377872,-0.253922,0.826759,0.749376,0.391902,0.558694,0.426393,1.085532,2.532447,-0.163207,-0.592138
B,-0.024019,-0.834782,-1.164413,0.104214,0.120844,-1.003177,0.548303,0.845312,-0.114117,0.449878,-1.591521,0.960912,0.067351,-0.141683,-0.02089,0.305568,-0.976237,1.042368,1.835723,-0.562545
C,0.040619,0.685667,-0.192959,-0.8408,0.745565,-0.261267,0.34095,-1.446329,-0.069174,1.122702,-0.367122,0.53114,-0.662109,1.258518,0.606994,0.244966,0.070894,-0.938509,-1.954977,0.446947
D,-0.078311,0.340592,-0.088627,0.540714,-0.38789,-1.353141,0.442995,-0.106072,-0.521754,-0.435641,-0.64916,-1.112076,1.729566,0.467958,-0.772798,-0.959256,-0.224445,0.603819,-0.786545,-0.624565


### 원하는 index기준으로 다시 정렬하기

In [14]:
df.sort_index(ascending=False).head()

Unnamed: 0,A,B,C,D
2018-03-23,-0.592138,-0.562545,0.446947,-0.624565
2018-03-22,-0.163207,1.835723,-1.954977,-0.786545
2018-03-21,2.532447,1.042368,-0.938509,0.603819
2018-03-20,1.085532,-0.976237,0.070894,-0.224445
2018-03-19,0.426393,0.305568,0.244966,-0.959256


### 특정 column을 기준으로 정렬하기
df.sort_values('<기준이 되는 column>')

In [15]:
df.sort_values('C').head()

Unnamed: 0,A,B,C,D
2018-03-22,-0.163207,1.835723,-1.954977,-0.786545
2018-03-11,-1.439195,0.845312,-1.446329,-0.106072
2018-03-21,2.532447,1.042368,-0.938509,0.603819
2018-03-07,1.149938,0.104214,-0.8408,0.540714
2018-03-16,0.749376,0.067351,-0.662109,1.729566


## EP3: Selection
특정 Column을 선택해서 데이터를 확인
2가지 방법은 모두 동일한 결과값을 가지지만 Column명이 "D-D"와 같은 경우에는 마이너스(-)연산자로 인식하여(데이터-데이터)로 계산을 한다

In [16]:
df['D'].head(3)

2018-03-04   -0.078311
2018-03-05    0.340592
2018-03-06   -0.088627
Freq: D, Name: D, dtype: float64

In [17]:
df.D.head(3)

2018-03-04   -0.078311
2018-03-05    0.340592
2018-03-06   -0.088627
Freq: D, Name: D, dtype: float64

### 슬라이싱하기
#### DataFrame에서 0이상 -3미만 값을 출력하기

In [18]:
df[0:3]

Unnamed: 0,A,B,C,D
2018-03-04,0.615132,-0.024019,0.040619,-0.078311
2018-03-05,0.76146,-0.834782,0.685667,0.340592
2018-03-06,-0.893394,-1.164413,-0.192959,-0.088627


### DataFrame에서 A이상 F이하의 값을 출력하기

In [19]:
df['2018-03-04': '2018-03-06']

Unnamed: 0,A,B,C,D
2018-03-04,0.615132,-0.024019,0.040619,-0.078311
2018-03-05,0.76146,-0.834782,0.685667,0.340592
2018-03-06,-0.893394,-1.164413,-0.192959,-0.088627


### 위에서 선언한 변수에 0번째 index들의 값 출력하기

In [20]:
df.loc[:, ['A', 'D']].head() # 특정 index의 값을 찾기

Unnamed: 0,A,D
2018-03-04,0.615132,-0.078311
2018-03-05,0.76146,0.340592
2018-03-06,-0.893394,-0.088627
2018-03-07,1.149938,0.540714
2018-03-08,0.810439,-0.38789


In [21]:
df.loc['20180311': '20180316', ['A', 'C']]

Unnamed: 0,A,C
2018-03-11,-1.439195,-1.446329
2018-03-12,-0.367912,-0.069174
2018-03-13,-0.377872,1.122702
2018-03-14,-0.253922,-0.367122
2018-03-15,0.826759,0.53114
2018-03-16,0.749376,-0.662109


In [22]:
# 1개의 행만 지정했기 때문에 Series로 출력
df.loc['20180311', ['A', 'C']]

A   -1.439195
C   -1.446329
Name: 2018-03-11 00:00:00, dtype: float64

In [23]:
# 행이던 열이던 한줄의 데이터 범위를 선택하면 Series이다.
df.loc['20180311': '20180319', 'A']

2018-03-11   -1.439195
2018-03-12   -0.367912
2018-03-13   -0.377872
2018-03-14   -0.253922
2018-03-15    0.826759
2018-03-16    0.749376
2018-03-17    0.391902
2018-03-18    0.558694
2018-03-19    0.426393
Freq: D, Name: A, dtype: float64

### 세번째 행을 Selection하기

In [24]:
df.head()

Unnamed: 0,A,B,C,D
2018-03-04,0.615132,-0.024019,0.040619,-0.078311
2018-03-05,0.76146,-0.834782,0.685667,0.340592
2018-03-06,-0.893394,-1.164413,-0.192959,-0.088627
2018-03-07,1.149938,0.104214,-0.8408,0.540714
2018-03-08,0.810439,0.120844,0.745565,-0.38789


In [25]:
df.iloc[3] # index 3번째의 행 값을 출력

A    1.149938
B    0.104214
C   -0.840800
D    0.540714
Name: 2018-03-07 00:00:00, dtype: float64

In [26]:
df.iloc[3:5, 0:2] # 3월 7일부터 3월8일 // AA, BB 인덱스의 값 출력

Unnamed: 0,A,B
2018-03-07,1.149938,0.104214
2018-03-08,0.810439,0.120844


In [27]:
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2018-03-05,0.76146,0.685667
2018-03-06,-0.893394,-0.192959
2018-03-08,0.810439,0.745565


### 명시적으로 모든 행을 출력한다고 해줄 때는 범위가 없더라도 ( : )를 넣어준다

In [28]:
df.iloc[[1,3], :]

Unnamed: 0,A,B,C,D
2018-03-05,0.76146,-0.834782,0.685667,0.340592
2018-03-07,1.149938,0.104214,-0.8408,0.540714


### Boolean Indexing
DataFrame에서 0이상의 값을 찾아보기

In [29]:
df['A'] > 0

2018-03-04     True
2018-03-05     True
2018-03-06    False
2018-03-07     True
2018-03-08     True
2018-03-09     True
2018-03-10    False
2018-03-11    False
2018-03-12    False
2018-03-13    False
2018-03-14    False
2018-03-15     True
2018-03-16     True
2018-03-17     True
2018-03-18     True
2018-03-19     True
2018-03-20     True
2018-03-21     True
2018-03-22    False
2018-03-23    False
Freq: D, Name: A, dtype: bool

### AA인덱스에서 True값만 가져오기

In [30]:
mask = df['A'] > 0
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,0.615132,-0.024019,0.040619,-0.078311
2018-03-05,0.76146,-0.834782,0.685667,0.340592
2018-03-07,1.149938,0.104214,-0.8408,0.540714
2018-03-08,0.810439,0.120844,0.745565,-0.38789
2018-03-09,0.116675,-1.003177,-0.261267,-1.353141
2018-03-15,0.826759,0.960912,0.53114,-1.112076
2018-03-16,0.749376,0.067351,-0.662109,1.729566
2018-03-17,0.391902,-0.141683,1.258518,0.467958
2018-03-18,0.558694,-0.02089,0.606994,-0.772798
2018-03-19,0.426393,0.305568,0.244966,-0.959256


### df2로 복사하여 새로운 튜토리얼을 진행

In [31]:
df2 = df.copy()

In [32]:
df2['E'] = [
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
    'one', 'two', 'three', 'four', 'five',
]
df2.head()

Unnamed: 0,A,B,C,D,E
2018-03-04,0.615132,-0.024019,0.040619,-0.078311,one
2018-03-05,0.76146,-0.834782,0.685667,0.340592,two
2018-03-06,-0.893394,-1.164413,-0.192959,-0.088627,three
2018-03-07,1.149938,0.104214,-0.8408,0.540714,four
2018-03-08,0.810439,0.120844,0.745565,-0.38789,five


### 'one', 'two'에 해당하는 값만 가져오기

In [33]:
mask = df2.E.isin(['one', 'two'])
df[mask]

Unnamed: 0,A,B,C,D
2018-03-04,0.615132,-0.024019,0.040619,-0.078311
2018-03-05,0.76146,-0.834782,0.685667,0.340592
2018-03-09,0.116675,-1.003177,-0.261267,-1.353141
2018-03-10,-0.526721,0.548303,0.34095,0.442995
2018-03-14,-0.253922,-1.591521,-0.367122,-0.64916
2018-03-15,0.826759,0.960912,0.53114,-1.112076
2018-03-19,0.426393,0.305568,0.244966,-0.959256
2018-03-20,1.085532,-0.976237,0.070894,-0.224445
