# [10 Minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html)

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

## Object Creation (객체 생성)
- 참고 : [Intro to Data Structures](https://pandas.pydata.org/pandas-docs/stable/dsintro.html)

### [Series](https://pandas.pydata.org/pandas-docs/stable/dsintro.html#series)
- 값의 리스트를 전달하여 Series를 생성
- 인덱스를 따로 지정하지 않으면 pandas가 기본 정수 인덱스(0, 1, 2, ...)를 자동으로 만든다

In [4]:
# Build a Series from a plain list of values; np.nan marks a missing entry.
# The resulting dtype is float64, as the output below shows.
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [5]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### [DataFrame](https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe)
- numpy 배열을 전달하고, datetime 인덱스와 열 레이블을 함께 지정하여 DataFrame 생성

#### 오늘 날짜 추출

In [6]:
from datetime import datetime

In [14]:
datetime.today()

datetime.datetime(2018, 8, 27, 1, 36, 8, 315292)

In [21]:
datetime.today().strftime("%Y%m%d")

'20180827'

In [22]:
# Today's date formatted as a "YYYYMMDD" string; later cells use it to build date indexes.
# NOTE(review): derived from the wall clock, so re-running on another day changes every date shown in the outputs.
today = datetime.today().strftime("%Y%m%d")

#### 날짜 배열 생성

In [27]:
# Six consecutive dates starting at `today`; the output below shows a daily frequency (freq='D').
dates = pd.date_range(today, periods=6)

In [28]:
dates

DatetimeIndex(['2018-08-27', '2018-08-28', '2018-08-29', '2018-08-30',
               '2018-08-31', '2018-09-01'],
              dtype='datetime64[ns]', freq='D')

#### numpy를 사용하여 2차원 배열 데이터 생성

In [31]:
# 6x4 array of samples drawn from the standard normal distribution.
# NOTE(review): no random seed is set in the cells above, so these values differ on every run.
datas = np.random.randn(6, 4)

In [32]:
datas

array([[-0.06173646, -0.68156076, -0.44873361, -0.05761538],
       [-0.1023543 , -0.0252107 , -1.87088288, -0.08944442],
       [ 0.19512268,  0.66734346, -0.14919626,  0.56600228],
       [ 0.55755981,  0.51919967,  0.44724606,  0.22570138],
       [ 1.29749539, -0.38016946, -0.12978099,  1.3854963 ],
       [-2.12412964, -0.0824245 ,  1.621552  ,  0.58639421]])

#### DataFrame 생성

In [33]:
# Wrap the array in a DataFrame: rows are labelled by `dates`, columns by the letters 'A'..'D'.
df = pd.DataFrame(datas, index=dates, columns=list('ABCD'))

In [34]:
df

Unnamed: 0,A,B,C,D
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-29,0.195123,0.667343,-0.149196,0.566002
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,-0.380169,-0.129781,1.385496
2018-09-01,-2.12413,-0.082424,1.621552,0.586394


#### dict로 DataFrame 생성

In [35]:
# Build a DataFrame from a dict: each key becomes a column name.
# Scalar values ('A', 'B', 'F') are repeated on every row, as the output below shows;
# the Series, array and Categorical values each supply 4 elements, one per row.
# Each column keeps its own dtype (see df2.dtypes further down).
df2 = pd.DataFrame({
    'A' : 1.,
    'B' : pd.Timestamp(today),
    'C' : pd.Series(1, index=list(range(4)), dtype='float32'),
    'D' : np.array([3] * 4, dtype='int32'),
    'E' : pd.Categorical(["test", "train", "test", "train"]),
    'F' : 'foo'
})

In [36]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-08-27,1.0,3,test,foo
1,1.0,2018-08-27,1.0,3,train,foo
2,1.0,2018-08-27,1.0,3,test,foo
3,1.0,2018-08-27,1.0,3,train,foo


DataFrame의 각 열은 서로 다른 데이터 타입(dtype)을 가질 수 있다

In [37]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data (데이터 확인하기)
- 참고 : [Essential Basic Functionality](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics)

### Head and Tail

In [38]:
df.tail(3) # last 3 rows

Unnamed: 0,A,B,C,D
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,-0.380169,-0.129781,1.385496
2018-09-01,-2.12413,-0.082424,1.621552,0.586394


In [39]:
df.tail() # last 5 rows (no argument given; the output below has 5 rows)

Unnamed: 0,A,B,C,D
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-29,0.195123,0.667343,-0.149196,0.566002
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,-0.380169,-0.129781,1.385496
2018-09-01,-2.12413,-0.082424,1.621552,0.586394


In [40]:
df.head()

Unnamed: 0,A,B,C,D
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-29,0.195123,0.667343,-0.149196,0.566002
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,-0.380169,-0.129781,1.385496


### Attributes

In [41]:
df.index

DatetimeIndex(['2018-08-27', '2018-08-28', '2018-08-29', '2018-08-30',
               '2018-08-31', '2018-09-01'],
              dtype='datetime64[ns]', freq='D')

In [45]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [43]:
df.values

array([[-0.06173646, -0.68156076, -0.44873361, -0.05761538],
       [-0.1023543 , -0.0252107 , -1.87088288, -0.08944442],
       [ 0.19512268,  0.66734346, -0.14919626,  0.56600228],
       [ 0.55755981,  0.51919967,  0.44724606,  0.22570138],
       [ 1.29749539, -0.38016946, -0.12978099,  1.3854963 ],
       [-2.12412964, -0.0824245 ,  1.621552  ,  0.58639421]])

### [Summarizing data: describe](https://pandas.pydata.org/pandas-docs/stable/basics.html#summarizing-data-describe)
- 참고 : [pandas.DataFrame.describe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html#pandas.DataFrame.describe)
- 대략적인 통계적 정보

In [44]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.039674,0.002863,-0.088299,0.436089
std,1.144483,0.516014,1.142221,0.548641
min,-2.12413,-0.681561,-1.870883,-0.089444
25%,-0.0922,-0.305733,-0.373849,0.013214
50%,0.066693,-0.053818,-0.139489,0.395852
75%,0.466951,0.383097,0.302989,0.581296
max,1.297495,0.667343,1.621552,1.385496


### Transposition

In [46]:
df.T

Unnamed: 0,2018-08-27 00:00:00,2018-08-28 00:00:00,2018-08-29 00:00:00,2018-08-30 00:00:00,2018-08-31 00:00:00,2018-09-01 00:00:00
A,-0.061736,-0.102354,0.195123,0.55756,1.297495,-2.12413
B,-0.681561,-0.025211,0.667343,0.5192,-0.380169,-0.082424
C,-0.448734,-1.870883,-0.149196,0.447246,-0.129781,1.621552
D,-0.057615,-0.089444,0.566002,0.225701,1.385496,0.586394


### Sort

#### by index

In [47]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2018-08-27,-0.057615,-0.448734,-0.681561,-0.061736
2018-08-28,-0.089444,-1.870883,-0.025211,-0.102354
2018-08-29,0.566002,-0.149196,0.667343,0.195123
2018-08-30,0.225701,0.447246,0.5192,0.55756
2018-08-31,1.385496,-0.129781,-0.380169,1.297495
2018-09-01,0.586394,1.621552,-0.082424,-2.12413


In [48]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2018-09-01,-2.12413,-0.082424,1.621552,0.586394
2018-08-31,1.297495,-0.380169,-0.129781,1.385496
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-29,0.195123,0.667343,-0.149196,0.566002
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615


#### by value

In [52]:
df.sort_values('B')

Unnamed: 0,A,B,C,D
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615
2018-08-31,1.297495,-0.380169,-0.129781,1.385496
2018-09-01,-2.12413,-0.082424,1.621552,0.586394
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-29,0.195123,0.667343,-0.149196,0.566002


In [51]:
df.sort_values(by=['A', 'B'])

Unnamed: 0,A,B,C,D
2018-09-01,-2.12413,-0.082424,1.621552,0.586394
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615
2018-08-29,0.195123,0.667343,-0.149196,0.566002
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,-0.380169,-0.129781,1.385496


## Selection (선택)
- 참고
    - [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing)
    - [MultiIndex / Advanced Indexing](https://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced)

### Getting (데이터 얻기)

In [53]:
df['A']

2018-08-27   -0.061736
2018-08-28   -0.102354
2018-08-29    0.195123
2018-08-30    0.557560
2018-08-31    1.297495
2018-09-01   -2.124130
Freq: D, Name: A, dtype: float64

In [54]:
df[0:3]

Unnamed: 0,A,B,C,D
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-29,0.195123,0.667343,-0.149196,0.566002


In [56]:
# Slice rows by index label; both endpoints are included (see the output below).
# The endpoints are taken from `dates` rather than written as literal date strings:
# `dates` starts at the day the notebook is run, so fixed strings such as
# '2018-08-29' only match on the original run and select nothing afterwards.
df[dates[2] : dates[5]]

Unnamed: 0,A,B,C,D
2018-08-29,0.195123,0.667343,-0.149196,0.566002
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,-0.380169,-0.129781,1.385496
2018-09-01,-2.12413,-0.082424,1.621552,0.586394


### Selection by Label (Label을 통한 선택)
- 참고 : [Selection By Label](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label)

In [57]:
df.loc[dates[0]]

A   -0.061736
B   -0.681561
C   -0.448734
D   -0.057615
Name: 2018-08-27 00:00:00, dtype: float64

In [58]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2018-08-27,-0.061736,-0.681561
2018-08-28,-0.102354,-0.025211
2018-08-29,0.195123,0.667343
2018-08-30,0.55756,0.5192
2018-08-31,1.297495,-0.380169
2018-09-01,-2.12413,-0.082424


In [59]:
# Label-based selection on both axes: a row range (endpoints included) and two columns.
# The row labels come from `dates` instead of literal date strings, because `dates`
# starts at the day the notebook is run and fixed strings stop matching on a later re-run.
df.loc[dates[2] : dates[5], ['A', 'B']]

Unnamed: 0,A,B
2018-08-29,0.195123,0.667343
2018-08-30,0.55756,0.5192
2018-08-31,1.297495,-0.380169
2018-09-01,-2.12413,-0.082424


In [60]:
# Selecting a single row label reduces the result to a Series (see the output below).
# The label is taken from `dates` instead of a literal date string: `dates` starts at
# the day the notebook is run, so a fixed string raises KeyError on a later re-run.
df.loc[dates[2], ['A', 'B']]

A    0.195123
B    0.667343
Name: 2018-08-29 00:00:00, dtype: float64

In [61]:
df.loc[dates[0], 'A']

-0.061736464086950354

In [62]:
df.at[dates[0],'A']

-0.061736464086950354

### Selection by Position (위치로 선택하기)
- 참고 : [Selection By Position](https://pandas.pydata.org/pandas-docs/stable/indexing.html#selection-by-position)

In [63]:
df.iloc[3]

A    0.557560
B    0.519200
C    0.447246
D    0.225701
Name: 2018-08-30 00:00:00, dtype: float64

In [64]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-08-30,0.55756,0.5192
2018-08-31,1.297495,-0.380169


In [65]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2018-08-28,-0.102354,-1.870883
2018-08-29,0.195123,-0.149196
2018-08-31,1.297495,-0.129781


In [66]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444
2018-08-29,0.195123,0.667343,-0.149196,0.566002


In [67]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2018-08-27,-0.681561,-0.448734
2018-08-28,-0.025211,-1.870883
2018-08-29,0.667343,-0.149196
2018-08-30,0.5192,0.447246
2018-08-31,-0.380169,-0.129781
2018-09-01,-0.082424,1.621552


In [68]:
df.iloc[1,1]

-0.025210699185638185

In [69]:
df.iat[1,1]

-0.025210699185638185

### Boolean Indexing

In [70]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2018-08-29,0.195123,0.667343,-0.149196,0.566002
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,-0.380169,-0.129781,1.385496


In [71]:
df[df > 0]

Unnamed: 0,A,B,C,D
2018-08-27,,,,
2018-08-28,,,,
2018-08-29,0.195123,0.667343,,0.566002
2018-08-30,0.55756,0.5192,0.447246,0.225701
2018-08-31,1.297495,,,1.385496
2018-09-01,,,1.621552,0.586394


In [72]:
# Work on a copy so that adding column 'E' below leaves `df` unchanged.
# Note: this rebinds `df2`, which previously held the dict-built frame.
df2 = df.copy()

In [73]:
# New column 'E' with one label per row (6 values for the 6 rows).
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [74]:
df2

Unnamed: 0,A,B,C,D,E
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615,one
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444,one
2018-08-29,0.195123,0.667343,-0.149196,0.566002,two
2018-08-30,0.55756,0.5192,0.447246,0.225701,three
2018-08-31,1.297495,-0.380169,-0.129781,1.385496,four
2018-09-01,-2.12413,-0.082424,1.621552,0.586394,three


In [75]:
# isin() builds a boolean mask that is True where 'E' is one of the listed values;
# indexing with the mask keeps only those rows.
df2[df2['E'].isin(['two', 'three'])]

Unnamed: 0,A,B,C,D,E
2018-08-29,0.195123,0.667343,-0.149196,0.566002,two
2018-08-30,0.55756,0.5192,0.447246,0.225701,three
2018-09-01,-2.12413,-0.082424,1.621552,0.586394,three


In [76]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2018-08-29,0.195123,0.667343,-0.149196,0.566002,two
2018-08-31,1.297495,-0.380169,-0.129781,1.385496,four


### Setting (설정)

In [77]:
# Integer Series indexed by six dates starting at `today`, i.e. the same dates as df's index.
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range(today, periods=6))

In [78]:
s1

2018-08-27    1
2018-08-28    2
2018-08-29    3
2018-08-30    4
2018-08-31    5
2018-09-01    6
Freq: D, dtype: int64

In [79]:
# Add s1 as a new column 'F'; values are matched to rows by the date index.
df['F'] = s1

In [80]:
df

Unnamed: 0,A,B,C,D,F
2018-08-27,-0.061736,-0.681561,-0.448734,-0.057615,1
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444,2
2018-08-29,0.195123,0.667343,-0.149196,0.566002,3
2018-08-30,0.55756,0.5192,0.447246,0.225701,4
2018-08-31,1.297495,-0.380169,-0.129781,1.385496,5
2018-09-01,-2.12413,-0.082424,1.621552,0.586394,6


In [81]:
# Set a single value by label: row dates[0], column 'A'.
df.at[dates[0], 'A'] = 0

In [82]:
df

Unnamed: 0,A,B,C,D,F
2018-08-27,0.0,-0.681561,-0.448734,-0.057615,1
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444,2
2018-08-29,0.195123,0.667343,-0.149196,0.566002,3
2018-08-30,0.55756,0.5192,0.447246,0.225701,4
2018-08-31,1.297495,-0.380169,-0.129781,1.385496,5
2018-09-01,-2.12413,-0.082424,1.621552,0.586394,6


In [83]:
# Set a single value by position: row 0, column 1 (column 'B', as the output below shows).
df.iat[0,1] = 0

In [84]:
df

Unnamed: 0,A,B,C,D,F
2018-08-27,0.0,0.0,-0.448734,-0.057615,1
2018-08-28,-0.102354,-0.025211,-1.870883,-0.089444,2
2018-08-29,0.195123,0.667343,-0.149196,0.566002,3
2018-08-30,0.55756,0.5192,0.447246,0.225701,4
2018-08-31,1.297495,-0.380169,-0.129781,1.385496,5
2018-09-01,-2.12413,-0.082424,1.621552,0.586394,6


In [85]:
# Overwrite every row of column 'D' with 5 by assigning a NumPy array of matching length.
# NOTE(review): the output below shows integer 5s; newer pandas releases may write into the
# existing float column and display 5.0 instead — confirm against the pandas version in use.
df.loc[:, 'D'] = np.array([5] * len(df))

In [86]:
df

Unnamed: 0,A,B,C,D,F
2018-08-27,0.0,0.0,-0.448734,5,1
2018-08-28,-0.102354,-0.025211,-1.870883,5,2
2018-08-29,0.195123,0.667343,-0.149196,5,3
2018-08-30,0.55756,0.5192,0.447246,5,4
2018-08-31,1.297495,-0.380169,-0.129781,5,5
2018-09-01,-2.12413,-0.082424,1.621552,5,6


In [92]:
# Fresh copy of the updated `df`, so the masked assignment below does not touch `df`.
# Note: this rebinds `df2` again (it previously held the copy with column 'E').
df2 = df.copy()

In [93]:
df2[df2 > 0]

Unnamed: 0,A,B,C,D,F
2018-08-27,,,,5,1
2018-08-28,,,,5,2
2018-08-29,0.195123,0.667343,,5,3
2018-08-30,0.55756,0.5192,0.447246,5,4
2018-08-31,1.297495,,,5,5
2018-09-01,,,1.621552,5,6


In [94]:
df2[df2 < 0]

Unnamed: 0,A,B,C,D,F
2018-08-27,,,-0.448734,,
2018-08-28,-0.102354,-0.025211,-1.870883,,
2018-08-29,,,-0.149196,,
2018-08-30,,,,,
2018-08-31,,-0.380169,-0.129781,,
2018-09-01,-2.12413,-0.082424,,,


In [95]:
# Where a value is positive, replace it with its negation; other values are left as they are,
# so the result contains no positive values (see the output below).
df2[df2 > 0] = -df2

In [96]:
df2

Unnamed: 0,A,B,C,D,F
2018-08-27,0.0,0.0,-0.448734,-5,-1
2018-08-28,-0.102354,-0.025211,-1.870883,-5,-2
2018-08-29,-0.195123,-0.667343,-0.149196,-5,-3
2018-08-30,-0.55756,-0.5192,-0.447246,-5,-4
2018-08-31,-1.297495,-0.380169,-0.129781,-5,-5
2018-09-01,-2.12413,-0.082424,-1.621552,-5,-6
