# [10 Minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html)

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

## Object Creation (객체 생성)
- 참고 : [Intro to Data Structures](https://pandas.pydata.org/pandas-docs/stable/dsintro.html)

### [Series](https://pandas.pydata.org/pandas-docs/stable/dsintro.html#series)
- pandas는 값을 가지고 있는 리스트를 통해 Series를 생성
- 인덱스를 지정하지 않으면 정수 인덱스가 기본값으로 만들어진다

In [4]:
# Build a Series from a plain list; the missing entry (np.nan) is kept as NaN.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [5]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### [Dataframe](https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe)
- datetime 인덱스와 레이블이 있는 열을 가지고 있는 numpy 배열을 전달하여 데이터프레임 생성

#### 오늘 날짜 추출

In [6]:
from datetime import datetime

In [7]:
datetime.today()

datetime.datetime(2018, 8, 30, 13, 44, 8, 335427)

In [8]:
datetime.today().strftime("%Y%m%d")

'20180830'

In [9]:
# Current date rendered as a compact YYYYMMDD string (e.g. '20180830').
today = format(datetime.today(), "%Y%m%d")

#### 날짜 배열 생성

In [10]:
# Six consecutive timestamps beginning at `today`.
dates = pd.date_range(start=today, periods=6)

In [11]:
dates

DatetimeIndex(['2018-08-30', '2018-08-31', '2018-09-01', '2018-09-02',
               '2018-09-03', '2018-09-04'],
              dtype='datetime64[ns]', freq='D')

#### numpy를 사용하여 2차원 배열 데이터 생성

In [12]:
# 6 x 4 matrix of samples drawn from the standard normal distribution.
datas = np.random.standard_normal(size=(6, 4))

In [13]:
datas

array([[-0.53343198, -0.28683034, -0.12035581, -1.74092523],
       [ 0.51643242, -1.04778709, -0.2803233 ,  0.49637965],
       [-0.41862667, -0.61260706,  0.24391742, -1.45707158],
       [-0.69881565, -0.52086483,  0.52744398,  0.09743004],
       [ 1.17096886, -0.27160463,  0.2887567 ,  1.64836442],
       [ 0.13094684, -0.59872496,  0.48710124, -1.58733382]])

#### Dataframe 생성

In [14]:
# Wrap the random matrix in a DataFrame: rows are labelled by `dates`,
# columns by the letters A-D.
df = pd.DataFrame(data=datas, index=dates, columns=['A', 'B', 'C', 'D'])

In [15]:
df

Unnamed: 0,A,B,C,D
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072
2018-09-02,-0.698816,-0.520865,0.527444,0.09743
2018-09-03,1.170969,-0.271605,0.288757,1.648364
2018-09-04,0.130947,-0.598725,0.487101,-1.587334


#### dict로 Dataframe 생성

In [16]:
# Each dict key becomes a column. Scalar values are repeated for all four rows
# (the length set by the array-like columns), and every column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp(today),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

In [17]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-08-30,1.0,3,test,foo
1,1.0,2018-08-30,1.0,3,train,foo
2,1.0,2018-08-30,1.0,3,test,foo
3,1.0,2018-08-30,1.0,3,train,foo


Dataframe의 열은 다양한 데이터 타입(dtype)으로 구성

In [18]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data (데이터 확인하기)
- 참고 : [Essential Basic Functionality](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics)

### Head and Tail

In [19]:
df.tail(3) # last 3 rows

Unnamed: 0,A,B,C,D
2018-09-02,-0.698816,-0.520865,0.527444,0.09743
2018-09-03,1.170969,-0.271605,0.288757,1.648364
2018-09-04,0.130947,-0.598725,0.487101,-1.587334


In [20]:
df.tail() # last 5 rows (the count used when no argument is given)

Unnamed: 0,A,B,C,D
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072
2018-09-02,-0.698816,-0.520865,0.527444,0.09743
2018-09-03,1.170969,-0.271605,0.288757,1.648364
2018-09-04,0.130947,-0.598725,0.487101,-1.587334


In [21]:
df.head()

Unnamed: 0,A,B,C,D
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072
2018-09-02,-0.698816,-0.520865,0.527444,0.09743
2018-09-03,1.170969,-0.271605,0.288757,1.648364


### Attributes

In [22]:
df.index

DatetimeIndex(['2018-08-30', '2018-08-31', '2018-09-01', '2018-09-02',
               '2018-09-03', '2018-09-04'],
              dtype='datetime64[ns]', freq='D')

In [23]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [24]:
df.values

array([[-0.53343198, -0.28683034, -0.12035581, -1.74092523],
       [ 0.51643242, -1.04778709, -0.2803233 ,  0.49637965],
       [-0.41862667, -0.61260706,  0.24391742, -1.45707158],
       [-0.69881565, -0.52086483,  0.52744398,  0.09743004],
       [ 1.17096886, -0.27160463,  0.2887567 ,  1.64836442],
       [ 0.13094684, -0.59872496,  0.48710124, -1.58733382]])

### [Summarizing data: describe](https://pandas.pydata.org/pandas-docs/stable/basics.html#summarizing-data-describe)
- 참고 : [pandas.DataFrame.describe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html#pandas.DataFrame.describe)
- 대략적인 통계적 정보

In [25]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.027912,-0.556403,0.19109,-0.423859
std,0.7209,0.283355,0.326287,1.38337
min,-0.698816,-1.047787,-0.280323,-1.740925
25%,-0.504731,-0.609137,-0.029288,-1.554768
50%,-0.14384,-0.559795,0.266337,-0.679821
75%,0.420061,-0.345339,0.437515,0.396642
max,1.170969,-0.271605,0.527444,1.648364


### Transposition

In [26]:
df.T

Unnamed: 0,2018-08-30 00:00:00,2018-08-31 00:00:00,2018-09-01 00:00:00,2018-09-02 00:00:00,2018-09-03 00:00:00,2018-09-04 00:00:00
A,-0.533432,0.516432,-0.418627,-0.698816,1.170969,0.130947
B,-0.28683,-1.047787,-0.612607,-0.520865,-0.271605,-0.598725
C,-0.120356,-0.280323,0.243917,0.527444,0.288757,0.487101
D,-1.740925,0.49638,-1.457072,0.09743,1.648364,-1.587334


### Sort

#### by index

In [27]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2018-08-30,-1.740925,-0.120356,-0.28683,-0.533432
2018-08-31,0.49638,-0.280323,-1.047787,0.516432
2018-09-01,-1.457072,0.243917,-0.612607,-0.418627
2018-09-02,0.09743,0.527444,-0.520865,-0.698816
2018-09-03,1.648364,0.288757,-0.271605,1.170969
2018-09-04,-1.587334,0.487101,-0.598725,0.130947


In [28]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2018-09-04,0.130947,-0.598725,0.487101,-1.587334
2018-09-03,1.170969,-0.271605,0.288757,1.648364
2018-09-02,-0.698816,-0.520865,0.527444,0.09743
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925


#### by value

In [29]:
# Order the rows by the values in column B (ascending).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072
2018-09-04,0.130947,-0.598725,0.487101,-1.587334
2018-09-02,-0.698816,-0.520865,0.527444,0.09743
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925
2018-09-03,1.170969,-0.271605,0.288757,1.648364


In [30]:
df.sort_values(by=['A', 'B'])

Unnamed: 0,A,B,C,D
2018-09-02,-0.698816,-0.520865,0.527444,0.09743
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072
2018-09-04,0.130947,-0.598725,0.487101,-1.587334
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-03,1.170969,-0.271605,0.288757,1.648364


## Selection (선택)
- 참고
    - [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing)
    - [MultiIndex / Advanced Indexing](https://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced)

### Getting (데이터 얻기)

In [31]:
df['A']

2018-08-30   -0.533432
2018-08-31    0.516432
2018-09-01   -0.418627
2018-09-02   -0.698816
2018-09-03    1.170969
2018-09-04    0.130947
Freq: D, Name: A, dtype: float64

In [32]:
df[0:3]

Unnamed: 0,A,B,C,D
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072


In [33]:
# Slice rows with date strings (both endpoints are included).
# The bounds are derived from `dates` rather than hard-coded literals: `dates`
# starts at the day the notebook is run, so fixed 2018 strings would select
# nothing on any later re-run.
df[dates[0].strftime('%Y-%m-%d') : dates[2].strftime('%Y-%m-%d')]

Unnamed: 0,A,B,C,D
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072


### Selection by Label (Label을 통한 선택)
- 참고 : [Selection By Label](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label)

In [34]:
df.loc[dates[0]]

A   -0.533432
B   -0.286830
C   -0.120356
D   -1.740925
Name: 2018-08-30 00:00:00, dtype: float64

In [35]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2018-08-30,-0.533432,-0.28683
2018-08-31,0.516432,-1.047787
2018-09-01,-0.418627,-0.612607
2018-09-02,-0.698816,-0.520865
2018-09-03,1.170969,-0.271605
2018-09-04,0.130947,-0.598725


In [36]:
df.loc[dates[1] : dates[4], ['A', 'B']]

Unnamed: 0,A,B
2018-08-31,0.516432,-1.047787
2018-09-01,-0.418627,-0.612607
2018-09-02,-0.698816,-0.520865
2018-09-03,1.170969,-0.271605


In [37]:
df.loc[dates[1] , ['A', 'B']]

A    0.516432
B   -1.047787
Name: 2018-08-31 00:00:00, dtype: float64

In [38]:
df.loc[dates[0], 'A']

-0.5334319753425261

In [39]:
df.at[dates[0],'A']

-0.5334319753425261

### Selection by Position (위치로 선택하기)
- 참고 : [Selection By Position](https://pandas.pydata.org/pandas-docs/stable/indexing.html#selection-by-position)

In [40]:
df.iloc[3]

A   -0.698816
B   -0.520865
C    0.527444
D    0.097430
Name: 2018-09-02 00:00:00, dtype: float64

In [41]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-09-02,-0.698816,-0.520865
2018-09-03,1.170969,-0.271605


In [42]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2018-08-31,0.516432,-0.280323
2018-09-01,-0.418627,0.243917
2018-09-03,1.170969,0.288757


In [43]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072


In [44]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2018-08-30,-0.28683,-0.120356
2018-08-31,-1.047787,-0.280323
2018-09-01,-0.612607,0.243917
2018-09-02,-0.520865,0.527444
2018-09-03,-0.271605,0.288757
2018-09-04,-0.598725,0.487101


In [45]:
df.iloc[1,1]

-1.047787092247408

In [46]:
df.iat[1,1]

-1.047787092247408

### Boolean Indexing

In [47]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2018-08-31,0.516432,-1.047787,-0.280323,0.49638
2018-09-03,1.170969,-0.271605,0.288757,1.648364
2018-09-04,0.130947,-0.598725,0.487101,-1.587334


In [48]:
df[df > 0]

Unnamed: 0,A,B,C,D
2018-08-30,,,,
2018-08-31,0.516432,,,0.49638
2018-09-01,,,0.243917,
2018-09-02,,,0.527444,0.09743
2018-09-03,1.170969,,0.288757,1.648364
2018-09-04,0.130947,,0.487101,


In [49]:
df2 = df.copy()

In [50]:
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [51]:
df2

Unnamed: 0,A,B,C,D,E
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925,one
2018-08-31,0.516432,-1.047787,-0.280323,0.49638,one
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072,two
2018-09-02,-0.698816,-0.520865,0.527444,0.09743,three
2018-09-03,1.170969,-0.271605,0.288757,1.648364,four
2018-09-04,0.130947,-0.598725,0.487101,-1.587334,three


In [52]:
df2[df2['E'].isin(['two', 'three'])]

Unnamed: 0,A,B,C,D,E
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072,two
2018-09-02,-0.698816,-0.520865,0.527444,0.09743,three
2018-09-04,0.130947,-0.598725,0.487101,-1.587334,three


In [53]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072,two
2018-09-03,1.170969,-0.271605,0.288757,1.648364,four


### Setting (설정)

In [54]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range(today, periods=6))

In [55]:
s1

2018-08-30    1
2018-08-31    2
2018-09-01    3
2018-09-02    4
2018-09-03    5
2018-09-04    6
Freq: D, dtype: int64

In [56]:
df['F'] = s1

In [57]:
df

Unnamed: 0,A,B,C,D,F
2018-08-30,-0.533432,-0.28683,-0.120356,-1.740925,1
2018-08-31,0.516432,-1.047787,-0.280323,0.49638,2
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072,3
2018-09-02,-0.698816,-0.520865,0.527444,0.09743,4
2018-09-03,1.170969,-0.271605,0.288757,1.648364,5
2018-09-04,0.130947,-0.598725,0.487101,-1.587334,6


In [58]:
df.at[dates[0], 'A'] = 0

In [59]:
df

Unnamed: 0,A,B,C,D,F
2018-08-30,0.0,-0.28683,-0.120356,-1.740925,1
2018-08-31,0.516432,-1.047787,-0.280323,0.49638,2
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072,3
2018-09-02,-0.698816,-0.520865,0.527444,0.09743,4
2018-09-03,1.170969,-0.271605,0.288757,1.648364,5
2018-09-04,0.130947,-0.598725,0.487101,-1.587334,6


In [60]:
df.iat[0,1] = 0

In [61]:
df

Unnamed: 0,A,B,C,D,F
2018-08-30,0.0,0.0,-0.120356,-1.740925,1
2018-08-31,0.516432,-1.047787,-0.280323,0.49638,2
2018-09-01,-0.418627,-0.612607,0.243917,-1.457072,3
2018-09-02,-0.698816,-0.520865,0.527444,0.09743,4
2018-09-03,1.170969,-0.271605,0.288757,1.648364,5
2018-09-04,0.130947,-0.598725,0.487101,-1.587334,6


In [62]:
# Overwrite every row of column D with the constant 5, supplied as a NumPy array.
# NOTE(review): newer pandas releases may write `.loc[:, col] = ...` in place and
# keep the column's float64 dtype (displaying 5.0 instead of 5) — confirm against
# the installed pandas version.
df.loc[:, 'D'] = np.array([5] * len(df))

In [63]:
df

Unnamed: 0,A,B,C,D,F
2018-08-30,0.0,0.0,-0.120356,5,1
2018-08-31,0.516432,-1.047787,-0.280323,5,2
2018-09-01,-0.418627,-0.612607,0.243917,5,3
2018-09-02,-0.698816,-0.520865,0.527444,5,4
2018-09-03,1.170969,-0.271605,0.288757,5,5
2018-09-04,0.130947,-0.598725,0.487101,5,6


In [64]:
df2 = df.copy()

In [65]:
df2[df2 > 0]

Unnamed: 0,A,B,C,D,F
2018-08-30,,,,5,1
2018-08-31,0.516432,,,5,2
2018-09-01,,,0.243917,5,3
2018-09-02,,,0.527444,5,4
2018-09-03,1.170969,,0.288757,5,5
2018-09-04,0.130947,,0.487101,5,6


In [66]:
df2[df2 < 0]

Unnamed: 0,A,B,C,D,F
2018-08-30,,,-0.120356,,
2018-08-31,,-1.047787,-0.280323,,
2018-09-01,-0.418627,-0.612607,,,
2018-09-02,-0.698816,-0.520865,,,
2018-09-03,,-0.271605,,,
2018-09-04,,-0.598725,,,


In [67]:
# Wherever a value is positive, replace it with its negation; entries that are
# already zero or negative are left untouched, so no positive values remain.
df2[df2 > 0] = -df2

In [68]:
df2

Unnamed: 0,A,B,C,D,F
2018-08-30,0.0,0.0,-0.120356,-5,-1
2018-08-31,-0.516432,-1.047787,-0.280323,-5,-2
2018-09-01,-0.418627,-0.612607,-0.243917,-5,-3
2018-09-02,-0.698816,-0.520865,-0.527444,-5,-4
2018-09-03,-1.170969,-0.271605,-0.288757,-5,-5
2018-09-04,-0.130947,-0.598725,-0.487101,-5,-6


### Missing Data (결측치)
- 참고 : [Missing data section](https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data)

- Pandas는 결측치를 표현하기 위해 주로 np.nan 값을 사용
    - 결측치는 기본적으로 계산에 포함되지 않는다.

#### Reindexing
- 지정된 축 상의 인덱스를 변경/추가/삭제
- 데이터의 복사본을 반환

In [69]:
# Keep only the first four dates and append a new, initially empty column E.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])

In [70]:
df1

Unnamed: 0,A,B,C,D,F,E
2018-08-30,0.0,0.0,-0.120356,5,1,
2018-08-31,0.516432,-1.047787,-0.280323,5,2,
2018-09-01,-0.418627,-0.612607,0.243917,5,3,
2018-09-02,-0.698816,-0.520865,0.527444,5,4,


In [71]:
# Label slices in .loc include both endpoints, so E is set for the first two rows.
df1.loc[dates[0]:dates[1], 'E'] = 1

In [72]:
df1

Unnamed: 0,A,B,C,D,F,E
2018-08-30,0.0,0.0,-0.120356,5,1,1.0
2018-08-31,0.516432,-1.047787,-0.280323,5,2,1.0
2018-09-01,-0.418627,-0.612607,0.243917,5,3,
2018-09-02,-0.698816,-0.520865,0.527444,5,4,


In [73]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2018-08-30,0.0,0.0,-0.120356,5,1,1.0
2018-08-31,0.516432,-1.047787,-0.280323,5,2,1.0


In [74]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2018-08-30,0.0,0.0,-0.120356,5,1,1.0
2018-08-31,0.516432,-1.047787,-0.280323,5,2,1.0
2018-09-01,-0.418627,-0.612607,0.243917,5,3,5.0
2018-09-02,-0.698816,-0.520865,0.527444,5,4,5.0


- nan인 값에 boolean을 통한 표식을 얻음
    - nan인 값에만 True가 표시되게 하는 함수
    - 데이터프레임의 모든 값이 boolean 형태로 표시

In [75]:
# Boolean mask with the same shape as df1: True exactly where a value is missing.
df1.isna()

Unnamed: 0,A,B,C,D,F,E
2018-08-30,False,False,False,False,False,False
2018-08-31,False,False,False,False,False,False
2018-09-01,False,False,False,False,False,True
2018-09-02,False,False,False,False,False,True


### Operation (연산)
- 참고 : [이진 (Binary) 연산의 기본 섹션](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-binop)

#### Stats (통계)
- 일반적으로 결측치를 제외한 후 연산
- 기술 통계를 수행

In [76]:
df.mean()

A    0.116818
B   -0.508598
C    0.191090
D    5.000000
F    3.500000
dtype: float64

In [77]:
# Mean across the columns, i.e. one value per row.
df.mean(axis=1)

2018-08-30    1.175929
2018-08-31    1.237664
2018-09-01    1.442537
2018-09-02    1.661553
2018-09-03    2.237624
2018-09-04    2.203865
Freq: D, dtype: float64

- 차원이 다른 객체로 연산
    - pandas는 지정된 차원을 따라 자동으로 브로드캐스팅

> broadcast란 numpy에서 유래한 용어로, n차원이나 스칼라 값으로 연산을 수행할 때 도출되는 결과의 규칙을 설명하는 것

In [78]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)

In [79]:
s

2018-08-30    1.0
2018-08-31    3.0
2018-09-01    5.0
2018-09-02    NaN
2018-09-03    6.0
2018-09-04    8.0
Freq: D, dtype: float64

In [80]:
# shift(2) moves every value two positions later along the index; the first two
# slots become NaN and the last two original values drop off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

In [81]:
s

2018-08-30    NaN
2018-08-31    NaN
2018-09-01    1.0
2018-09-02    3.0
2018-09-03    5.0
2018-09-04    NaN
Freq: D, dtype: float64

In [82]:
df

Unnamed: 0,A,B,C,D,F
2018-08-30,0.0,0.0,-0.120356,5,1
2018-08-31,0.516432,-1.047787,-0.280323,5,2
2018-09-01,-0.418627,-0.612607,0.243917,5,3
2018-09-02,-0.698816,-0.520865,0.527444,5,4
2018-09-03,1.170969,-0.271605,0.288757,5,5
2018-09-04,0.130947,-0.598725,0.487101,5,6


In [83]:
# Subtract s from every column, matching on the row index; rows where s is NaN
# produce NaN in all columns.
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2018-08-30,,,,,
2018-08-31,,,,,
2018-09-01,-1.418627,-1.612607,-0.756083,4.0,2.0
2018-09-02,-3.698816,-3.520865,-2.472556,2.0,1.0
2018-09-03,-3.829031,-5.271605,-4.711243,0.0,0.0
2018-09-04,,,,,


#### Apply (적용)

In [84]:
df

Unnamed: 0,A,B,C,D,F
2018-08-30,0.0,0.0,-0.120356,5,1
2018-08-31,0.516432,-1.047787,-0.280323,5,2
2018-09-01,-0.418627,-0.612607,0.243917,5,3
2018-09-02,-0.698816,-0.520865,0.527444,5,4
2018-09-03,1.170969,-0.271605,0.288757,5,5
2018-09-04,0.130947,-0.598725,0.487101,5,6


In [85]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2018-08-30,0.0,0.0,-0.120356,5,1
2018-08-31,0.516432,-1.047787,-0.400679,10,3
2018-09-01,0.097806,-1.660394,-0.156762,15,6
2018-09-02,-0.60101,-2.181259,0.370682,20,10
2018-09-03,0.569959,-2.452864,0.659439,25,15
2018-09-04,0.700906,-3.051589,1.14654,30,21


In [86]:
# Spread (max - min) of each column.
df.apply(lambda col: col.max() - col.min())

A    1.869785
B    1.047787
C    0.807767
D    0.000000
F    5.000000
dtype: float64

#### Histogramming (히스토그래밍)
- 참고 : [Histogramming and Discretization (히스토그래밍과 이산화)](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-discretization)

In [87]:
s = pd.Series(np.random.randint(0, 7, size=10))

In [88]:
s

0    4
1    5
2    1
3    3
4    3
5    4
6    0
7    2
8    5
9    6
dtype: int32

In [89]:
s.value_counts()

5    2
4    2
3    2
6    1
2    1
1    1
0    1
dtype: int64

#### String Methods (문자열 메소드)
- 참고 : [벡터화된 문자열 메소드](https://pandas.pydata.org/pandas-docs/stable/text.html#text-string-methods)

- Series는 문자열 처리 메소드 모음(set)을 가지고 있다.
- 배열의 각 요소를 쉽게 조작할 수 있게 해 주는 이 메소드들은 `str` 속성에 포함되어 있다
- 문자열의 패턴 일치 확인은 기본적으로 정규 표현식을 사용
- 몇몇의 경우에는 항상 정규 표현식 사용

In [90]:
# Mixed-case strings plus one missing value, used for the .str examples.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)

In [91]:
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [92]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

### Merge (병합)
- 참고 : [Merging](https://pandas.pydata.org/pandas-docs/stable/merging.html#merging)

- [`concat()`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html#pandas.concat)으로 객체를 연결

In [93]:
df = pd.DataFrame(np.random.randn(10, 4))

In [94]:
df

Unnamed: 0,0,1,2,3
0,1.439092,-0.743855,-0.576979,0.627477
1,0.503925,0.782436,0.610047,-0.509657
2,-0.376753,-1.519034,0.608818,-1.504769
3,-0.816946,-0.413386,-0.080414,-0.557007
4,-0.785825,-0.6206,-0.228781,-0.304346
5,-0.795744,-1.378859,-0.100353,-0.102948
6,-0.529604,0.715059,0.287876,-0.235978
7,0.871331,0.250678,-2.043929,-1.269256
8,0.453958,1.254075,-2.073782,-1.549154
9,-0.009597,-2.060836,1.092711,-0.683582


In [95]:
# Split the frame into three consecutive row chunks (3, 4 and 3 rows).
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]

In [97]:
pieces

[          0         1         2         3
 0  1.439092 -0.743855 -0.576979  0.627477
 1  0.503925  0.782436  0.610047 -0.509657
 2 -0.376753 -1.519034  0.608818 -1.504769,
           0         1         2         3
 3 -0.816946 -0.413386 -0.080414 -0.557007
 4 -0.785825 -0.620600 -0.228781 -0.304346
 5 -0.795744 -1.378859 -0.100353 -0.102948
 6 -0.529604  0.715059  0.287876 -0.235978,
           0         1         2         3
 7  0.871331  0.250678 -2.043929 -1.269256
 8  0.453958  1.254075 -2.073782 -1.549154
 9 -0.009597 -2.060836  1.092711 -0.683582]

In [96]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.439092,-0.743855,-0.576979,0.627477
1,0.503925,0.782436,0.610047,-0.509657
2,-0.376753,-1.519034,0.608818,-1.504769
3,-0.816946,-0.413386,-0.080414,-0.557007
4,-0.785825,-0.6206,-0.228781,-0.304346
5,-0.795744,-1.378859,-0.100353,-0.102948
6,-0.529604,0.715059,0.287876,-0.235978
7,0.871331,0.250678,-2.043929,-1.269256
8,0.453958,1.254075,-2.073782,-1.549154
9,-0.009597,-2.060836,1.092711,-0.683582


#### Join (결합)
- [데이터베이스 스타일 결합](https://pandas.pydata.org/pandas-docs/stable/merging.html#merging-join)

In [104]:
# Two small frames in which the join key 'foo' is duplicated on both sides.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})

In [105]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [106]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [107]:
# Join on 'key'; with duplicated keys on both sides every left row pairs with
# every right row.
left.merge(right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [108]:
# Left-hand frame with unique keys.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))

In [109]:
# Right-hand frame with unique keys.
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))

In [110]:
# Join on 'key'; each key is unique here, so the result has one row per key.
left.merge(right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


#### Append (추가)
- [Appending](https://pandas.pydata.org/pandas-docs/stable/merging.html#merging-concatenation)

In [111]:
df = pd.DataFrame(np.random.randn(8, 4))

In [112]:
df

Unnamed: 0,0,1,2,3
0,-1.15591,0.26654,-1.369549,1.373718
1,-0.161371,-1.364177,0.60329,0.126016
2,-0.032484,-1.101715,0.424708,-0.488681
3,0.343003,0.581522,0.107044,-0.074213
4,-0.354831,0.316847,0.250564,0.064423
5,0.182911,0.987427,-0.763859,-0.123002
6,0.056706,0.664088,-0.455844,0.608939
7,2.74442,0.253498,-0.342662,-0.005601


In [113]:
s = df.iloc[3]

In [114]:
s

0    0.343003
1    0.581522
2    0.107044
3   -0.074213
Name: 3, dtype: float64

In [115]:
# Append the Series `s` as a new last row.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row is
# turned into a one-row frame (to_frame().T) and concatenated instead;
# ignore_index=True renumbers the result 0..n as before.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,0,1,2,3
0,-1.15591,0.26654,-1.369549,1.373718
1,-0.161371,-1.364177,0.60329,0.126016
2,-0.032484,-1.101715,0.424708,-0.488681
3,0.343003,0.581522,0.107044,-0.074213
4,-0.354831,0.316847,0.250564,0.064423
5,0.182911,0.987427,-0.763859,-0.123002
6,0.056706,0.664088,-0.455844,0.608939
7,2.74442,0.253498,-0.342662,-0.005601
8,0.343003,0.581522,0.107044,-0.074213
