## Pandas Data Handling

#### DataFrame을 만들때 index, column을 설정하지 않으면 기본값으로 0부터 시작하는 정수형 숫자로 입력된다.

In [13]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(6,4))
df

Unnamed: 0,0,1,2,3
0,0.161643,-1.098121,1.422378,0.180083
1,-0.940113,0.391486,1.268527,1.148408
2,0.71286,-0.113038,-0.285802,1.167413
3,0.25971,1.017865,-0.311707,-2.011367
4,-0.922365,2.435044,0.094551,-0.365757
5,0.354974,-0.721282,0.020587,-0.36902


#### pandas에서 제공하는 date range함수는 datetime 자료형으로 구성된, 날짜 시각등을 알 수 있는 자료형을 만드는 함수임

In [14]:
df.columns = ['A', 'B', 'C', 'D']
df.index = pd.date_range('20190615',periods=6)
df.index

DatetimeIndex(['2019-06-15', '2019-06-16', '2019-06-17', '2019-06-18',
               '2019-06-19', '2019-06-20'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df

Unnamed: 0,A,B,C,D
2019-06-01,0.375607,-0.639742,0.165153,-1.019069
2019-06-02,-0.801693,-0.887069,0.587686,-1.149016
2019-06-03,1.126695,-1.592828,1.146674,0.055892
2019-06-04,0.675407,0.545745,-1.246102,-0.217656
2019-06-05,-1.631995,1.042206,1.34261,1.163074
2019-06-06,0.898472,0.725617,1.657554,-0.106878


#### np.nan은 NaN값을 의미한다.

In [15]:
df['F'] = [1.0, np.NaN, 3.5, 6.1, np.NaN, 7.0]
df

Unnamed: 0,A,B,C,D,F
2019-06-15,0.161643,-1.098121,1.422378,0.180083,1.0
2019-06-16,-0.940113,0.391486,1.268527,1.148408,
2019-06-17,0.71286,-0.113038,-0.285802,1.167413,3.5
2019-06-18,0.25971,1.017865,-0.311707,-2.011367,6.1
2019-06-19,-0.922365,2.435044,0.094551,-0.365757,
2019-06-20,0.354974,-0.721282,0.020587,-0.36902,7.0


### NaN 제거하기 
* 주의 : drop함수는 반환을 받지 않으면 기존의 DataFrame은 그대로임 
* 아니면, inplace=True라는 인자를 추가하여 반환을 받지 않고서도 기존의 DataFrame이 변경되도록 함

#### 행의 값 중 하나라도 nan인 경우 그 행을 제거함 

In [7]:
df

Unnamed: 0,A,B,C,D,F
2019-06-15,0.341683,1.286766,1.79532,0.223818,1.0
2019-06-16,0.012604,-0.183015,0.898572,-1.353051,
2019-06-17,-0.326078,0.597458,-1.846627,-2.00233,3.5
2019-06-18,-0.09761,0.146587,-1.508415,-1.04219,6.1
2019-06-19,-0.062145,0.359238,0.469134,-0.836363,
2019-06-20,0.886447,1.153205,0.699066,0.106765,7.0


In [10]:
df.dropna(how='any')

Unnamed: 0,A,B,C,D,F
2019-06-15,0.341683,1.286766,1.79532,0.223818,1.0
2019-06-17,-0.326078,0.597458,-1.846627,-2.00233,3.5
2019-06-18,-0.09761,0.146587,-1.508415,-1.04219,6.1
2019-06-20,0.886447,1.153205,0.699066,0.106765,7.0


#### 행의 모든 값이 nan인 경우 그 행을 제거함 

In [11]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2019-06-15,0.341683,1.286766,1.79532,0.223818,1.0
2019-06-16,0.012604,-0.183015,0.898572,-1.353051,
2019-06-17,-0.326078,0.597458,-1.846627,-2.00233,3.5
2019-06-18,-0.09761,0.146587,-1.508415,-1.04219,6.1
2019-06-19,-0.062145,0.359238,0.469134,-0.836363,
2019-06-20,0.886447,1.153205,0.699066,0.106765,7.0


#### nan에 값 넣기

In [10]:
df.fillna(value=0.5)

Unnamed: 0,A,B,C,D,F
2019-06-01,0.375607,-0.639742,0.165153,-1.019069,1.0
2019-06-02,-0.801693,-0.887069,0.587686,-1.149016,0.5
2019-06-03,1.126695,-1.592828,1.146674,0.055892,3.5
2019-06-04,0.675407,0.545745,-1.246102,-0.217656,6.1
2019-06-05,-1.631995,1.042206,1.34261,1.163074,0.5
2019-06-06,0.898472,0.725617,1.657554,-0.106878,7.0


#### nan값인지 확인하기

In [11]:
.........

Unnamed: 0,A,B,C,D,F
2019-06-01,False,False,False,False,False
2019-06-02,False,False,False,False,True
2019-06-03,False,False,False,False,False
2019-06-04,False,False,False,False,False
2019-06-05,False,False,False,False,True
2019-06-06,False,False,False,False,False


In [12]:
df

Unnamed: 0,A,B,C,D,F
2019-06-01,0.375607,-0.639742,0.165153,-1.019069,1.0
2019-06-02,-0.801693,-0.887069,0.587686,-1.149016,
2019-06-03,1.126695,-1.592828,1.146674,0.055892,3.5
2019-06-04,0.675407,0.545745,-1.246102,-0.217656,6.1
2019-06-05,-1.631995,1.042206,1.34261,1.163074,
2019-06-06,0.898472,0.725617,1.657554,-0.106878,7.0


#### F열에서 nan값을 포함하는 행만 추출하기

In [13]:
.......

Unnamed: 0,A,B,C,D,F
2019-06-02,-0.801693,-0.887069,0.587686,-1.149016,
2019-06-05,-1.631995,1.042206,1.34261,1.163074,


#### 특정 행 drop하기

In [14]:
print(pd.to_datetime('20190602'))
df.____(pd._____('20190602'))

2019-06-02 00:00:00


Unnamed: 0,A,B,C,D,F
2019-06-01,0.375607,-0.639742,0.165153,-1.019069,1.0
2019-06-03,1.126695,-1.592828,1.146674,0.055892,3.5
2019-06-04,0.675407,0.545745,-1.246102,-0.217656,6.1
2019-06-05,-1.631995,1.042206,1.34261,1.163074,
2019-06-06,0.898472,0.725617,1.657554,-0.106878,7.0


In [16]:
# 2개 이상도 가능
df.drop([pd.to_datetime('20190602'),pd.to_datetime('20190604')])


Unnamed: 0,A,B,C,D,F
2019-06-01,0.375607,-0.639742,0.165153,-1.019069,1.0
2019-06-03,1.126695,-1.592828,1.146674,0.055892,3.5
2019-06-05,-1.631995,1.042206,1.34261,1.163074,
2019-06-06,0.898472,0.725617,1.657554,-0.106878,7.0


#### 특정 열 삭제하기

In [19]:
......

Unnamed: 0,A,B,C,D
2019-06-01,0.375607,-0.639742,0.165153,-1.019069
2019-06-02,-0.801693,-0.887069,0.587686,-1.149016
2019-06-03,1.126695,-1.592828,1.146674,0.055892
2019-06-04,0.675407,0.545745,-1.246102,-0.217656
2019-06-05,-1.631995,1.042206,1.34261,1.163074
2019-06-06,0.898472,0.725617,1.657554,-0.106878


In [22]:
# 2개 이상의 열도 가능
......

Unnamed: 0,A,C,F
2019-06-01,0.375607,0.165153,1.0
2019-06-02,-0.801693,0.587686,
2019-06-03,1.126695,1.146674,3.5
2019-06-04,0.675407,-1.246102,6.1
2019-06-05,-1.631995,1.34261,
2019-06-06,0.898472,1.657554,7.0
