# pandas 기초

In [1]:
import pandas as pd
import numpy as np

In [3]:
# A Series is the basic one-dimensional building block of pandas data.
# The np.nan entry is a missing value, so the integers are stored as float64.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Build a DatetimeIndex of six consecutive calendar days.
dates = pd.date_range(
    start='20130101',  # first date of the range
    periods=6,         # number of timestamps to generate
    freq='D',          # daily step (date_range's default, spelled out here)
)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# 6x4 frame of standard-normal draws: one row per date in `dates`,
# columns labelled 'A' through 'D'.
# The draws come from a seeded generator so that Restart & Run All yields the
# same frame every time; an unseeded np.random.randn call gives different
# numbers on each run.
# NOTE: the outputs recorded in this notebook were produced by an unseeded
# run, so the displayed numbers will change once the notebook is re-executed.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df.head()  # head() shows the first five of the six rows

Unnamed: 0,A,B,C,D
2013-01-01,-0.361638,-0.261838,2.572423,1.603783
2013-01-02,-0.527637,-0.289769,0.031491,-0.562616
2013-01-03,0.034664,-0.715684,0.766917,0.683118
2013-01-04,-0.338523,-1.174786,0.349022,1.530604
2013-01-05,0.60416,0.791224,-0.716117,0.563651


In [10]:
# Inspect the row index: the DatetimeIndex passed as `index=dates` when df was built.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df.info()   # structural summary: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [14]:
df.describe()       # per-column summary statistics: count, mean, std, min, quartiles, max

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.223413,-0.322987,0.591809,0.499927
std,0.48068,0.652284,1.099005,1.01852
min,-0.751501,-1.174786,-0.716117,-0.818975
25%,-0.486137,-0.609205,0.110874,-0.281049
50%,-0.350081,-0.288419,0.448069,0.623385
75%,-0.058633,-0.268145,0.711967,1.318732
max,0.60416,0.791224,2.572423,1.603783


In [16]:
# Sort the rows by the values of one column
df.sort_values(by='B', ascending=False)
                # by='B'          -> column used as the sort key
                # ascending=False -> descending order (largest 'B' first)

Unnamed: 0,A,B,C,D
2013-01-05,0.60416,0.791224,-0.716117,0.563651
2013-01-01,-0.361638,-0.261838,2.572423,1.603783
2013-01-06,-0.751501,-0.287068,0.547117,-0.818975
2013-01-02,-0.527637,-0.289769,0.031491,-0.562616
2013-01-03,0.034664,-0.715684,0.766917,0.683118
2013-01-04,-0.338523,-1.174786,0.349022,1.530604


## pandas slice

In [17]:
# Read a single column; the result is a Series named after that column
df['A']

2013-01-01   -0.361638
2013-01-02   -0.527637
2013-01-03    0.034664
2013-01-04   -0.338523
2013-01-05    0.604160
2013-01-06   -0.751501
Freq: D, Name: A, dtype: float64

In [19]:
df[0:3]     # positional row slice: rows 0 through 2 (end position 3 is excluded)

Unnamed: 0,A,B,C,D
2013-01-01,-0.361638,-0.261838,2.572423,1.603783
2013-01-02,-0.527637,-0.289769,0.031491,-0.562616
2013-01-03,0.034664,-0.715684,0.766917,0.683118


In [23]:
# Slice rows by a range of index labels (date strings on the DatetimeIndex).
df['20130102' : '20130104']     # unlike a positional slice, the end label is included

Unnamed: 0,A,B,C,D
2013-01-02,-0.527637,-0.289769,0.031491,-0.562616
2013-01-03,0.034664,-0.715684,0.766917,0.683118
2013-01-04,-0.338523,-1.174786,0.349022,1.530604


- loc 슬라이싱

In [24]:
df.loc[:, ['A','B']]
       # ':'        -> every row
       # ['A','B']  -> only columns 'A' and 'B'

Unnamed: 0,A,B
2013-01-01,-0.361638,-0.261838
2013-01-02,-0.527637,-0.289769
2013-01-03,0.034664,-0.715684
2013-01-04,-0.338523,-1.174786
2013-01-05,0.60416,0.791224
2013-01-06,-0.751501,-0.287068


In [27]:
df.loc['20130102' : '20130104',['A','B']]
        # rows whose index label lies in the given range (end label included),
        # restricted to columns 'A' and 'B'

Unnamed: 0,A,B
2013-01-02,-0.527637,-0.289769
2013-01-03,0.034664,-0.715684
2013-01-04,-0.338523,-1.174786


In [28]:
# A single row label (instead of a range) returns that row's 'A' and 'B' values as a Series.
df.loc['20130102', ['A','B']]

A   -0.527637
B   -0.289769
Name: 2013-01-02 00:00:00, dtype: float64

- iloc 슬라이싱
> iloc 인덱서를 이용하면 라벨이 아닌 정수 위치(번호)로만 행과 열에 접근한다.

In [30]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A   -0.338523
B   -1.174786
C    0.349022
D    1.530604
Name: 2013-01-04 00:00:00, dtype: float64

In [31]:
# Rows at positions 3-4 and columns at positions 0-1; end positions are excluded.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.338523,-1.174786
2013-01-05,0.60416,0.791224


In [32]:
# Lists of positions pick non-contiguous rows (1, 2, 4) and columns (0, 2).
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.527637,0.031491
2013-01-03,0.034664,0.766917
2013-01-05,0.60416,-0.716117


In [33]:
# All rows, columns at positions 1 and 2 ('B' and 'C').
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.261838,2.572423
2013-01-02,-0.289769,0.031491
2013-01-03,-0.715684,0.766917
2013-01-04,-1.174786,0.349022
2013-01-05,0.791224,-0.716117
2013-01-06,-0.287068,0.547117


## pandas 조건문 slicing

In [35]:
# Show the whole (six-row) frame for reference before filtering it.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.361638,-0.261838,2.572423,1.603783
2013-01-02,-0.527637,-0.289769,0.031491,-0.562616
2013-01-03,0.034664,-0.715684,0.766917,0.683118
2013-01-04,-0.338523,-1.174786,0.349022,1.530604
2013-01-05,0.60416,0.791224,-0.716117,0.563651
2013-01-06,-0.751501,-0.287068,0.547117,-0.818975


In [34]:
# Boolean-mask filtering: keep only the rows where column 'A' is greater than 0.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-03,0.034664,-0.715684,0.766917,0.683118
2013-01-05,0.60416,0.791224,-0.716117,0.563651


In [36]:
df[df>0]
    # Element-wise mask: every value that is not greater than 0 is shown as NaN.

Unnamed: 0,A,B,C,D
2013-01-01,,,2.572423,1.603783
2013-01-02,,,0.031491,
2013-01-03,0.034664,,0.766917,0.683118
2013-01-04,,,0.349022,1.530604
2013-01-05,0.60416,0.791224,,0.563651
2013-01-06,,,0.547117,


In [38]:
# Add a new column by assigning a list with one value per row
df['E'] = ['one','one','two','three','four','three']
df

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.361638,-0.261838,2.572423,1.603783,one
2013-01-02,-0.527637,-0.289769,0.031491,-0.562616,one
2013-01-03,0.034664,-0.715684,0.766917,0.683118,two
2013-01-04,-0.338523,-1.174786,0.349022,1.530604,three
2013-01-05,0.60416,0.791224,-0.716117,0.563651,four
2013-01-06,-0.751501,-0.287068,0.547117,-0.818975,three


In [43]:
# Membership test: True for each row whose 'E' value is one of the listed items
df['E'].isin(['two','four'])

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

In [45]:
# Keep only the rows whose 'E' value is in the list
df[df['E'].isin(['two','four'])]
    # df['E'].isin([...]) produces a boolean Series (one True/False per row);
# indexing df with that mask returns just the rows marked True.

Unnamed: 0,A,B,C,D,E
2013-01-03,0.034664,-0.715684,0.766917,0.683118,two
2013-01-05,0.60416,0.791224,-0.716117,0.563651,four


## 특정 컬럼 제거
- del 사용

In [46]:
# `del` removes column 'E' from df in place; the frame is then displayed to confirm.
del df['E']
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.361638,-0.261838,2.572423,1.603783
2013-01-02,-0.527637,-0.289769,0.031491,-0.562616
2013-01-03,0.034664,-0.715684,0.766917,0.683118
2013-01-04,-0.338523,-1.174786,0.349022,1.530604
2013-01-05,0.60416,0.791224,-0.716117,0.563651
2013-01-06,-0.751501,-0.287068,0.547117,-0.818975


## 각 컬럼 누적합

In [47]:
# Running (cumulative) sum down each column.
# DataFrame.cumsum is the built-in method for this and gives the same result
# as df.apply(np.cumsum) without routing through apply.
df.cumsum()

Unnamed: 0,A,B,C,D
2013-01-01,-0.361638,-0.261838,2.572423,1.603783
2013-01-02,-0.889275,-0.551606,2.603914,1.041167
2013-01-03,-0.854611,-1.26729,3.370831,1.724284
2013-01-04,-1.193135,-2.442076,3.719853,3.254888
2013-01-05,-0.588975,-1.650852,3.003736,3.81854
2013-01-06,-1.340475,-1.937921,3.550853,2.999565
