# Pandas Advanced
> INDEX
- Object Creation (객체 생성)
- Viewing Data (데이터 확인하기)
- Selection (선택)
- Missing Data (결측치)
- Operation (연산)

### 1. Object Creation (객체 생성)
- 데이터 구조 섹션 참조
- 값이 담긴 리스트를 전달하여 Series를 만들면,
Pandas가 정수 인덱스를 기본값으로 생성합니다.

In [1]:
import numpy as np
import pandas as pd

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range('20220601', periods = 6)
dates

DatetimeIndex(['2022-06-01', '2022-06-02', '2022-06-03', '2022-06-04',
               '2022-06-05', '2022-06-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index = dates, columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853
2022-06-06,-0.239616,0.574096,-1.303101,1.146494


In [8]:
# Build a DataFrame from column specs of mixed kinds: the scalar entries
# (A, B, F) are repeated for each of the 4 rows supplied by C, D and E.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20220601'),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['JB_Bank', 'KJ_Bank', 'JB_Bank', 'KJ_Bank']),
        F='BigpyCraft',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2022-06-01,1.0,3,JB_Bank,BigpyCraft
1,1.0,2022-06-01,1.0,3,KJ_Bank,BigpyCraft
2,1.0,2022-06-01,1.0,3,JB_Bank,BigpyCraft
3,1.0,2022-06-01,1.0,3,KJ_Bank,BigpyCraft


In [9]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### 2. Viewing Data (데이터 확인하기)

In [14]:
df.head()

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853


In [15]:
df.tail()

Unnamed: 0,A,B,C,D
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853
2022-06-06,-0.239616,0.574096,-1.303101,1.146494


In [16]:
df.tail(3)

Unnamed: 0,A,B,C,D
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853
2022-06-06,-0.239616,0.574096,-1.303101,1.146494


In [18]:
df.index

DatetimeIndex(['2022-06-01', '2022-06-02', '2022-06-03', '2022-06-04',
               '2022-06-05', '2022-06-06'],
              dtype='datetime64[ns]', freq='D')

In [19]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [20]:
df.values

array([[ 1.01668243, -0.85396188, -0.59675174, -0.31414842],
       [-1.23656085, -0.2118936 , -0.72750857,  0.84054264],
       [ 0.12425178, -1.20408274, -1.09850733, -1.04996152],
       [ 0.49956043, -1.27993974,  0.49184257, -0.53271789],
       [ 0.29631175,  0.53184959, -0.68128133, -0.97985308],
       [-0.2396157 ,  0.57409604, -1.30310115,  1.14649425]])

In [21]:
# NOTE(review): there are no call parentheses here, so this cell displays the
# bound method object itself (see the output below) rather than any counts;
# write df.value_counts() to actually compute the counts of unique rows.
df.value_counts

<bound method DataFrame.value_counts of                    A         B         C         D
2022-06-01  1.016682 -0.853962 -0.596752 -0.314148
2022-06-02 -1.236561 -0.211894 -0.727509  0.840543
2022-06-03  0.124252 -1.204083 -1.098507 -1.049962
2022-06-04  0.499560 -1.279940  0.491843 -0.532718
2022-06-05  0.296312  0.531850 -0.681281 -0.979853
2022-06-06 -0.239616  0.574096 -1.303101  1.146494>

In [22]:
df

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853
2022-06-06,-0.239616,0.574096,-1.303101,1.146494


In [24]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.076772,-0.407322,-0.652551,-0.148274
std,0.766496,0.834169,0.623103,0.931027
min,-1.236561,-1.27994,-1.303101,-1.049962
25%,-0.148649,-1.116553,-1.005758,-0.868069
50%,0.210282,-0.532928,-0.704395,-0.423433
75%,0.448748,0.345914,-0.617884,0.55187
max,1.016682,0.574096,0.491843,1.146494


In [25]:
df.T

Unnamed: 0,2022-06-01,2022-06-02,2022-06-03,2022-06-04,2022-06-05,2022-06-06
A,1.016682,-1.236561,0.124252,0.49956,0.296312,-0.239616
B,-0.853962,-0.211894,-1.204083,-1.27994,0.53185,0.574096
C,-0.596752,-0.727509,-1.098507,0.491843,-0.681281,-1.303101
D,-0.314148,0.840543,-1.049962,-0.532718,-0.979853,1.146494


In [26]:
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
2022-06-01,-0.314148,-0.596752,-0.853962,1.016682
2022-06-02,0.840543,-0.727509,-0.211894,-1.236561
2022-06-03,-1.049962,-1.098507,-1.204083,0.124252
2022-06-04,-0.532718,0.491843,-1.27994,0.49956
2022-06-05,-0.979853,-0.681281,0.53185,0.296312
2022-06-06,1.146494,-1.303101,0.574096,-0.239616


In [27]:
df.sort_values(by = 'B')

Unnamed: 0,A,B,C,D
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-05,0.296312,0.53185,-0.681281,-0.979853
2022-06-06,-0.239616,0.574096,-1.303101,1.146494


### 3. Selection (선택)
- Getting (데이터 얻기)

In [28]:
df['A']

2022-06-01    1.016682
2022-06-02   -1.236561
2022-06-03    0.124252
2022-06-04    0.499560
2022-06-05    0.296312
2022-06-06   -0.239616
Freq: D, Name: A, dtype: float64

In [29]:
df[1:4]

Unnamed: 0,A,B,C,D
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718


In [30]:
df['20220601':'20220605']

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853


In [31]:
df.loc[dates[0]]

A    1.016682
B   -0.853962
C   -0.596752
D   -0.314148
Name: 2022-06-01 00:00:00, dtype: float64

In [32]:
df.loc[dates[0],'A']

1.0166824276124056

In [33]:
df.loc['20220601', ['A','B']]

A    1.016682
B   -0.853962
Name: 2022-06-01 00:00:00, dtype: float64

In [34]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2022-06-01,1.016682,-0.853962
2022-06-02,-1.236561,-0.211894
2022-06-03,0.124252,-1.204083
2022-06-04,0.49956,-1.27994
2022-06-05,0.296312,0.53185
2022-06-06,-0.239616,0.574096


In [35]:
df.loc['20220601':'20220603',['A','B']]

Unnamed: 0,A,B
2022-06-01,1.016682,-0.853962
2022-06-02,-1.236561,-0.211894
2022-06-03,0.124252,-1.204083


In [36]:
df.iloc[3]

A    0.499560
B   -1.279940
C    0.491843
D   -0.532718
Name: 2022-06-04 00:00:00, dtype: float64

In [37]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2022-06-02,-1.236561,-0.727509
2022-06-03,0.124252,-1.098507
2022-06-05,0.296312,-0.681281


In [38]:
df.iloc[1:3,1:3]

Unnamed: 0,B,C
2022-06-02,-0.211894,-0.727509
2022-06-03,-1.204083,-1.098507


In [39]:
df.iloc[1,1]

-0.21189359664847437

In [40]:
df.iat[1,1]

-0.21189359664847437

- Boolean Indexing

In [41]:
df

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853
2022-06-06,-0.239616,0.574096,-1.303101,1.146494


In [42]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853


In [43]:
df[df > 0]

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,,,
2022-06-02,,,,0.840543
2022-06-03,0.124252,,,
2022-06-04,0.49956,,0.491843,
2022-06-05,0.296312,0.53185,,
2022-06-06,,0.574096,,1.146494


In [44]:
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962
2022-06-04,0.49956,-1.27994,0.491843,-0.532718
2022-06-05,0.296312,0.53185,-0.681281,-0.979853
2022-06-06,-0.239616,0.574096,-1.303101,1.146494


In [48]:
df is df2

False

In [49]:
df2['E'] = ['one','one','two','three','four','three']

In [50]:
df2

Unnamed: 0,A,B,C,D,E
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148,one
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543,one
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962,two
2022-06-04,0.49956,-1.27994,0.491843,-0.532718,three
2022-06-05,0.296312,0.53185,-0.681281,-0.979853,four
2022-06-06,-0.239616,0.574096,-1.303101,1.146494,three


In [51]:
# Keep only the rows whose 'E' value is 'two' or 'four'.
# Equivalent spelling with bracket access: df2[df2['E'].isin(['two','four'])]
df2[df2.E.isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962,two
2022-06-05,0.296312,0.53185,-0.681281,-0.979853,four


- Setting (설정)

In [52]:
# Integer Series 1..6 on a daily index starting 2022-06-01; this is the
# Series that is attached to df as column 'F' in the following cell.
s1 = pd.Series(data=range(1, 7), index=pd.date_range(start='20220601', periods=6))
s1

2022-06-01    1
2022-06-02    2
2022-06-03    3
2022-06-04    4
2022-06-05    5
2022-06-06    6
Freq: D, dtype: int64

In [53]:
df['F'] = s1

In [54]:
df

Unnamed: 0,A,B,C,D,F
2022-06-01,1.016682,-0.853962,-0.596752,-0.314148,1
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543,2
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962,3
2022-06-04,0.49956,-1.27994,0.491843,-0.532718,4
2022-06-05,0.296312,0.53185,-0.681281,-0.979853,5
2022-06-06,-0.239616,0.574096,-1.303101,1.146494,6


In [55]:
dates[0]

Timestamp('2022-06-01 00:00:00', freq='D')

In [56]:
# Changing a value, method 1: set it by label (row label, column label)
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,-0.853962,-0.596752,-0.314148,1
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543,2
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962,3
2022-06-04,0.49956,-1.27994,0.491843,-0.532718,4
2022-06-05,0.296312,0.53185,-0.681281,-0.979853,5
2022-06-06,-0.239616,0.574096,-1.303101,1.146494,6


In [57]:
# Changing a value, method 2: set it by integer position (row 0, column 1).
# Author's note: this approach feels more convenient.
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,-0.314148,1
2022-06-02,-1.236561,-0.211894,-0.727509,0.840543,2
2022-06-03,0.124252,-1.204083,-1.098507,-1.049962,3
2022-06-04,0.49956,-1.27994,0.491843,-0.532718,4
2022-06-05,0.296312,0.53185,-0.681281,-0.979853,5
2022-06-06,-0.239616,0.574096,-1.303101,1.146494,6


In [58]:
# Overwrite the whole 'D' column with an array of 5s, one per row.
# NOTE(review): the integer display below reflects the pandas version this
# notebook was run with; newer releases may set the values in place and keep
# the column as float (5.0) — confirm before re-running.
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,5,1
2022-06-02,-1.236561,-0.211894,-0.727509,5,2
2022-06-03,0.124252,-1.204083,-1.098507,5,3
2022-06-04,0.49956,-1.27994,0.491843,5,4
2022-06-05,0.296312,0.53185,-0.681281,5,5
2022-06-06,-0.239616,0.574096,-1.303101,5,6


In [59]:
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,5,1
2022-06-02,-1.236561,-0.211894,-0.727509,5,2
2022-06-03,0.124252,-1.204083,-1.098507,5,3
2022-06-04,0.49956,-1.27994,0.491843,5,4
2022-06-05,0.296312,0.53185,-0.681281,5,5
2022-06-06,-0.239616,0.574096,-1.303101,5,6


In [60]:
# Boolean-mask assignment: every strictly positive cell is replaced by its
# negation, so the frame ends up with no positive values (see output).
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,-5,-1
2022-06-02,-1.236561,-0.211894,-0.727509,-5,-2
2022-06-03,-0.124252,-1.204083,-1.098507,-5,-3
2022-06-04,-0.49956,-1.27994,-0.491843,-5,-4
2022-06-05,-0.296312,-0.53185,-0.681281,-5,-5
2022-06-06,-0.239616,-0.574096,-1.303101,-5,-6


### 4. Missing Data (결측치)
- Pandas는 결측치를 표현하기 위해 주로 np.nan 값을 사용합니다.
결측치는 기본적으로 계산에 포함되지 않습니다.
Missing data section을 참조
- Reindexing으로 지정된 축 상의 인덱스를 변경 / 추가 / 삭제할 수 있습니다.
Reindexing은 데이터의 복사본을 반환합니다.

In [61]:
df

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,5,1
2022-06-02,-1.236561,-0.211894,-0.727509,5,2
2022-06-03,0.124252,-1.204083,-1.098507,5,3
2022-06-04,0.49956,-1.27994,0.491843,5,4
2022-06-05,0.296312,0.53185,-0.681281,5,5
2022-06-06,-0.239616,0.574096,-1.303101,5,6


In [63]:
columns = list(df.columns) + ['E']
columns

['A', 'B', 'C', 'D', 'F', 'E']

In [64]:
df1 = df.reindex(index = dates[0:4], columns = columns)
df1

Unnamed: 0,A,B,C,D,F,E
2022-06-01,0.0,0.0,-0.596752,5,1,
2022-06-02,-1.236561,-0.211894,-0.727509,5,2,
2022-06-03,0.124252,-1.204083,-1.098507,5,3,
2022-06-04,0.49956,-1.27994,0.491843,5,4,


In [65]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2022-06-01,0.0,0.0,-0.596752,5,1,1.0
2022-06-02,-1.236561,-0.211894,-0.727509,5,2,1.0
2022-06-03,0.124252,-1.204083,-1.098507,5,3,
2022-06-04,0.49956,-1.27994,0.491843,5,4,


In [66]:
# 결측치 행들에 대한 처리
# Handling rows with missing values: drop every row that contains a NaN
df1.dropna(how = 'any')

Unnamed: 0,A,B,C,D,F,E
2022-06-01,0.0,0.0,-0.596752,5,1,1.0
2022-06-02,-1.236561,-0.211894,-0.727509,5,2,1.0


In [67]:
# Show df1 with every missing value replaced by 5 (column E in the output)
df1.fillna(value = 5)

Unnamed: 0,A,B,C,D,F,E
2022-06-01,0.0,0.0,-0.596752,5,1,1.0
2022-06-02,-1.236561,-0.211894,-0.727509,5,2,1.0
2022-06-03,0.124252,-1.204083,-1.098507,5,3,5.0
2022-06-04,0.49956,-1.27994,0.491843,5,4,5.0


In [68]:
# Boolean mask of the same shape as df1: True where the value is missing
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2022-06-01,False,False,False,False,False,False
2022-06-02,False,False,False,False,False,False
2022-06-03,False,False,False,False,False,True
2022-06-04,False,False,False,False,False,True


### 5. Operation (연산)
> Stats (통계)

- 일반적으로 결측치를 제외한 후 연산됩니다.
- 기술통계를 수행합니다.

In [69]:
df

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,5,1
2022-06-02,-1.236561,-0.211894,-0.727509,5,2
2022-06-03,0.124252,-1.204083,-1.098507,5,3
2022-06-04,0.49956,-1.27994,0.491843,5,4
2022-06-05,0.296312,0.53185,-0.681281,5,5
2022-06-06,-0.239616,0.574096,-1.303101,5,6


In [70]:
df.mean()

A   -0.092675
B   -0.264995
C   -0.652551
D    5.000000
F    3.500000
dtype: float64

In [72]:
df.mean(1)

2022-06-01    1.080650
2022-06-02    0.964807
2022-06-03    1.164332
2022-06-04    1.742293
2022-06-05    2.029376
2022-06-06    2.006276
Freq: D, dtype: float64

In [74]:
# .shift(2) moves the values two positions down the index: the first two
# slots become NaN and the last two original values (6, 8) drop off the end.
s = pd.Series([1,3,5,np.nan,6,8], index = dates).shift(2)
s

2022-06-01    NaN
2022-06-02    NaN
2022-06-03    1.0
2022-06-04    3.0
2022-06-05    5.0
2022-06-06    NaN
Freq: D, dtype: float64

In [75]:
# Subtract s from every column, matching on the row (date) index; rows where
# s is NaN come out as NaN across all columns (see output).
df.sub(s, axis = 'index')

Unnamed: 0,A,B,C,D,F
2022-06-01,,,,,
2022-06-02,,,,,
2022-06-03,-0.875748,-2.204083,-2.098507,4.0,2.0
2022-06-04,-2.50044,-4.27994,-2.508157,2.0,1.0
2022-06-05,-4.703688,-4.46815,-5.681281,0.0,0.0
2022-06-06,,,,,


In [76]:
df

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,5,1
2022-06-02,-1.236561,-0.211894,-0.727509,5,2
2022-06-03,0.124252,-1.204083,-1.098507,5,3
2022-06-04,0.49956,-1.27994,0.491843,5,4
2022-06-05,0.296312,0.53185,-0.681281,5,5
2022-06-06,-0.239616,0.574096,-1.303101,5,6


In [77]:
# Apply np.cumsum to each column: running total down the rows
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2022-06-01,0.0,0.0,-0.596752,5,1
2022-06-02,-1.236561,-0.211894,-1.32426,10,3
2022-06-03,-1.112309,-1.415976,-2.422768,15,6
2022-06-04,-0.612749,-2.695916,-1.930925,20,10
2022-06-05,-0.316437,-2.164066,-2.612206,25,15
2022-06-06,-0.556053,-1.58997,-3.915308,30,21


In [78]:
# Per-column range: largest value minus smallest value
df.apply(lambda x: x.max() - x.min())

A    1.736121
B    1.854036
C    1.794944
D    0.000000
F    5.000000
dtype: float64

> Histogramming (히스토그래밍)

- 도수분포표를 그래프로 나타낸 것
- 즉, 표로 되어 있는 도수 분포를 정보 그림으로 나타낸 것

In [82]:
s = pd.Series(np.random.randint(1,7, size = 10))
s

0    2
1    6
2    6
3    2
4    4
5    4
6    4
7    2
8    2
9    1
dtype: int32

In [83]:
s.value_counts()

2    4
4    3
6    2
1    1
dtype: int64

> String Methods (문자열 메소드)

- Series는 다음의 코드와 같이 문자열 처리 메소드 모음 (set)을 가지고 있습니다.
이 메소드들은 `str` 속성을 통해 제공되며, 배열의 각 요소를 쉽게 조작할 수 있게 해줍니다.
- 문자열의 패턴 일치 확인은 기본적으로 정규 표현식을 사용하며, 몇몇 경우에는 항상 정규 표현식을 사용함에 유의하십시오.
- 좀 더 자세한 내용은 벡터화된 문자열 메소드 부분에서 확인할 수 있습니다.

In [84]:
# Sample strings of mixed case, with one missing value, for the .str demos.
s = pd.Series(
    [
        'A', 'B', 'C',
        'Aaba', 'Baca',
        np.nan,
        'CABA', 'dog', 'cat',
    ]
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [85]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [86]:
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

In [87]:
# Swap case: uppercase letters become lowercase and vice versa
s.str.swapcase()

0       a
1       b
2       c
3    aABA
4    bACA
5     NaN
6    caba
7     DOG
8     CAT
dtype: object