#  Pandas 데이터 처리

## 개수 세기

In [2]:
import numpy as np
import pandas as pd

In [12]:
# Build a 4x4 frame of random integers in the range 0..4.
# No seed is set here, so the values differ on every run.
test_01 = pd.DataFrame(data=np.random.randint(low=0, high=5, size=(4, 4)))
test_01

Unnamed: 0,0,1,2,3
0,0,3,1,2
1,0,4,1,2
2,2,1,0,1
3,3,4,3,1


In [16]:
# Seed NumPy's global random generator so the frame below is reproducible,
# then draw a 4x5 grid of integers in the range 0..4.
np.random.seed(1)
test_01 = pd.DataFrame(data=np.random.randint(low=0, high=5, size=(4, 5)))
test_01

Unnamed: 0,0,1,2,3,4
0,3,4,0,1,3
1,0,0,1,4,4
2,1,2,4,2,4
3,3,4,2,4,2


[random.seed 자료 보기](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.html#numpy.random.RandomState)


In [17]:
# count() reports how many non-NA cells each column holds.
test_01.count()

0    4
1    4
2    4
3    4
4    4
dtype: int64

## 카테고리값 세기

In [18]:
# Reproducible Series of ten integers drawn from 0..5.
np.random.seed(1)
test_02 = pd.Series(data=np.random.randint(low=0, high=6, size=10))
test_02

0    5
1    3
2    4
3    0
4    1
5    3
6    5
7    0
8    0
9    1
dtype: int32

**pandas.DataFrame.tail**
- [tail() 자료 보기](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.tail.html)
- Returns last n rows
- Default: n=5

In [22]:
# Show the last rows of the Series; with no argument tail() uses n=5.
test_02.tail()

5    3
6    5
7    0
8    0
9    1
dtype: int32

**pandas.DataFrame.sort_index**
- [sort_index() 자료 보기](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_index.html)
- Sort object by labels (along an axis)
- sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None)

In [23]:
# Sort by index label in ascending order. The index is already 0..9,
# so the displayed order is the same as the original Series.
test_02.sort_index(ascending=True)

0    5
1    3
2    4
3    0
4    1
5    3
6    5
7    0
8    0
9    1
dtype: int32

## 행/열 합계

In [25]:
# Reproducible 4x8 frame of integers drawn from 0..9, used for the
# row/column sum examples.
np.random.seed(1)
test_02 = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(4, 8)))
test_02

Unnamed: 0,0,1,2,3,4,5,6,7
0,5,8,9,5,0,0,1,7
1,6,9,2,4,5,2,4,2
2,4,7,7,9,1,7,0,6
3,9,9,7,6,9,1,0,1


In [30]:
# axis=1 adds across the columns, giving one total per row.
test_02.sum(axis=1)

0    35
1    34
2    41
3    42
dtype: int64

In [31]:
# Sum each row, then store the result as a new column named 'sum'.
# NOTE: re-running this cell would include the existing 'sum' column
# in the new row sums.
test_02['sum'] = test_02.sum(axis=1)
test_02

Unnamed: 0,0,1,2,3,4,5,6,7,sum
0,5,8,9,5,0,0,1,7,35
1,6,9,2,4,5,2,4,2,34
2,4,7,7,9,1,7,0,6,41
3,9,9,7,6,9,1,0,1,42


In [32]:
# With the default axis, sum() adds down each column, including the
# 'sum' column itself.
test_02.sum()

0       24
1       33
2       25
3       24
4       15
5       10
6        5
7       16
sum    152
dtype: int64

In [37]:
# Sum each column, then append the result as a new row labelled 'total'.
# Any existing 'total' row is dropped before summing so that re-running
# this cell does not fold the previous totals into the new ones.
test_02.loc['total', :] = test_02.drop('total', errors='ignore').sum()
test_02

Unnamed: 0,0,1,2,3,4,5,6,7,sum
0,5.0,8.0,9.0,5.0,0.0,0.0,1.0,7.0,35.0
1,6.0,9.0,2.0,4.0,5.0,2.0,4.0,2.0,34.0
2,4.0,7.0,7.0,9.0,1.0,7.0,0.0,6.0,41.0
3,9.0,9.0,7.0,6.0,9.0,1.0,0.0,1.0,42.0
total,24.0,33.0,25.0,24.0,15.0,10.0,5.0,16.0,152.0


In [40]:
# Select the row whose index label is 'total'.
test_02.loc['total']

0       24.0
1       33.0
2       25.0
3       24.0
4       15.0
5       10.0
6        5.0
7       16.0
sum    152.0
Name: total, dtype: float64

In [34]:
# Select the column labelled 1 across all rows.
test_02.loc[:,1]

0         8.0
1         9.0
2         7.0
3         9.0
total    33.0
Name: 1, dtype: float64

## Apply() Method

**pandas.DataFrame.apply()**
- [더 알아보기](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html)
- Applies function along input axis of DataFrame
- Objects passed to functions are Series objects having index either the DataFrame’s index (axis=0) or the columns (axis=1). Return type depends on whether passed function aggregates, or the reduce argument if the DataFrame is empty.

In [45]:
# Small 5x3 integer frame used by the apply() examples; each inner list
# is one row in column order A, B, C.
apply_test_01 = pd.DataFrame(
    [
        [1, 2, 1],
        [3, 3, 5],
        [4, 1, 2],
        [3, 2, 4],
        [4, 3, 4],
    ],
    columns=['A', 'B', 'C'],
)
apply_test_01

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [46]:
# Largest value in each column.
apply_test_01.max()

A    4
B    3
C    5
dtype: int64

**lambda**
* [lambda 알아보기](http://www.secnetix.de/olli/Python/lambda_functions.hawk)
* [lambda 예시](https://stackoverflow.com/questions/10345278/understanding-lambda-in-python-and-using-it-to-pass-multiple-arguments)

In [33]:
# Apply to each column (the default axis): the lambda receives one column
# as a Series and returns its range, max - min.
apply_test_01.apply(lambda x: x.max() - x.min())

A    3
B    2
C    4
dtype: int64

In [34]:
# Apply to each row: with axis=1 the lambda receives one row as a Series
# and returns its range, max - min.
apply_test_01.apply(lambda x: x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [48]:
# Count how often each value occurs within every row (axis=1). Values that
# do not occur in a row show up as NaN in the combined frame.
# Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases.
apply_test_01.apply(pd.Series.value_counts, axis=1)

Unnamed: 0,1,2,3,4,5
0,2.0,1.0,,,
1,,,2.0,,1.0
2,1.0,1.0,,1.0,
3,,1.0,1.0,1.0,
4,,,1.0,2.0,


In [41]:
# Same per-row value counts, with the NaN cells replaced by 0 via fillna().
# Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases.
apply_test_01.apply(pd.Series.value_counts, axis=1).fillna(0)

Unnamed: 0,1,2,3,4,5
0,2.0,1.0,0.0,0.0,0.0
1,0.0,0.0,2.0,0.0,1.0
2,1.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,1.0,0.0
4,0.0,0.0,1.0,2.0,0.0
