# pandas

- 사이트
- [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html)

In [68]:
import pandas as pd
import numpy as np

## Pandas Series 

- A pandas Series is like a single column in a table.
- It is a one-dimensional array holding data of any type.

In [69]:
# A list mixing ints and a str produces a Series with dtype object;
# with no index given, the labels default to 0, 1, 2.
d = [1, 2, 'c']
ser = pd.Series(data=d)
ser

0    1
1    2
2    c
dtype: object

In [70]:
# Build a Series from a dict: the dict keys act as the index labels.
# The explicit index here lists the same labels as the keys, in the same order.
d = {'a': 1, 'b': 2, 'c': 3}
ser = pd.Series(data=d, index=['a', 'b', 'c'])
ser

a    1
b    2
c    3
dtype: int64

In [71]:
# Six consecutive calendar days starting 2023-01-01 (daily frequency);
# reused below as the row index of the DataFrame.
dates = pd.date_range('20230101', periods=6)
dates

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

## pandas DataFrame

- A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

In [72]:
# 6-by-4 array (6 rows, 4 columns) of floats drawn from the standard normal distribution.
# No seed is set in this cell, so the values change on every run.
np.random.randn(6, 4)

array([[ 0.15443702, -0.66550288, -0.7764728 ,  1.06500526],
       [-1.18621741,  0.39633308,  0.98580476,  1.12955402],
       [ 3.63172797, -0.13568625, -0.38281244,  2.29434068],
       [ 0.02962137,  1.52552808,  0.80056558, -0.98455915],
       [ 0.21780752,  1.0978452 , -0.01511833,  0.41818727],
       [ 0.2282557 ,  2.78066149,  0.51777441,  0.14982216]])

In [73]:
# Create a DataFrame from a freshly drawn 6-by-4 random array (not the one shown above),
# using `dates` as the row index and 'A'..'D' as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.955547,0.298039,-1.146234,0.758017
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724
2023-01-03,0.081839,2.604461,1.393615,-0.153449
2023-01-04,-0.828507,-1.883435,-1.204261,-0.143998
2023-01-05,-0.400983,1.382989,-0.007788,0.030533
2023-01-06,0.013255,0.520081,1.725528,-1.667882


### DataFrame 구조

In [74]:
# Confirm that df is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [75]:
# Column labels, stored as an Index object.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [76]:
# Row labels: the DatetimeIndex that was passed as index=dates when df was created.
df.index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [77]:
# The underlying data as a 2-D NumPy array, without row or column labels.
# NOTE: the pandas documentation recommends df.to_numpy() over .values in new code.
df.values

array([[-0.95554651,  0.29803933, -1.14623427,  0.75801718],
       [-0.34533564, -0.86525009, -1.88718101,  0.29572382],
       [ 0.08183873,  2.60446121,  1.39361502, -0.15344864],
       [-0.82850654, -1.88343548, -1.20426088, -0.143998  ],
       [-0.4009831 ,  1.38298889, -0.00778814,  0.03053343],
       [ 0.01325469,  0.52008131,  1.72552764, -1.66788161]])

### 기초 활용 함수

In [78]:
# Overview: summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.40588,0.342814,-0.18772,-0.146842
std,0.423788,1.590349,1.485516,0.819721
min,-0.955547,-1.883435,-1.887181,-1.667882
25%,-0.721626,-0.574428,-1.189754,-0.151086
50%,-0.373159,0.40906,-0.577011,-0.056732
75%,-0.076393,1.167262,1.043264,0.229426
max,0.081839,2.604461,1.725528,0.758017


In [79]:
# Sorting: order the rows by the values in column 'A', smallest first.
df.sort_values(by='A')  # ascending order is the default

Unnamed: 0,A,B,C,D
2023-01-01,-0.955547,0.298039,-1.146234,0.758017
2023-01-04,-0.828507,-1.883435,-1.204261,-0.143998
2023-01-05,-0.400983,1.382989,-0.007788,0.030533
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724
2023-01-06,0.013255,0.520081,1.725528,-1.667882
2023-01-03,0.081839,2.604461,1.393615,-0.153449


In [80]:
df.sort_values(by='A', ascending=False)  # descending sort

Unnamed: 0,A,B,C,D
2023-01-03,0.081839,2.604461,1.393615,-0.153449
2023-01-06,0.013255,0.520081,1.725528,-1.667882
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724
2023-01-05,-0.400983,1.382989,-0.007788,0.030533
2023-01-04,-0.828507,-1.883435,-1.204261,-0.143998
2023-01-01,-0.955547,0.298039,-1.146234,0.758017


### DataFrame Slice

#### Columns name

In [81]:
# A single column label inside [] returns that column as a Series.
df['A']

2023-01-01   -0.955547
2023-01-02   -0.345336
2023-01-03    0.081839
2023-01-04   -0.828507
2023-01-05   -0.400983
2023-01-06    0.013255
Freq: D, Name: A, dtype: float64

In [82]:
# A single selected column is a Series, not a DataFrame.
type(df['A'])

pandas.core.series.Series

#### Rows index

In [83]:
# Unlike a column label, a slice inside [] selects rows:
# here the first three rows (the end position 3 is excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2023-01-01,-0.955547,0.298039,-1.146234,0.758017
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724
2023-01-03,0.081839,2.604461,1.393615,-0.153449


#### loc[]

- Access a group of rows and columns by label(s) or a boolean array
- loc[] is primarily label based

In [84]:
# .loc selects by label: the row labelled '2023-01-01' comes back as a Series
# whose index is the column names.
df.loc['2023-01-01']  # row label is '2023-01-01'

A   -0.955547
B    0.298039
C   -1.146234
D    0.758017
Name: 2023-01-01 00:00:00, dtype: float64

In [85]:
# Passing the column as a one-item list returns a one-element Series.
df.loc['2023-01-01', ['A']]  # [row label, list of column labels]

A   -0.955547
Name: 2023-01-01 00:00:00, dtype: float64

In [86]:
# Same row, restricted to columns 'A' and 'B'.
df.loc['2023-01-01', ['A', 'B']]

A   -0.955547
B    0.298039
Name: 2023-01-01 00:00:00, dtype: float64

In [87]:
# Label slices in .loc include both endpoints: the 2023-01-05 row is part of the result.
df.loc['2023-01-01':'2023-01-05', ['A', 'B']]

Unnamed: 0,A,B
2023-01-01,-0.955547,0.298039
2023-01-02,-0.345336,-0.86525
2023-01-03,0.081839,2.604461
2023-01-04,-0.828507,-1.883435
2023-01-05,-0.400983,1.382989


In [88]:
df.loc[:, ['A', 'B']] # a bare `:` selects all rows

Unnamed: 0,A,B
2023-01-01,-0.955547,0.298039
2023-01-02,-0.345336,-0.86525
2023-01-03,0.081839,2.604461
2023-01-04,-0.828507,-1.883435
2023-01-05,-0.400983,1.382989
2023-01-06,0.013255,0.520081


#### iloc[]

- Purely integer-location based indexing for selection by position.
- iloc[] is primarily integer position based

In [89]:
# .iloc selects by integer position; the end of a slice is excluded.
df.iloc[0:2] # rows at positions 0 and 1

Unnamed: 0,A,B,C,D
2023-01-01,-0.955547,0.298039,-1.146234,0.758017
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724


In [90]:
df.iloc[[0, 1, 2], 0:2] # [row positions, column positions]

Unnamed: 0,A,B
2023-01-01,-0.955547,0.298039
2023-01-02,-0.345336,-0.86525
2023-01-03,0.081839,2.604461


In [91]:
# A list of row positions on its own selects those rows with all columns.
df.iloc[[0,1,2]]

Unnamed: 0,A,B,C,D
2023-01-01,-0.955547,0.298039,-1.146234,0.758017
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724
2023-01-03,0.081839,2.604461,1.393615,-0.153449


### DataFrame 내에서 조건문 사용

In [92]:
# Comparing a column with a scalar yields a boolean Series with the same index.
df['A']>0

2023-01-01    False
2023-01-02    False
2023-01-03     True
2023-01-04    False
2023-01-05    False
2023-01-06     True
Freq: D, Name: A, dtype: bool

In [93]:
# A boolean Series inside [] keeps only the rows where it is True.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2023-01-03,0.081839,2.604461,1.393615,-0.153449
2023-01-06,0.013255,0.520081,1.725528,-1.667882


In [94]:
# Comparing the whole DataFrame yields a boolean DataFrame of the same shape.
df>0

Unnamed: 0,A,B,C,D
2023-01-01,False,True,False,True
2023-01-02,False,False,False,True
2023-01-03,True,True,True,False
2023-01-04,False,False,False,False
2023-01-05,False,True,False,True
2023-01-06,True,True,True,False


In [95]:
# A boolean DataFrame inside [] keeps the shape of df;
# entries where the mask is False are shown as missing (NaN).
df[df>0]

Unnamed: 0,A,B,C,D
2023-01-01,,0.298039,,0.758017
2023-01-02,,,,0.295724
2023-01-03,0.081839,2.604461,1.393615,
2023-01-04,,,,
2023-01-05,,1.382989,,0.030533
2023-01-06,0.013255,0.520081,1.725528,


### 행과 열의 변경

In [96]:
# Build a new frame from df with the column list A, C, E, F. Nothing is renamed:
# the existing columns A and C are carried over, B and D are left out, and E and F,
# which do not exist in df at this point, are created and filled with NaN.
df2 = pd.DataFrame(df, columns=['A', 'C', 'E', 'F'])
df2

Unnamed: 0,A,C,E,F
2023-01-01,-0.955547,-1.146234,,
2023-01-02,-0.345336,-1.887181,,
2023-01-03,0.081839,1.393615,,
2023-01-04,-0.828507,-1.204261,,
2023-01-05,-0.400983,-0.007788,,
2023-01-06,0.013255,1.725528,,


In [97]:
# Create column 'E' on df and fill it with NaN (not a number) in every row.
# This modifies df in place, so earlier displays of df no longer show its current columns.
df['E'] = np.nan
df

Unnamed: 0,A,B,C,D,E
2023-01-01,-0.955547,0.298039,-1.146234,0.758017,
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724,
2023-01-03,0.081839,2.604461,1.393615,-0.153449,
2023-01-04,-0.828507,-1.883435,-1.204261,-0.143998,
2023-01-05,-0.400983,1.382989,-0.007788,0.030533,
2023-01-06,0.013255,0.520081,1.725528,-1.667882,


In [98]:
# Assign one cell by label: row '2023-01-02', column 'E'.
# The integer 2 is displayed as 2.0 in the resulting frame.
df.loc['2023-01-02', ['E']] = 2
df

Unnamed: 0,A,B,C,D,E
2023-01-01,-0.955547,0.298039,-1.146234,0.758017,
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724,2.0
2023-01-03,0.081839,2.604461,1.393615,-0.153449,
2023-01-04,-0.828507,-1.883435,-1.204261,-0.143998,
2023-01-05,-0.400983,1.382989,-0.007788,0.030533,
2023-01-06,0.013255,0.520081,1.725528,-1.667882,


In [99]:
# New column 'F' holds the row-by-row sum of columns 'A' and 'B'.
df['F'] = df['A'] + df['B']
df

Unnamed: 0,A,B,C,D,E,F
2023-01-01,-0.955547,0.298039,-1.146234,0.758017,,-0.657507
2023-01-02,-0.345336,-0.86525,-1.887181,0.295724,2.0,-1.210586
2023-01-03,0.081839,2.604461,1.393615,-0.153449,,2.6863
2023-01-04,-0.828507,-1.883435,-1.204261,-0.143998,,-2.711942
2023-01-05,-0.400983,1.382989,-0.007788,0.030533,,0.982006
2023-01-06,0.013255,0.520081,1.725528,-1.667882,,0.533336
