# pandas
- pandas 는 데이터 분석과 조작(전처리)을 위한 오픈소스 라이브러리
- Series 와 Datsframe 이라는 데이터 타입을 이용
- 데이터 필터링, 정렬, 그룹화, 결측처리, 시각화등


- Pandas 데이터 타입
    - series
        - index, value 로 이루어진 데이터 타입
    - DataFrame
        - indwx, values, colums으로 이루어진 데이터 타입
        - Series 데이터가 모이면 DataFrame
        - tabular, table

- 파이썬에서 쓰는 엑셀
- 스테로이드 맞은 엑셀

## 01. Series
- index, values

In [10]:
# !pip install pandas

In [11]:
import numpy as np
import pandas as pd

In [12]:
# 선언
series = pd.Series([1, 2, 3, 4])

In [13]:
type(series)

pandas.core.series.Series

In [14]:
# 오프셋 인덱스
series[0], series[3], series[1:3], type(series[2])

(1,
 4,
 1    2
 2    3
 dtype: int64,
 numpy.int64)

In [15]:
# 시리즈 내부 데이터 타입
pd.Series([1, 2, 3], dtype = np.float64)

0    1.0
1    2.0
2    3.0
dtype: float64

In [16]:
# 한 가지 데이터 타입만 인식
# object는 문자열 데이터를 의미
li = [1, 2, 3,'4']                # 첫째열 인덱스 둘째열 values
pd.Series(li)

0    1
1    2
2    3
3    4
dtype: object

In [17]:
series.index, series.values

(RangeIndex(start=0, stop=4, step=1), array([1, 2, 3, 4]))

In [18]:
# 형 변환
series.astype(np.float64)

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [19]:
series

0    1
1    2
2    3
3    4
dtype: int64

In [20]:
# index 설정
pd.Series(np.random.randint(10, size = 5), list('ABCDE'))

A    6
B    3
C    9
D    6
E    2
dtype: int64

In [21]:
pd.Series(np.random.randint(10, size = 5))

0    1
1    4
2    1
3    7
4    0
dtype: int64

In [22]:
# index 설정
data = pd.Series(np.random.randint(10, size = 5), list('ABCDE'))

In [23]:
data['A'], data.A, data[['A','B']]

(1,
 1,
 A    1
 B    3
 dtype: int64)

In [24]:
data[data > 5] = 999

In [25]:
# 브로드 캐스팅 연산
data*100

A      100
B      300
C        0
D    99900
E    99900
dtype: int64

## 02. DataFrame
- Series()
    - index, values
- DataFrame()
    - index, values, columns

In [26]:
dates = pd.date_range('20230804', periods = 6)

In [88]:
df = pd.DataFrame(data = np.random.randn(6,4), 
                  index = dates,
                  columns = ['A','B','C','D']
            )     # 맨위 오른쪽 컬럼

In [89]:
df

Unnamed: 0,A,B,C,D
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239
2023-08-06,-1.829692,0.19867,0.131028,-0.411709
2023-08-07,0.367424,0.392683,-0.277288,0.435405
2023-08-08,-1.088865,-1.456308,-0.386265,-0.502584
2023-08-09,1.423846,-0.304003,0.270196,-0.085229


In [90]:
# 데이터 프레임 상단 정보 확인
df.head(5) # 디폴트 값은 5

Unnamed: 0,A,B,C,D
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239
2023-08-06,-1.829692,0.19867,0.131028,-0.411709
2023-08-07,0.367424,0.392683,-0.277288,0.435405
2023-08-08,-1.088865,-1.456308,-0.386265,-0.502584


In [91]:
# 데이터 프레임 하단 정보 확인
df.tail(1)

Unnamed: 0,A,B,C,D
2023-08-09,1.423846,-0.304003,0.270196,-0.085229


In [92]:
# 데이터 프레임 기본 정보 요약
df. info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2023-08-04 to 2023-08-09
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 412.0 bytes


In [93]:
# 데이터 프레임 기술 통계 정보 요약
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.647509,-0.085369,-0.07453,-0.602719
std,1.285462,0.90186,0.573916,0.954454
min,-1.829692,-1.456308,-0.913057,-2.379809
25%,-1.586653,-0.476616,-0.359021,-0.629939
50%,-1.047023,-0.052666,-0.07313,-0.457147
75%,0.024273,0.344179,0.235404,-0.166849
max,1.423846,1.190899,0.728209,0.435405


In [94]:
# 데이터 프레임 구성 3요소
df.index, df.columns, df.values

(DatetimeIndex(['2023-08-04', '2023-08-05', '2023-08-06', '2023-08-07',
                '2023-08-08', '2023-08-09'],
               dtype='datetime64[ns]', freq='D'),
 Index(['A', 'B', 'C', 'D'], dtype='object'),
 array([[-1.75258234, -0.53415391,  0.72820882, -2.37980884],
        [-1.00518205,  1.19089947, -0.91305725, -0.67239001],
        [-1.82969224,  0.19866996,  0.13102843, -0.41170948],
        [ 0.36742438,  0.3926825 , -0.2772881 ,  0.43540462],
        [-1.0888649 , -1.45630847, -0.38626489, -0.50258408],
        [ 1.42384553, -0.30400251,  0.2701956 , -0.08522918]]))

In [95]:
df.A #데이터 타입 시리즈   시리즈가 모여있는게 데이터 프레임

2023-08-04   -1.752582
2023-08-05   -1.005182
2023-08-06   -1.829692
2023-08-07    0.367424
2023-08-08   -1.088865
2023-08-09    1.423846
Freq: D, Name: A, dtype: float64

In [96]:
df[['B', 'D']]

Unnamed: 0,B,D
2023-08-04,-0.534154,-2.379809
2023-08-05,1.190899,-0.67239
2023-08-06,0.19867,-0.411709
2023-08-07,0.392683,0.435405
2023-08-08,-1.456308,-0.502584
2023-08-09,-0.304003,-0.085229


In [97]:
# 오프셋 인덱스
# [n:m]
# n부터 m-1

df[0:2]

Unnamed: 0,A,B,C,D
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239


In [98]:
df['2023-08-04':'2023-08-06']

Unnamed: 0,A,B,C,D
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239
2023-08-06,-1.829692,0.19867,0.131028,-0.411709


In [99]:
# location
# loc

df.loc['2023-08-04':'2023-08-06', 'A':'C']

Unnamed: 0,A,B,C
2023-08-04,-1.752582,-0.534154,0.728209
2023-08-05,-1.005182,1.190899,-0.913057
2023-08-06,-1.829692,0.19867,0.131028


In [100]:
# int location
# iloc
df.iloc[0:2]

Unnamed: 0,A,B,C,D
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239


In [101]:
df

Unnamed: 0,A,B,C,D
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239
2023-08-06,-1.829692,0.19867,0.131028,-0.411709
2023-08-07,0.367424,0.392683,-0.277288,0.435405
2023-08-08,-1.088865,-1.456308,-0.386265,-0.502584
2023-08-09,1.423846,-0.304003,0.270196,-0.085229


In [102]:
# 데이터 정렬
# sort_values()

df.sort_values(by = 'A', ascending = False) # True 오름차순, False 내림차순

Unnamed: 0,A,B,C,D
2023-08-09,1.423846,-0.304003,0.270196,-0.085229
2023-08-07,0.367424,0.392683,-0.277288,0.435405
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239
2023-08-08,-1.088865,-1.456308,-0.386265,-0.502584
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-06,-1.829692,0.19867,0.131028,-0.411709


In [103]:
df.sort_values(by = ['A','B'], ascending = False)

Unnamed: 0,A,B,C,D
2023-08-09,1.423846,-0.304003,0.270196,-0.085229
2023-08-07,0.367424,0.392683,-0.277288,0.435405
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239
2023-08-08,-1.088865,-1.456308,-0.386265,-0.502584
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809
2023-08-06,-1.829692,0.19867,0.131028,-0.411709


In [104]:
# 데이터를 딕셔너리의 리스트로 생성
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [3, 2, 3, 1, 2],
    'C': [5, 4, 3 ,2, 1]
}

In [105]:
sort_study_df = pd.DataFrame(data)

In [106]:
# 데이터를 리스트의 딕셔너리로 생성
data = [
    {'A':1, 'B':3, 'C':5},
    {'A':1, 'B':3, 'C':5},
    {'A':1, 'B':3, 'C':5},
    {'A':1, 'B':3, 'C':5},
    {'A':1, 'B':3, 'C':5},
    {'A':1, 'B':3, 'C':5}
]

sort_study_df = pd.DataFrame(data)
    

In [107]:
sort_study_df.sort_values(by = ['B','C'])

Unnamed: 0,A,B,C
0,1,3,5
1,1,3,5
2,1,3,5
3,1,3,5
4,1,3,5
5,1,3,5


In [108]:
# 컬럼 추가
# 기존에 컬럼이 없으면 추가, 있으면 덮어쓰기
df['E'] = ['one', 'one', 'two', 'three', 'four', 'seven']

In [109]:
df

Unnamed: 0,A,B,C,D,E
2023-08-04,-1.752582,-0.534154,0.728209,-2.379809,one
2023-08-05,-1.005182,1.190899,-0.913057,-0.67239,one
2023-08-06,-1.829692,0.19867,0.131028,-0.411709,two
2023-08-07,0.367424,0.392683,-0.277288,0.435405,three
2023-08-08,-1.088865,-1.456308,-0.386265,-0.502584,four
2023-08-09,1.423846,-0.304003,0.270196,-0.085229,seven


In [110]:
# 데이터 삭제
# axis = 세로 axis = 0 가로
# in-palce 내부 내용을 바꾼다
df.drop('A', axis = 1, inplace=True) # 1은 세로를 지워준다


In [111]:
df.drop(['2023-08-06', '2023-08-09'], axis = 0)

Unnamed: 0,B,C,D,E
2023-08-04,-0.534154,0.728209,-2.379809,one
2023-08-05,1.190899,-0.913057,-0.67239,one
2023-08-07,0.392683,-0.277288,0.435405,three
2023-08-08,-1.456308,-0.386265,-0.502584,four


In [112]:
df.drop(columns=['B', 'D'])

Unnamed: 0,C,E
2023-08-04,0.728209,one
2023-08-05,-0.913057,one
2023-08-06,0.131028,two
2023-08-07,-0.277288,three
2023-08-08,-0.386265,four
2023-08-09,0.270196,seven


In [113]:
df.drop(index=['2023-08-04'])

Unnamed: 0,B,C,D,E
2023-08-05,1.190899,-0.913057,-0.67239,one
2023-08-06,0.19867,0.131028,-0.411709,two
2023-08-07,0.392683,-0.277288,0.435405,three
2023-08-08,-1.456308,-0.386265,-0.502584,four
2023-08-09,-0.304003,0.270196,-0.085229,seven


In [114]:
# E 컬럼 삭제후 저장
df.drop('E', axis = 1, inplace = True)

In [119]:
def plus_minus(values):
    return'plus' if values > 0 else 'minus'

In [120]:
df['B'] = df['B'].apply(plus_minus)

TypeError: '>' not supported between instances of 'str' and 'int'

In [121]:
df.loc['2023-08-06', 'C'] = 999

In [122]:
df

Unnamed: 0,B,C,D
2023-08-04,minus,0.728209,-2.379809
2023-08-05,plus,-0.913057,-0.67239
2023-08-06,plus,999.0,-0.411709
2023-08-07,plus,-0.277288,0.435405
2023-08-08,minus,-0.386265,-0.502584
2023-08-09,minus,0.270196,-0.085229


In [123]:
df.iloc

<pandas.core.indexing._iLocIndexer at 0x7f01f9b13560>

In [70]:
# left_df

# key 라는 이름의 컬럼
# key 컬럼의 값으로는 KO,K4,K2,K3

# A라는 이름의 칼럼
# A 컬럼의 값으로는 A0,A1,A2,A3

# B라는 이름의 컬럼
# B컬럼의 값으로는 B0, B1, B2, B3

In [71]:
left_df = pd.DataFrame({'key':[ K0,K1,K2,K3],'A':['A0', 'A1', 'A2','A3'})

SyntaxError: closing parenthesis '}' does not match opening parenthesis '[' (3479752655.py, line 1)

In [177]:
df

Unnamed: 0,B,C,D,c,E
2023-08-04,0.019709,1.766075,0.255431,,one
2023-08-05,-0.788071,1.271392,0.7103,,one
2023-08-06,1.463192,-1.306948,-0.876385,999.0,two
2023-08-07,1.766481,-1.278447,0.86637,,three
2023-08-08,0.312188,0.883246,-1.525695,,four
2023-08-09,-0.971244,0.23969,-0.204332,,seven


In [179]:
df['Key'] = ['KO','K1','K2','K3']

SyntaxError: invalid syntax (3179724749.py, line 1)

In [None]:
# rigth_df

#key 라는 이름의 컬럼
# KO,K1,K2,K3
# C라는 이름의 칼럼
# CO, C1, C2, C3
# D라는 이름의 컬럼
# D0, D1, D2, D3

In [181]:
left_df = pd.DataFrame([
    {'key':'KO','K1','K2','K3}],
    [{'C': 'CO', 'C1', 'C2', 'C3'}],
    [{'D': 'D0', 'D1', 'D2', 'D3'}]
                      )

SyntaxError: unterminated string literal (detected at line 2) (2336847640.py, line 2)

In [None]:
# merge
# 데이터 프레임을 가로로 병합
pd.merge(left_df, Rifht_df, on = 'key', how ='outer'))

In [None]:
# conract
# 세로로병합
# NaN
# Not a Number
# 누락데이터, 누락값
nan_df = pd.daopna()

In [None]:
nan_df = fillna(누락값)

In [None]:
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [3, 2, 3, 1, 2],
    'C': [5, 4, 3 ,2, 1]
}