#### 1.1 Pandas 데이터 타입

Pandas에서는 크게 2가지 데이터 타입이 존재한다.
- 1차원 데이터 : Series
- **2차원 데이터 : DataFrame**

---

##### [ 2차원 데이터 ]
→ **DataFrame**을 이용한다.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Row-oriented input: every inner list is one row of the table.
data = [
    [10, 20],
    [30, 40],
    [50, 60],
    [70, 80],
]
# Row labels and column labels are supplied explicitly.
df = pd.DataFrame(
    data=data,
    index=['aa', 'bb', 'cc', 'dd'],
    columns=['kor', 'eng'],
)
df

Unnamed: 0,kor,eng
aa,10,20
bb,30,40
cc,50,60
dd,70,80


In [4]:
# Record-oriented input: a list of dicts, one dict per row; the dict keys
# become the column labels and the rows get a default 0..n-1 index.
data = [
    {'kor': kor, 'eng': eng}
    for kor, eng in [(10, 20), (30, 40), (50, 60), (70, 80)]
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,eng,kor
0,20,10
1,40,30
2,60,50
3,80,70


In [6]:
# Column-oriented input: each key is a column label, each list that column's values.
data = dict(
    kor=[10, 20, 30, 40],
    eng=[20, 40, 60, 80],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,kor,eng
0,10,20
1,20,40
2,30,60
3,40,80


In [114]:
# Column-oriented input (one list of scores per subject) combined with
# explicit row labels passed through `index`.
data = dict(
    kor=[11, 26, 37, 48],
    eng=[23, 50, 64, 82],
    math=[61, 90, 34, 53],
)
df = pd.DataFrame(data=data, index=['aa', 'bb', 'cc', 'dd'])
df

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90
cc,37,64,34
dd,48,82,53


### DataFrame의 기본 성질

In [8]:
df.ndim # 2차원

2

In [9]:
df.shape # (rows, columns) tuple; here a matrix of 4 rows x 2 columns

(4, 2)

In [11]:
df.size # 4*2 = 8 개

8

In [12]:
df.columns

Index(['kor', 'eng'], dtype='object')

In [13]:
df.index

Index(['aa', 'bb', 'cc', 'dd'], dtype='object')

In [15]:
df.values # numpy 2차원 array

array([[10, 20],
       [20, 40],
       [30, 60],
       [40, 80]])

### index slicing

In [17]:
# df['aa'] would raise KeyError: [] with a single label looks up a column, not a row label
df['kor'] # selecting a single column returns a Series

aa    10
bb    20
cc    30
dd    40
Name: kor, dtype: int64

In [18]:
df.kor

aa    10
bb    20
cc    30
dd    40
Name: kor, dtype: int64

In [21]:
df[['kor','math']] # 복수개의 컬럼들을 선택할 경우...결과는 아직 데이터 프레임

Unnamed: 0,kor,math
aa,10,60
bb,20,50
cc,30,34
dd,40,50


In [23]:
df[0:3] # df[0] alone raises KeyError (there is no column named 0), but a slice inside [] selects rows

Unnamed: 0,kor,eng,math
aa,10,20,60
bb,20,40,50
cc,30,60,34


In [24]:
df[1:]

Unnamed: 0,kor,eng,math
bb,20,40,50
cc,30,60,34
dd,40,80,50


In [25]:
df['aa':'cc'] # slicing 은 row로 선택

Unnamed: 0,kor,eng,math
aa,10,20,60
bb,20,40,50
cc,30,60,34


In [26]:
df.iloc[0] # zero base index로 숫자를 넣어야함 결과는 series를 반환해줌

kor     10
eng     20
math    60
Name: aa, dtype: int64

In [27]:
df.loc['aa'] # 부여된 index로

kor     10
eng     20
math    60
Name: aa, dtype: int64

예전에 쓰이던 `ix` 인덱서는 더 이상 지원되지 않는다(pandas 1.0에서 제거됨). 라벨 기반 선택은 `loc`, 정수 위치 기반 선택은 `iloc`을 사용한다.

In [28]:
df.iloc[1:4]

Unnamed: 0,kor,eng,math
bb,20,40,50
cc,30,60,34
dd,40,80,50


In [29]:
df.loc['bb':'cc']

Unnamed: 0,kor,eng,math
bb,20,40,50
cc,30,60,34


In [31]:
df.iloc[1:,0] # kor 컬럼만 가져오게 됨

bb    20
cc    30
dd    40
Name: kor, dtype: int64

In [32]:
df.iloc[1:,0:2] # the row selector always comes first, followed by the column selector

Unnamed: 0,kor,eng
bb,20,40
cc,30,60
dd,40,80


In [33]:
df.iloc[1:,0::2] # kor와 math만 선택하고 싶다면

Unnamed: 0,kor,math
bb,20,50
cc,30,34
dd,40,50


In [34]:
df.iloc[1:,[0,2]] # 특정 컬럼을 선택하고 싶다면 list로 묶는다

Unnamed: 0,kor,math
bb,20,50
cc,30,34
dd,40,50


In [39]:
df.iloc[3,2]

50

In [36]:
df.loc['bb':,] # loc은 부여된 인덱스와 컬럼

Unnamed: 0,kor,eng,math
bb,20,40,50
cc,30,60,34
dd,40,80,50


In [37]:
df.loc[['bb','cc'],['kor','math']]

Unnamed: 0,kor,math
bb,20,50
cc,30,34


In [44]:
df1 = df+2 # 전체 개별 데이터에 2를 더해줌
df1

Unnamed: 0,kor,eng,math
aa,12,22,62
bb,22,42,52
cc,32,62,36
dd,42,82,52


In [45]:
df2 = df['kor']+2 # return 타입이 series
df2

aa    12
bb    22
cc    32
dd    42
Name: kor, dtype: int64

In [51]:
df1['kor'] = df['kor']+2
df1

Unnamed: 0,kor,eng,math
aa,12,22,62
bb,22,42,52
cc,32,62,36
dd,42,82,52


In [52]:
df1['kor'] = [1,2,3,4]
df1

Unnamed: 0,kor,eng,math
aa,1,22,62
bb,2,42,52
cc,3,62,36
dd,4,82,52


In [55]:
# NOTE: plain assignment does not copy. df1 and df are two names for the same
# DataFrame, so the write below also changes df (use df.copy() for an
# independent frame).
df1 = df
df1.iloc[0:2,1:] = 3 # set rows at positions 0-1, columns from position 1 onward, to 3
df1

Unnamed: 0,kor,eng,math
aa,10,3,3
bb,20,3,3
cc,30,60,34
dd,40,80,50


In [56]:
df[[True,False,False,True]] # row boolean indexing

Unnamed: 0,kor,eng,math
aa,10,3,3
dd,40,80,50


In [59]:
df['kor']>20 # return 값도 row에 대해서 True,False로

aa    False
bb    False
cc     True
dd     True
Name: kor, dtype: bool

In [60]:
df[df['kor']>20]

Unnamed: 0,kor,eng,math
cc,30,60,34
dd,40,80,50


In [64]:
df[(df['kor']==10)|(df['kor']==30)]['kor']

aa    10
cc    30
Name: kor, dtype: int64

In [67]:
df[(df['kor']>=10)&(df['kor']<=30)]

Unnamed: 0,kor,eng,math
aa,10,3,3
bb,20,3,3
cc,30,60,34


In [69]:
df.query('kor>20')# same result as df[df.kor > 20]

Unnamed: 0,kor,eng,math
cc,30,60,34
dd,40,80,50


In [70]:
df.query('kor == 20 or kor == 10') # query안에는 python 기본문법과 동일하다... 그래서 or을 쓴거임

Unnamed: 0,kor,eng,math
aa,10,3,3
bb,20,3,3


In [71]:
df.query('kor>=20 and kor<40')

Unnamed: 0,kor,eng,math
bb,20,3,3
cc,30,60,34


In [73]:
df

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90
cc,37,64,34
dd,48,82,53


In [77]:
df['sci'] = [58,34,67,78] # column이 없으면 추가, 있으면 수정
df

Unnamed: 0,kor,eng,math,sci
aa,11,23,61,58
bb,26,50,90,34
cc,37,64,34,67
dd,48,82,53,78


In [81]:
df1 = df+2
df1

Unnamed: 0,kor,eng,math,sci
aa,13,25,63,60
bb,28,52,92,36
cc,39,66,36,69
dd,50,84,55,80


In [83]:
df['sum'] = df['kor'] + df['eng'] # 각 개별값에 대해서 더해짐
df

Unnamed: 0,kor,eng,math,sci,sum
aa,11,23,61,58,34
bb,26,50,90,34,76
cc,37,64,34,67,101
dd,48,82,53,78,130


In [86]:
df.loc['dd'] = [67,87,45,34,45]
df

Unnamed: 0,kor,eng,math,sci,sum
aa,11,23,61,58,34
bb,26,50,90,34,76
cc,37,64,34,67,101
dd,67,87,45,34,45


In [88]:
df.loc['ee'] = [67,78,47,76,56] # 없으면 추가
df

Unnamed: 0,kor,eng,math,sci,sum
aa,11,23,61,58,34
bb,26,50,90,34,76
cc,37,64,34,67,101
dd,67,87,45,34,45
ee,67,78,47,76,56


In [90]:
# df[['sum','sci']]
# Add a new 'avg' column with one value per row (missing column -> created).
# Plain column assignment is used instead of df.loc[0:, 'avg']: .loc is
# label-based, and an integer slice bound on a string-labelled index is
# rejected by current pandas with a TypeError.
df['avg'] = [1, 2, 3, 4, 5]
df

Unnamed: 0,kor,eng,math,sci,sum,avg
aa,11,23,61,58,34,1
bb,26,50,90,34,76,2
cc,37,64,34,67,101,3
dd,67,87,45,34,45,4
ee,67,78,47,76,56,5


In [93]:
df = df[['kor','math']] # 이게 사실상 삭제
df

Unnamed: 0,kor,math
aa,11,61
bb,26,90
cc,37,34
dd,67,45
ee,67,47


In [96]:
df[['math','kor']] # column의 순서를 바꾼겨

Unnamed: 0,math,kor
aa,61,11
bb,90,26
cc,34,37
dd,45,67
ee,47,67


In [98]:
df

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90
cc,37,64,34
dd,48,82,53


In [99]:
# DataFrame.drop: remove a row by its index label
df = df.drop('bb') # drop() returns a new frame and leaves the original untouched, so rebind the result to df

Unnamed: 0,kor,eng,math
aa,11,23,61
cc,37,64,34
dd,48,82,53


In [103]:
df.drop('dd',inplace=True)
df

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90


In [105]:
df.drop(['aa','bb']) # pass a list to drop several rows at once (default axis=0: labels are row labels, not columns)

Unnamed: 0,kor,eng,math
cc,37,64,34
dd,48,82,53


In [111]:
df.drop('kor',axis = 1) # axis=1 drops a column; the string spelling is axis='columns' (or use df.drop(columns='kor'))

Unnamed: 0,eng,math
aa,23,61
bb,50,90
cc,64,34
dd,82,53


In [112]:
df.drop(['kor','eng'],axis = 1)

Unnamed: 0,math
aa,61
bb,90
cc,34
dd,53


In [115]:
df.loc['bb':, 'aaa'] = [2,3,4]
df.loc['cc':, 'bbb'] = [2,3]
df

Unnamed: 0,kor,eng,math,aaa,bbb
aa,11,23,61,,
bb,26,50,90,2.0,
cc,37,64,34,3.0,2.0
dd,48,82,53,4.0,3.0


In [116]:
# df1 is another name for the same DataFrame as df (no copy is made)
df1 = df
df1.dropna() # returns a new frame without the rows that contain any NaN; df1 itself is not modified

Unnamed: 0,kor,eng,math,aaa,bbb
cc,37,64,34,3.0,2.0
dd,48,82,53,4.0,3.0


In [118]:
df1.dropna(axis=1) # NaN이 있는 column 삭제

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90
cc,37,64,34
dd,48,82,53


In [119]:
df1.dropna(subset=['aaa']) # aaa에서 nan이 들어간 row만 삭제

Unnamed: 0,kor,eng,math,aaa,bbb
bb,26,50,90,2.0,
cc,37,64,34,3.0,2.0
dd,48,82,53,4.0,3.0


In [120]:
df1.dropna(subset=['aaa','bbb']) # aaa,bbb에서 nan 있는 row 삭제

Unnamed: 0,kor,eng,math,aaa,bbb
cc,37,64,34,3.0,2.0
dd,48,82,53,4.0,3.0


In [121]:
df1.fillna(0) # nan을 0으로 바꿔라

Unnamed: 0,kor,eng,math,aaa,bbb
aa,11,23,61,0.0,0.0
bb,26,50,90,2.0,0.0
cc,37,64,34,3.0,2.0
dd,48,82,53,4.0,3.0


In [122]:
# Fill NaN with 0 in column 'aaa' only; other columns keep their NaN.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the df1['aaa'] selection: that form is chained
# assignment, which recent pandas warns about and which, under
# Copy-on-Write, no longer updates df1 at all.
df1['aaa'] = df1['aaa'].fillna(0)
df1

Unnamed: 0,kor,eng,math,aaa,bbb
aa,11,23,61,0.0,
bb,26,50,90,2.0,
cc,37,64,34,3.0,2.0
dd,48,82,53,4.0,3.0


In [130]:
df.sort_values(by = 'kor') # 국어점수(kor) 기준으로 정렬해라

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90
cc,37,64,34
dd,48,82,53


In [131]:
df.sort_values(by = 'eng',ascending=False)

Unnamed: 0,kor,eng,math
dd,48,82,53
cc,37,64,34
bb,26,50,90
aa,11,23,61


In [132]:
df.sort_values(by = ['eng','math']) # 동률인 경우가 있는 경우엔 기준을 두가지로 한다

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90
cc,37,64,34
dd,48,82,53


---

### DataFrame

In [135]:
df

Unnamed: 0,kor,eng,math
aa,11,23,61
bb,26,50,90
cc,37,64,34
dd,48,82,53


In [133]:
df.max() # 각 column별로 최대값, axis = 0으로 기본

kor     48
eng     82
math    90
dtype: int64

In [134]:
df.max(axis = 1) # largest value in each row (the value itself; idxmax(axis=1) would give the column name)

aa    61
bb    90
cc    64
dd    82
dtype: int64

In [136]:
df.min()

kor     11
eng     23
math    34
dtype: int64

In [137]:
df.min(axis=1)

aa    11
bb    26
cc    34
dd    48
dtype: int64

In [139]:
# 전체 데이터에서 가장 큰 값
df.max().max()

90

In [140]:
df.sum() # 각 과목의 총합

kor     122
eng     219
math    238
dtype: int64

In [141]:
df.sum(axis=1)

aa     95
bb    166
cc    135
dd    183
dtype: int64

In [142]:
df['sum']=df.sum(axis=1)
df

Unnamed: 0,kor,eng,math,sum
aa,11,23,61,95
bb,26,50,90,166
cc,37,64,34,135
dd,48,82,53,183


In [143]:
df.sum().sum()

1158

In [144]:
df.mean()

kor      30.50
eng      54.75
math     59.50
sum     144.75
dtype: float64

In [145]:
df.mean(axis=1)

aa    47.5
bb    83.0
cc    67.5
dd    91.5
dtype: float64

In [146]:
df.std()

kor     15.800844
eng     24.891431
math    23.273733
sum     38.664152
dtype: float64

In [148]:
df.std(axis=1)

aa    38.170669
bb    61.307966
cc    46.978719
dd    62.814542
dtype: float64

In [149]:
df.median()

kor      31.5
eng      57.0
math     57.0
sum     150.5
dtype: float64

In [151]:
df.quantile()

kor      31.5
eng      57.0
math     57.0
sum     150.5
Name: 0.5, dtype: float64

In [154]:
df.quantile([0.25,0.5,0.75])

Unnamed: 0,kor,eng,math,sum
0.25,22.25,43.25,48.25,125.0
0.5,31.5,57.0,57.0,150.5
0.75,39.75,68.5,68.25,170.25


In [155]:
df.describe()

Unnamed: 0,kor,eng,math,sum
count,4.0,4.0,4.0,4.0
mean,30.5,54.75,59.5,144.75
std,15.800844,24.891431,23.273733,38.664152
min,11.0,23.0,34.0,95.0
25%,22.25,43.25,48.25,125.0
50%,31.5,57.0,57.0,150.5
75%,39.75,68.5,68.25,170.25
max,48.0,82.0,90.0,183.0
