In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the Titanic passenger example dataset through seaborn.
# NOTE(review): load_dataset presumably fetches/caches the file on first use — confirm network access.
df = sns.load_dataset("titanic")
# Preview the first rows to confirm the load worked.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### column 설명
* survived: 생존여부 (1: 생존, 0: 사망)
* pclass: 좌석 등급 (1등급, 2등급, 3등급)
* sex: 성별
* age: 나이
* sibsp: 형제 + 배우자 수
* parch: 부모 + 자녀 수
* fare: 좌석 요금
* embarked: 탑승 항구 (S, C, Q)
* class: pclass와 동일한 좌석 등급을 문자(First, Second, Third)로 표현
* who: 남자(man), 여자(woman), 아이(child)
* adult_male: 성인 남자 여부
* deck: 객실 데크 (알파벳 문자로 표기, 예: A, B, C)
* embark_town: 탑승 항구 이름
* alive: 생존여부 (yes, no)
* alone: 혼자 탑승 여부

# 1) 데이터 확인

### head() / tail()

In [3]:
# head() shows the first rows; tail() shows the last rows.
# With no argument, 5 rows are returned by default.
# Pass an integer in the parentheses to request a specific number of rows.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# Last 5 rows (the default count).
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [5]:
# Explicitly request only the first 3 rows.
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [6]:
# Explicitly request only the last 3 rows.
df.tail(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


### info()

In [7]:
# Print a per-column summary of the frame.
# Use it to check the non-null count and the data type (dtype) of each column;
# a non-null count below the total number of entries means the column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


### value_counts()

In [8]:
# Check the distribution of a column: how many rows hold each distinct value.
df['who'].value_counts()

who
man      537
woman    271
child     83
Name: count, dtype: int64

In [9]:
df['who'].value_counts(ascending=True)  # sort the counts in ascending order

who
child     83
woman    271
man      537
Name: count, dtype: int64

In [10]:
df['who'].value_counts(normalize=True)  # return proportions (relative frequencies) instead of raw counts

who
man      0.602694
woman    0.304153
child    0.093154
Name: proportion, dtype: float64

In [11]:
# On a continuous column the counts are spread over many distinct values,
# which is why the following cells bin the values first.
df['age'].value_counts()

age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [12]:
df['age'].value_counts(bins=3, sort=False)  # bin the continuous values into 3 intervals, then count the rows in each
# In the interval labels, '(' means the endpoint is excluded and ']' means it is included.

(0.339, 26.947]     319
(26.947, 53.473]    345
(53.473, 80.0]       50
Name: count, dtype: int64

In [13]:
df['age'].value_counts(bins=[0, 20, 40, 60, 100], sort=False)  # bin the continuous values using user-defined interval edges

(-0.001, 20.0]    179
(20.0, 40.0]      385
(40.0, 60.0]      128
(60.0, 100.0]      22
Name: count, dtype: int64

## 연습문제

embark_town은 승객의 탑승 항구를 나타내는 column 입니다. 탑승 항구별 승객 데이터 분포를 확인해 주세요.

# 2) 속성: Attributes

속성 값은 함수형으로 조회하지 않습니다. 즉, 뒤에 소괄호를 붙이지 않습니다. 

### ndim

In [14]:
df.ndim  # number of dimensions; a DataFrame gives 2

2

### shape

In [15]:
df.shape  # returned in (rows, columns) order

(891, 15)

### index

In [16]:
df.index  # the row index; here it is the default RangeIndex

RangeIndex(start=0, stop=891, step=1)

### columns

In [17]:
df.columns # the column labels

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

### values

In [18]:
# NOTE: the pandas documentation recommends DataFrame.to_numpy() over .values in new code.
df.values  # all cell values, returned as a NumPy array

array([[0, 3, 'male', ..., 'Southampton', 'no', False],
       [1, 1, 'female', ..., 'Cherbourg', 'yes', False],
       [1, 3, 'female', ..., 'Southampton', 'yes', True],
       ...,
       [0, 3, 'female', ..., 'Southampton', 'no', False],
       [1, 1, 'male', ..., 'Cherbourg', 'yes', True],
       [0, 3, 'male', ..., 'Queenstown', 'no', True]], dtype=object)

### T

In [19]:
df.T  # T (transpose) swaps the row index and the column axis

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,881,882,883,884,885,886,887,888,889,890
survived,0,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,1,0
pclass,3,1,3,1,3,3,1,3,3,2,...,3,3,2,3,3,2,1,3,1,3
sex,male,female,female,female,male,male,male,male,female,female,...,male,female,male,male,female,male,female,female,male,male
age,22.0,38.0,26.0,35.0,35.0,,54.0,2.0,27.0,14.0,...,33.0,22.0,28.0,25.0,39.0,27.0,19.0,,26.0,32.0
sibsp,1,1,0,1,0,0,0,3,0,1,...,0,0,0,0,0,0,0,1,0,0
parch,0,0,0,0,0,0,0,1,2,0,...,0,0,0,0,5,0,0,2,0,0
fare,7.25,71.2833,7.925,53.1,8.05,8.4583,51.8625,21.075,11.1333,30.0708,...,7.8958,10.5167,10.5,7.05,29.125,13.0,30.0,23.45,30.0,7.75
embarked,S,C,S,S,S,Q,S,S,S,C,...,S,S,S,S,Q,S,S,S,C,Q
class,Third,First,Third,First,Third,Third,First,Third,Third,Second,...,Third,Third,Second,Third,Third,Second,First,Third,First,Third
who,man,woman,woman,woman,man,man,man,child,woman,child,...,man,woman,man,man,woman,man,woman,woman,man,man


# 3) 타입 변환 (astype)

In [20]:
# Re-check the current dtypes before converting them with astype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [21]:
# Convert to int32. The result is not assigned back, so df itself keeps its original dtype.
df['pclass'].astype('int32').head()

0    3
1    1
2    3
3    1
4    3
Name: pclass, dtype: int32

각 자료형 별 표현할 수 있는 숫자의 범위 
* int8 : -128 ~ 127  
* int16 : -32768 ~ 32767  
* int32 : -2147483648 ~ 2147483647  
* int64 : -9223372036854775808 ~ 9223372036854775807  

In [22]:
# Convert to float32 (int holds whole numbers, float holds real numbers).
df['pclass'].astype('float32').head()

0    3.0
1    1.0
2    3.0
3    1.0
4    3.0
Name: pclass, dtype: float32

In [23]:
# Convert to str (text values).
df['pclass'].astype('str').head()

0    3
1    1
2    3
3    1
4    3
Name: pclass, dtype: object

In [24]:
# Convert to category.
df['pclass'].astype('category').head()  # for a category Series the list of Categories is printed as well

0    3
1    1
2    3
3    1
4    3
Name: pclass, dtype: category
Categories (3, int64): [1, 2, 3]

# 4) 정렬 (sort)

### sort_index

- sort_index: index 정렬

- index 기준으로 정렬합니다. (기본 오름차순이 적용되어 있습니다.)
- 내림차순 정렬을 적용하려면, ascending=False를 옵션 값으로 설정합니다.

In [25]:
# Sort rows by index label (ascending by default) and show the first 5.
df.sort_index().head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [26]:
# Sort rows by index label in descending order and show the first 5.
df.sort_index(ascending=False).head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True


### sort_values

- sort_values: 값에 대한 정렬
- 값을 기준으로 행을 정렬합니다.
- by에 기준이 되는 컬럼(열)을 설정합니다.
- by에 2개 이상의 컬럼을 지정하여 정렬할 수 있습니다.
- 오름차순/내림차순을 컬럼 별로 지정할 수 있습니다.

In [27]:
# Sort rows by the 'age' column (ascending by default) and show the first 5.
df.sort_values(by='age').head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False
755,1,2,male,0.67,1,1,14.5,S,Second,child,False,,Southampton,yes,False
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False


In [28]:
df.sort_values(by='age', ascending=False).head()  # descending sort: ascending=False

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [29]:
# Non-numeric columns can be sorted in ascending/descending order as well.
# NOTE(review): 'class' is a category column (see df.info()), so it is presumably ordered by its
# category order rather than by plain string comparison — verify with df['class'].cat.categories.
df.sort_values(by='class', ascending=False).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
511,0,3,male,,0,0,8.05,S,Third,man,True,,Southampton,no,True
500,0,3,male,17.0,0,0,8.6625,S,Third,man,True,,Southampton,no,True
501,0,3,female,21.0,0,0,7.75,Q,Third,woman,False,,Queenstown,no,True
502,0,3,female,,0,0,7.6292,Q,Third,woman,False,,Queenstown,no,True


In [30]:
df.sort_values(by=['fare', 'age']).head()  # sort on two or more columns: by 'fare' first, ties broken by 'age'

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
822,0,1,male,38.0,0,0,0.0,S,First,man,True,,Southampton,no,True
806,0,1,male,39.0,0,0,0.0,S,First,man,True,A,Southampton,no,True


In [31]:
df.sort_values(by=['fare', 'age'], ascending=[False, True]).head()  # direction can be set per column: 'fare' descending, 'age' ascending

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
88,1,1,female,23.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False


## 연습문제

* tips 데이터는 미국 레스토랑의 매출과 웨이터에게 지불한 팁을 나타내는 데이터입니다.

In [32]:
# Load the restaurant tips example dataset used by the exercises that follow.
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


* 연습문제1. total_bill과 tip에 대한 내림차순 정렬을 해주세요. 상위 10개만 출력하세요.

* 연습문제2. size를 기준으로 내림차순, tip을 기준으로는 오름차순 정렬을 해주세요. 상위 10개의 데이터만 출력하세요.

# 5) Indexing / Slicing / 조건 필터링

In [33]:
# Reminder of the frame's rows and columns before the indexing examples.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


###  5-1) loc - indexing / slicing

- indexing과 slicing을 할 수 있습니다.
- slicing은 [시작(포함): 끝(포함)] 규칙에 유의합니다. 둘 다 포함 합니다.

In [34]:
df.loc[0, 'class']  # indexing example: one value selected by row label and column label

'Third'

In [35]:
df.loc[0, ['age', 'fare', 'who']]  # fancy indexing example: one row label, a list of column labels

age     22.0
fare    7.25
who      man
Name: 0, dtype: object

In [36]:
# Slicing example: with loc both the start and the end label are included,
# so this returns rows 2..5 and columns 'class'..'deck'.
# (The slice already has only 4 rows, so no head() call is needed to shorten it.)
df.loc[2:5, 'class':'deck']

Unnamed: 0,class,who,adult_male,deck
2,Third,woman,False,
3,First,woman,False,C
4,Third,man,True,
5,Third,man,True,


In [37]:
# An omitted start means "from the first row"; the end label 6 is included.
df.loc[:6, 'class':'deck']

Unnamed: 0,class,who,adult_male,deck
0,Third,man,True,
1,First,woman,False,C
2,Third,woman,False,
3,First,woman,False,C
4,Third,man,True,
5,Third,man,True,
6,First,man,True,E


### loc - 조건 필터
* boolean index를 만들어 조건에 맞는 데이터만 추출해 낼 수 있습니다.

In [38]:
# Build a boolean mask: True for rows whose age is at least 70.
cond = (df['age'] >= 70)
cond

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: age, Length: 891, dtype: bool

In [39]:
# Keep only the rows where the mask is True.
df.loc[cond]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
672,0,2,male,70.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
745,0,1,male,70.0,1,1,71.0,S,First,man,True,B,Southampton,no,False
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True


### loc - 다중 조건
* 다중 조건은 먼저 condition을 정의하고 & 와 | 연산자로 복합 조건을 생성합니다.

In [40]:
# Condition 1: fare greater than 30
cond1 = (df['fare'] > 30)

# Condition 2: 'who' equals 'woman'
cond2 = (df['who'] == 'woman')

In [41]:
# AND: rows that satisfy both conditions.
df.loc[cond1 & cond2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
25,1,3,female,38.0,1,5,31.3875,S,Third,woman,False,,Southampton,yes,False
31,1,1,female,,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
52,1,1,female,49.0,1,0,76.7292,C,First,woman,False,D,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,1,1,female,16.0,0,1,39.4000,S,First,woman,False,D,Southampton,yes,False
856,1,1,female,45.0,1,1,164.8667,S,First,woman,False,,Southampton,yes,False
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False


In [42]:
# OR: rows that satisfy at least one of the conditions.
df.loc[cond1 | cond2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


### loc 조건필터 후 대입

In [43]:
# Boolean mask for age >= 70, used by the filter-then-assign example.
cond = (df['age'] >= 70)
cond

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: age, Length: 891, dtype: bool

In [44]:
# Filter rows with the boolean mask
df.loc[cond]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
672,0,2,male,70.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
745,0,1,male,70.0,1,1,71.0,S,First,man,True,B,Southampton,no,False
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True


In [45]:
df.loc[cond, 'age']  # select only the 'age' column of the matching rows

96     71.0
116    70.5
493    71.0
630    80.0
672    70.0
745    70.0
851    74.0
Name: age, dtype: float64

In [46]:
# After filtering, a value can be assigned to the selection (note that a single column is selected).
# NOTE: this modifies df in place, so every later cell sees age == -1 for these rows, and
# rebuilding `cond` from df after this point would no longer match them.
df.loc[cond, 'age'] = -1

In [47]:
# `cond` was computed before the assignment, so it still selects the rows whose age was just overwritten.
df.loc[cond]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
96,0,1,male,-1.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,-1.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
493,0,1,male,-1.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
630,1,1,male,-1.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
672,0,2,male,-1.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
745,0,1,male,-1.0,1,1,71.0,S,First,man,True,B,Southampton,no,False
851,0,3,male,-1.0,0,0,7.775,S,Third,man,True,,Southampton,no,True


### 5-2) iloc

loc와 유사하지만, 라벨이 아닌 정수 위치(position)만 허용합니다.
loc와 마찬가지로, indexing / slicing 모두 가능합니다. 단, slicing은 [시작(포함): 끝(미포함)] 규칙을 따릅니다.

In [48]:
# Reminder of the row and column order before the iloc examples.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [49]:
df.iloc[1, 3]  # indexing: row position 1, column position 3

38.0

In [50]:
df.iloc[[0, 3, 4], [0, 1, 5, 6]]  # fancy indexing: lists of row positions and column positions

Unnamed: 0,survived,pclass,parch,fare
0,0,3,0,7.25
3,1,1,0,53.1
4,0,3,0,8.05


In [51]:
df.iloc[:3, :5]  # slicing: first 3 rows and first 5 columns (the end position itself is excluded)

Unnamed: 0,survived,pclass,sex,age,sibsp
0,0,3,male,22.0,1
1,1,1,female,38.0,1
2,1,3,female,26.0,0


### 5-3) isin

In [52]:
# Use isin() to test whether each value is one of a given set of values.
# Python's `in` keyword is not a substitute: on a Series it checks the index labels, not the values.

In [53]:
# Small demo frame built record by record; each tuple is one (name, age) row.
sample = pd.DataFrame(
    [('kim', 24), ('lee', 27), ('park', 34), ('choi', 19)],
    columns=['name', 'age'],
)
sample

Unnamed: 0,name,age
0,kim,24
1,lee,27
2,park,34
3,choi,19


In [54]:
# Series.isin gives one boolean per row: True where the name is in the list.
sample['name'].isin(['kim', 'lee'])

0     True
1     True
2    False
3    False
Name: name, dtype: bool

In [55]:
# DataFrame.isin checks every cell of every column against the list.
sample.isin(['kim', 'lee'])

Unnamed: 0,name,age
0,True,False
1,True,False
2,False,False
3,False,False


In [56]:
condition = sample['name'].isin(['kim', 'lee'])  # the boolean result works well as a loc row filter

In [57]:
# Keep only the rows whose name is in the list.
sample.loc[condition]

Unnamed: 0,name,age
0,kim,24
1,lee,27


## 연습문제

In [58]:
# Reload the tips dataset so this exercise starts from a fresh, unmodified frame.
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


* tips 데이터셋 중 day가 금요일(Fri), 토요일(Sat) 만 필터링 합니다.
* tip이 $10보다 적게 낸 데이터만 필터링합니다.
* 컬럼은 total_bill, tip, smoker, time만 출력합니다.
* 상위 10개 행만 출력합니다.