# DataFrame의 다양한 응용
- 함수 Mapping
- 열 재구성
- Flitering
- 데이터 프레임 합치기 
- 그룹연산
- Multi Index 
- Pivot 

### 함수 Mapping 
- Series 또는 Dataframe의 Data를 특정함수에 일대일 대응 시키는 과정

In [1]:
import seaborn as sns

In [2]:
titanic = sns.load_dataset("titanic")

In [4]:
# titanic 데이터셋에서 age, fare 2개 열을 선택하여 데이터 프레임 만들기 
df = titanic[['age','fare']]

In [6]:
df['ten'] =10
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ten'] =10


Unnamed: 0,age,fare,ten
0,22.0,7.25,10
1,38.0,71.2833,10
2,26.0,7.925,10
3,35.0,53.1,10
4,35.0,8.05,10


In [9]:
# 사용자 함수 정의 
# 10을 더하는 함수 
def add_10(n):
    return n + 10
# a + b 
def add_two_obj(a,b):
    return a + b

In [10]:
print(add_10(10))
print(add_two_obj(10,15))

20
25


In [12]:
# 시리즈 객체에 적용
# 나이에 + 10 
sr1 = df['age'].apply(add_10)
sr1.head()

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

In [13]:
# 시리즈 객체에 숫자 적용
sr2 = df['age'].apply(add_two_obj, b= 10)
# a = add_two_obj 
sr2.head()

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

---
### 람다함수 
- 시리즈 객체에 적용
- for문 같이 복잡한 식이 아닌 간단한 식에 적용 가능

In [15]:
sr3 = df['age'].apply(lambda x: x + 10 )
sr3.head()

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

In [16]:
# 더 많이 쓰는 보편적인 방법 
sr4 = df['age'].apply(lambda x: add_10(x) )
sr4.head()

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

In [17]:
# DataFrame 에 함수 Mapping
# applymap 모든 칼럼에 적용 
df_map = df.applymap(add_10)
df_map.head()

Unnamed: 0,age,fare,ten
0,32.0,17.25,20
1,48.0,81.2833,20
2,36.0,17.925,20
3,45.0,63.1,20
4,45.0,18.05,20


In [18]:
# 사용자 정의 함수 
# nan 값 찾기 
def missing_value(series):
    return series.isnull()

In [19]:
df.applymap(missing_value)

AttributeError: 'float' object has no attribute 'isnull'

In [20]:
# type이 달라 하나씩 찾는 apply 가능 
df.apply(missing_value)

Unnamed: 0,age,fare,ten
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
886,False,False,False
887,False,False,False
888,True,False,False
889,False,False,False


> apply는 column별(series)로 함수를 실행하나, applymap은 dataframe 전체로 함수 실행

In [23]:
# DataFrame의 각 열의 최대값과 최소값의 차이를 출력
# 사용자 정의 
def min_max(series):
    return series.max() - series.min()


In [25]:
result =df.apply(min_max)
print(result)

age      79.5800
fare    512.3292
ten       0.0000
dtype: float64


In [28]:
# lambda 사용 
#df['add'] = df['age'].apply(lambda x: add_10(x) )
df['add'] = df.apply(lambda x: add_two_obj(x['age'],x['ten']),axis =1 )

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['add'] = df.apply(lambda x: add_two_obj(x['age'],x['ten']),axis =1 )


Unnamed: 0,age,fare,ten,add
0,22.0,7.25,10,32.0
1,38.0,71.2833,10,48.0
2,26.0,7.925,10,36.0
3,35.0,53.1,10,45.0
4,35.0,8.05,10,45.0


---
## pipe() 사용하기 

In [29]:
df = titanic[['age','fare']]
df.head()

Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05


In [30]:
# 각 열의 NaN 찾기
def missing_value(x):
    return x.isnull()

In [31]:
# 각 열의 NaN 갯수 변환
def missing_count(x):
    return missing_value(x).sum()

In [32]:
# 총 NaN 의 갯수 
def total_number_missing(x):
    return missing_count(x).sum()

In [33]:
df.pipe(missing_value)

Unnamed: 0,age,fare
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
886,False,False
887,False,False
888,True,False
889,False,False


In [34]:
df.pipe(missing_count)

age     177
fare      0
dtype: int64

In [35]:
df.pipe(total_number_missing)

177

---
# 열 재구성

In [38]:
# titanic 
df = titanic[['survived','pclass','sex','age']]
df.head()

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


In [42]:
# 열 이름의 리스트 만들기
columns = list(df.columns)
columns
type(columns)

list

In [40]:
df.columns

Index(['survived', 'pclass', 'sex', 'age'], dtype='object')

In [45]:
# 열 이름을 알파벳 순으로 정렬
columns_sorted = sorted(columns)
columns_sorted 

['age', 'pclass', 'sex', 'survived']

In [46]:
df[columns_sorted]

Unnamed: 0,age,pclass,sex,survived
0,22.0,3,male,0
1,38.0,1,female,1
2,26.0,3,female,1
3,35.0,1,female,1
4,35.0,3,male,0
...,...,...,...,...
886,27.0,2,male,0
887,19.0,1,female,1
888,,3,female,0
889,26.0,1,male,1


In [49]:
# 열 이름을 기존순서와 정반대 순서로 정렬하기 
columns_sorted2 = reversed(columns)
df[columns_sorted2]

Unnamed: 0,age,sex,pclass,survived
0,22.0,male,3,0
1,38.0,female,1,1
2,26.0,female,3,1
3,35.0,female,1,1
4,35.0,male,3,0
...,...,...,...,...
886,27.0,male,2,0
887,19.0,female,1,1
888,,female,3,0
889,26.0,male,1,1


In [52]:
columns_customed  = ['pclass','sex','age','survived']
columns_customed 

['pclass', 'sex', 'age', 'survived']

In [53]:
df[columns_customed]

Unnamed: 0,pclass,sex,age,survived
0,3,male,22.0,0
1,1,female,38.0,1
2,3,female,26.0,1
3,1,female,35.0,1
4,3,male,35.0,0
...,...,...,...,...
886,2,male,27.0,0
887,1,female,19.0,1
888,3,female,,0
889,1,male,26.0,1


### 열분리 
- 하나의 열이 여러가지 정보를 담고 있을 때 각 정보를 서로 분리하여 사용 

In [54]:
import pandas as pd

In [55]:
df = pd.read_excel("../Data/주가데이터.xlsx")
df.head()

Unnamed: 0,연월일,당일종가,전일종가,시가,고가,저가,거래량
0,2018-07-02,10100,600,10850,10900,10000,137977
1,2018-06-29,10700,300,10550,10900,9990,170253
2,2018-06-28,10400,500,10900,10950,10150,155769
3,2018-06-27,10900,100,10800,11050,10500,133548
4,2018-06-26,10800,350,10900,11000,10700,63039


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   연월일     20 non-null     datetime64[ns]
 1   당일종가    20 non-null     int64         
 2   전일종가    20 non-null     int64         
 3   시가      20 non-null     int64         
 4   고가      20 non-null     int64         
 5   저가      20 non-null     int64         
 6   거래량     20 non-null     int64         
dtypes: datetime64[ns](1), int64(6)
memory usage: 1.2 KB


In [57]:
# 년 , 월 , 일 데이터 분리하기 
df['연월일'] = df['연월일'].astype("str")
dates = df['연월일'].str.split('-')
dates.head()

0    [2018, 07, 02]
1    [2018, 06, 29]
2    [2018, 06, 28]
3    [2018, 06, 27]
4    [2018, 06, 26]
Name: 연월일, dtype: object

In [60]:
# year
dates[0][0]  


'2018'

In [61]:
df['연'] = dates.str.get(0)
df['월'] = dates.str.get(1)
df['일'] = dates.str.get(2)
df.head()

Unnamed: 0,연월일,당일종가,전일종가,시가,고가,저가,거래량,연,월,일
0,2018-07-02,10100,600,10850,10900,10000,137977,2018,7,2
1,2018-06-29,10700,300,10550,10900,9990,170253,2018,6,29
2,2018-06-28,10400,500,10900,10950,10150,155769,2018,6,28
3,2018-06-27,10900,100,10800,11050,10500,133548,2018,6,27
4,2018-06-26,10800,350,10900,11000,10700,63039,2018,6,26


---
### Filering
- 특정 조건식을 만족하는 Data만 따로 추출하는 방법

In [76]:
# 나이가 10대(10~19세)인 승객만 따로 선택
mask1 = (titanic.age >= 10)  & (titanic.age < 20)
df_teenage = titanic.loc[mask1, :]
df_teenage.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
14,0,3,female,14.0,0,0,7.8542,S,Third,child,False,,Southampton,no,True
22,1,3,female,15.0,0,0,8.0292,Q,Third,child,False,,Queenstown,yes,True
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
38,0,3,female,18.0,2,0,18.0,S,Third,woman,False,,Southampton,no,False


In [77]:
# 나이가 10세(0~ 9) 미만이고 여성인 고객만 출력 
mask2 = (titanic.age < 9)  & (titanic.age >=  0)  & (titanic.sex =="female")
df_female_under10 = titanic.loc[mask2, :]
df_female_under10.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
24,0,3,female,8.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
43,1,2,female,3.0,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
58,1,2,female,5.0,1,2,27.75,S,Second,child,False,,Southampton,yes,False
119,0,3,female,2.0,4,2,31.275,S,Third,child,False,,Southampton,no,False


In [80]:
# 나이가 10시 미만( 0 ~9세) 또는 60세 이상의 승객의 age,sex,alone열만 선택 

mask3 = (titanic['age'] < 10) | (titanic['age'] >= 60)
df_under10_moretan60 = titanic[mask3][['age','sex','alone']]
df_under10_moretan60.head()


Unnamed: 0,age,sex,alone
7,2.0,male,False
10,4.0,female,False
16,2.0,male,False
24,8.0,female,False
33,66.0,male,True


### isin()

In [81]:
isin_filter = titanic['sibsp'].isin([3,4,5])
titanic[isin_filter].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
16,0,3,male,2.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
24,0,3,female,8.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
50,0,3,male,7.0,4,1,39.6875,S,Third,child,False,,Southampton,no,False


---
### Data Frame 병합 
- SQL의 Join과 비슷한 방식으로 어떤 기준에 의해 병합하는 개념
- 이때 기준이 되는 열이나 index를 Key라고 한다.
- key가 되는 열이나 index는 반드시 양쪽 DataFrame 에 존재해야 한다.

## 08번 DataFrame 정제 참고

In [88]:
# 주식 데이터를 가져와서 데이터프레임 만들기 
df1 = pd.read_excel("../Data/stock price.xlsx")
df1

Unnamed: 0,id,stock_name,value,price
0,128940,한미약품,59385.666667,421000
1,130960,CJ E&M,58540.666667,98900
2,138250,엔에스쇼핑,14558.666667,13200
3,139480,이마트,239230.833333,254500
4,142280,녹십자엠에스,468.833333,10200
5,145990,삼양사,82750.0,82000
6,185750,종근당,40293.666667,100500
7,192400,쿠쿠홀딩스,179204.666667,177500
8,199800,툴젠,-2514.333333,115400
9,204210,모두투어리츠,3093.333333,3475


- id : 종목코드 
- stock_name : 회사이름
- vale : 시가총액
- price : 주가

In [89]:
df2 = pd.read_excel("../Data/stock valuation.xlsx")
df2

Unnamed: 0,id,name,eps,bps,per,pbr
0,130960,CJ E&M,6301.333333,54068,15.695091,1.829178
1,136480,하림,274.166667,3551,11.489362,0.887074
2,138040,메리츠금융지주,2122.333333,14894,6.313806,0.899691
3,139480,이마트,18268.166667,295780,13.931338,0.860437
4,145990,삼양사,5741.0,108090,14.283226,0.758627
5,161390,한국타이어,5648.5,51341,7.453306,0.820007
6,181710,NHN엔터테인먼트,2110.166667,78434,30.755864,0.827447
7,185750,종근당,3990.333333,40684,25.185866,2.470259
8,204210,모두투어리츠,85.166667,5335,40.802348,0.651359
9,207940,삼성바이오로직스,4644.166667,60099,89.790059,6.938551


- id : 종목코드 
- name : 회사이름
- eps : 주당순이익
- bps : 주당 순자산가치
- per : 주가 수익비율
- pbr : 주가 자산비율

In [87]:
# 데이터프레임 합치기 - 교집합 
# id를 기준으로 합침 
merge_inner = pd.merge(df1, df2)
# default 값은 교집합
merge_inner

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
0,130960,CJ E&M,58540.666667,98900,CJ E&M,6301.333333,54068,15.695091,1.829178
1,139480,이마트,239230.833333,254500,이마트,18268.166667,295780,13.931338,0.860437
2,145990,삼양사,82750.0,82000,삼양사,5741.0,108090,14.283226,0.758627
3,185750,종근당,40293.666667,100500,종근당,3990.333333,40684,25.185866,2.470259
4,204210,모두투어리츠,3093.333333,3475,모두투어리츠,85.166667,5335,40.802348,0.651359


In [97]:
# 데이터 프레임 합치기 - 합집합 
merger_outer = pd.merge(df1, df2 , how = 'outer')
merger_outer

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
0,128940,한미약품,59385.666667,421000.0,,,,,
1,130960,CJ E&M,58540.666667,98900.0,CJ E&M,6301.333333,54068.0,15.695091,1.829178
2,138250,엔에스쇼핑,14558.666667,13200.0,,,,,
3,139480,이마트,239230.833333,254500.0,이마트,18268.166667,295780.0,13.931338,0.860437
4,142280,녹십자엠에스,468.833333,10200.0,,,,,
5,145990,삼양사,82750.0,82000.0,삼양사,5741.0,108090.0,14.283226,0.758627
6,185750,종근당,40293.666667,100500.0,종근당,3990.333333,40684.0,25.185866,2.470259
7,192400,쿠쿠홀딩스,179204.666667,177500.0,,,,,
8,199800,툴젠,-2514.333333,115400.0,,,,,
9,204210,모두투어리츠,3093.333333,3475.0,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [102]:
# 데이터 프레임 합치기 - 합집합 - 왼쪽 데이터 프레임 기준, 기준값은 회사명 
merger_left = pd.merge(df1, df2 , how = 'left', left_on = 'stock_name',right_on='name')
merger_left

Unnamed: 0,id_x,stock_name,value,price,id_y,name,eps,bps,per,pbr
0,128940,한미약품,59385.666667,421000,,,,,,
1,130960,CJ E&M,58540.666667,98900,130960.0,CJ E&M,6301.333333,54068.0,15.695091,1.829178
2,138250,엔에스쇼핑,14558.666667,13200,,,,,,
3,139480,이마트,239230.833333,254500,139480.0,이마트,18268.166667,295780.0,13.931338,0.860437
4,142280,녹십자엠에스,468.833333,10200,,,,,,
5,145990,삼양사,82750.0,82000,145990.0,삼양사,5741.0,108090.0,14.283226,0.758627
6,185750,종근당,40293.666667,100500,185750.0,종근당,3990.333333,40684.0,25.185866,2.470259
7,192400,쿠쿠홀딩스,179204.666667,177500,,,,,,
8,199800,툴젠,-2514.333333,115400,,,,,,
9,204210,모두투어리츠,3093.333333,3475,204210.0,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [103]:
# 데이터 프레임 합치기 - 합집합 - 왼쪽 데이터 프레임 기준, 기준값은 회사명 
merger_right = pd.merge(df1, df2 , how = 'right', left_on = 'stock_name',right_on='name')
merger_right

Unnamed: 0,id_x,stock_name,value,price,id_y,name,eps,bps,per,pbr
0,130960.0,CJ E&M,58540.666667,98900.0,130960,CJ E&M,6301.333333,54068,15.695091,1.829178
1,,,,,136480,하림,274.166667,3551,11.489362,0.887074
2,,,,,138040,메리츠금융지주,2122.333333,14894,6.313806,0.899691
3,139480.0,이마트,239230.833333,254500.0,139480,이마트,18268.166667,295780,13.931338,0.860437
4,145990.0,삼양사,82750.0,82000.0,145990,삼양사,5741.0,108090,14.283226,0.758627
5,,,,,161390,한국타이어,5648.5,51341,7.453306,0.820007
6,,,,,181710,NHN엔터테인먼트,2110.166667,78434,30.755864,0.827447
7,185750.0,종근당,40293.666667,100500.0,185750,종근당,3990.333333,40684,25.185866,2.470259
8,204210.0,모두투어리츠,3093.333333,3475.0,204210,모두투어리츠,85.166667,5335,40.802348,0.651359
9,,,,,207940,삼성바이오로직스,4644.166667,60099,89.790059,6.938551


In [None]:
# 데이터 프레임 합치기 - 합집합 - 왼쪽 데이터 프레임 기준, 기준값은 회사명 
merger_left = pd.merge(df1, df2 , how = 'left', left_on = 'stock_name',right_on='name')
merger_left

Unnamed: 0,id_x,stock_name,value,price,id_y,name,eps,bps,per,pbr
0,128940,한미약품,59385.666667,421000,,,,,,
1,130960,CJ E&M,58540.666667,98900,130960.0,CJ E&M,6301.333333,54068.0,15.695091,1.829178
2,138250,엔에스쇼핑,14558.666667,13200,,,,,,
3,139480,이마트,239230.833333,254500,139480.0,이마트,18268.166667,295780.0,13.931338,0.860437
4,142280,녹십자엠에스,468.833333,10200,,,,,,
5,145990,삼양사,82750.0,82000,145990.0,삼양사,5741.0,108090.0,14.283226,0.758627
6,185750,종근당,40293.666667,100500,185750.0,종근당,3990.333333,40684.0,25.185866,2.470259
7,192400,쿠쿠홀딩스,179204.666667,177500,,,,,,
8,199800,툴젠,-2514.333333,115400,,,,,,
9,204210,모두투어리츠,3093.333333,3475,204210.0,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [106]:
# df1에서 price가 50000 미만 인 데이터 찾기 
price= df1[df1['price'] < 50000]
price

Unnamed: 0,id,stock_name,value,price
2,138250,엔에스쇼핑,14558.666667,13200
4,142280,녹십자엠에스,468.833333,10200
9,204210,모두투어리츠,3093.333333,3475


In [107]:
# df2에 없는 price 대신 id 기준 
value = pd.merge(price , df2)
value

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
0,204210,모두투어리츠,3093.333333,3475,모두투어리츠,85.166667,5335,40.802348,0.651359


In [108]:
value.drop("name",axis=1,inplace=True)
value

Unnamed: 0,id,stock_name,value,price,eps,bps,per,pbr
0,204210,모두투어리츠,3093.333333,3475,85.166667,5335,40.802348,0.651359


---
# 그룹 연산 
- 복잡한 데이터를 어떤 기준에 따라 여러 그루으로 나눠서 관찰하는 것도 좋은 방법이다.
- 특정 기준을 적용하여 몇개의 그룹으로 분할하여 처리하는 과정이 그룹 연산이다.
- 1단계 : 분할 (split) => 데이터를 특정 조건에 의해 분할
- 2단계 : 적용 (apply) => 데이터를 집계, 변환, 필터링
- 3단계 : 결합 (combine) => 2단계 처리 결과를 하나로 결합

### 그룹 객체 만들기 (분할단계)

In [110]:
df = titanic.loc[:,['age','sex','class','fare','survived']]
df.head()

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0


In [111]:
# class 열을 기준으로 분할
df['class'].unique()

['Third', 'First', 'Second']
Categories (3, object): ['First', 'Second', 'Third']

In [114]:
# class 기준 그룹화
grouped = df.groupby(['class'])
list(grouped)
# first 
list(grouped)[0]

('First',
       age     sex  class     fare  survived
 1    38.0  female  First  71.2833         1
 3    35.0  female  First  53.1000         1
 6    54.0    male  First  51.8625         0
 11   58.0  female  First  26.5500         1
 23   28.0    male  First  35.5000         1
 ..    ...     ...    ...      ...       ...
 871  47.0  female  First  52.5542         1
 872  33.0    male  First   5.0000         0
 879  56.0  female  First  83.1583         1
 887  19.0  female  First  30.0000         1
 889  26.0    male  First  30.0000         1
 
 [216 rows x 5 columns])

In [120]:
# 그룹객체를 iteration 으로 출력 : 처음부터 5줄만 출력
for key, group in grouped:
    print("* key : ", key)
    print("* number : ", len(group))
    print(group.head())
    print("=" * 60)

* key :  First
* number :  216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1
* key :  Second
* number :  184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1
* key :  Third
* number :  491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0


In [124]:
# key를 가져옴 
g3 = grouped.get_group("Third").head()

In [125]:
g3

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0


In [126]:
type(g3)

pandas.core.frame.DataFrame

In [123]:
# 연산 메소드 적용 
grouped.mean()
# class에 따라 sex object를 제외한 평균값 

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,38.233441,84.154687,0.62963
Second,29.87763,20.662183,0.472826
Third,25.14062,13.67555,0.242363


In [132]:
# class와 sex 기준으로 분할 
grouped_two = df.groupby(['class','sex'])
list(grouped_two)

[(('First', 'female'),
        age     sex  class      fare  survived
  1    38.0  female  First   71.2833         1
  3    35.0  female  First   53.1000         1
  11   58.0  female  First   26.5500         1
  31    NaN  female  First  146.5208         1
  52   49.0  female  First   76.7292         1
  ..    ...     ...    ...       ...       ...
  856  45.0  female  First  164.8667         1
  862  48.0  female  First   25.9292         1
  871  47.0  female  First   52.5542         1
  879  56.0  female  First   83.1583         1
  887  19.0  female  First   30.0000         1
  
  [94 rows x 5 columns]),
 (('First', 'male'),
        age   sex  class      fare  survived
  6    54.0  male  First   51.8625         0
  23   28.0  male  First   35.5000         1
  27   19.0  male  First  263.0000         0
  30   40.0  male  First   27.7208         0
  34   28.0  male  First   82.1708         0
  ..    ...   ...    ...       ...       ...
  839   NaN  male  First   29.7000         1
  8

In [136]:
for key, group in grouped_two:
    print("* key : ", key)
    print("* number : ", len(group))
    print(group.head())
    print("=" * 60)

* key :  ('First', 'female')
* number :  94
     age     sex  class      fare  survived
1   38.0  female  First   71.2833         1
3   35.0  female  First   53.1000         1
11  58.0  female  First   26.5500         1
31   NaN  female  First  146.5208         1
52  49.0  female  First   76.7292         1
* key :  ('First', 'male')
* number :  122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
30  40.0  male  First   27.7208         0
34  28.0  male  First   82.1708         0
* key :  ('Second', 'female')
* number :  76
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
41  27.0  female  Second  21.0000         0
43   3.0  female  Second  41.5792         1
53  29.0  female  Second  26.0000         1
* key :  ('Second', 'male')
* number :  108
     age   sex   class  fare  survived
17   

In [137]:
# 평균
grouped_two.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,female,34.611765,106.125798,0.968085
First,male,41.281386,67.226127,0.368852
Second,female,28.722973,21.970121,0.921053
Second,male,30.740707,19.741782,0.157407
Third,female,21.75,16.11881,0.5
Third,male,26.507589,12.661633,0.135447


In [154]:
# grouped_two에서 Third의 female만 group3f라는 dataframe 만들기
group3f = grouped_two.get_group(("Third","female"))
# tuple로 묶어서 key 값 그대로 가져오기 
group3f

Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.9250,1
8,27.0,female,Third,11.1333,1
10,4.0,female,Third,16.7000,1
14,14.0,female,Third,7.8542,0
18,31.0,female,Third,18.0000,0
...,...,...,...,...,...
863,,female,Third,69.5500,0
875,15.0,female,Third,7.2250,1
882,22.0,female,Third,10.5167,0
885,39.0,female,Third,29.1250,0
