## 그룹함수 및 피봇 테이블 이용 분석 예제
### 식당에서 식사 후 내는 팁(tip)과 관련된 데이터 이용
#### seaborn 패키지 내 tips 데이터셋 사용
##### -  total_bill: 식사대금
##### -  tip: 팁
##### -  sex: 성별
##### -  smoker: 흡연/금연 여부
##### -  day: 요일
##### -  time: 시간
##### -  size: 인원


In [8]:
# Display the value of every expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the "tips" example dataset through seaborn and preview its last rows.
tips = sns.load_dataset(name="tips")
tips.tail(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


### 식사 대금 대비 팁의 비율이 언제 가장 높아지는가?
##### 가공 필드 생성 : 식사대금 대비 팁의 비율
###### - tips_pt = 팁 / 식사대금


In [5]:
# Derived field: tip as a fraction of the total bill (tip / total_bill).
tips_pt = tips["tip"].div(tips["total_bill"])
tips["tips_pt"] = tips_pt
# Preview the first rows to confirm the new column is present.
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_pt
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [6]:
# Print the column dtypes, non-null counts and memory usage of the frame.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   tips_pt     244 non-null    float64 
dtypes: category(4), float64(3), int64(1)
memory usage: 9.2 KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
tips.describe()

Unnamed: 0,total_bill,tip,size,tips_pt
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,0.160803
std,8.902412,1.383638,0.9511,0.061072
min,3.07,1.0,1.0,0.035638
25%,13.3475,2.0,2.0,0.129127
50%,17.795,2.9,2.0,0.15477
75%,24.1275,3.5625,3.0,0.191475
max,50.81,10.0,6.0,0.710345


In [11]:
# Count how many rows fall under each value of `sex`.
tips.loc[:, "sex"].value_counts()

Male      157
Female     87
Name: sex, dtype: int64

In [12]:
# Per-sex count of non-null values in each of the remaining columns.
tips.groupby("sex").count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size,tips_pt
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Male,157,157,157,157,157,157,157
Female,87,87,87,87,87,87,87


In [18]:
# Number of rows for each (smoker, sex) combination.
tips.groupby(["smoker", "sex"]).size()

smoker  sex   
Yes     Male      60
        Female    33
No      Male      97
        Female    54
dtype: int64

In [23]:
# Row counts per sex and smoking status as a pivot table:
# rows are `sex`, columns are `smoker`, cells count the `tip` entries.
tips.pivot_table(values="tip", index="sex", columns="smoker", aggfunc="count")

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [26]:
# Mean tip ratio for each sex.
# Select the column before aggregating: mean() on the whole grouped frame
# also tries to average the categorical columns (smoker/day/time), which
# raises TypeError on pandas >= 2.0 where numeric_only defaults to False.
tips.groupby(by="sex")["tips_pt"].mean()
# Observation: women tip a slightly higher share of the bill on average.

sex
Male      0.157651
Female    0.166491
Name: tips_pt, dtype: float64

In [28]:
# Mean tip ratio grouped by smoking status.
tips.groupby("smoker")["tips_pt"].agg("mean")
# Observation: smokers show a slightly higher tip ratio than non-smokers.

smoker
Yes    0.163196
No     0.159328
Name: tips_pt, dtype: float64

In [36]:
# Mean tip ratio for each (sex, smoker) combination.
# groupby variant: the result is a Series with a two-level index.
tips.groupby(["sex", "smoker"])["tips_pt"].agg("mean")

sex     smoker
Male    Yes       0.152771
        No        0.160669
Female  Yes       0.182150
        No        0.156921
Name: tips_pt, dtype: float64

In [37]:
# Mean tip ratio for each (sex, smoker) combination.
# pivot_table variant with both keys on the row index.
tips.pivot_table(values="tips_pt", index=["sex", "smoker"], aggfunc="mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,tips_pt
sex,smoker,Unnamed: 2_level_1
Male,Yes,0.152771
Male,No,0.160669
Female,Yes,0.18215
Female,No,0.156921


In [39]:
# Mean tip ratio by sex and smoking status.
# pivot_table variant in wide form: rows are `sex`, columns are `smoker`.
tips_piv = tips.pivot_table(
    values="tips_pt",
    index=["sex"],
    columns="smoker",
    aggfunc="mean",
)
tips_piv

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,0.152771,0.160669
Female,0.18215,0.156921


In [48]:
# Distinct values that occur in the `day` column.
tips["day"].unique()

# Rows per day, counted via the non-null `total_bill` entries.
tips.groupby("day")["total_bill"].count()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Sun', 'Sat', 'Thur', 'Fri']

day
Thur    62
Fri     19
Sat     87
Sun     76
Name: total_bill, dtype: int64

In [54]:
# Share of rows for each (day, sex) combination, in percent (groupby variant).
# NOTE(review): the denominator is the size of the whole table, so the values
# are shares of all rows rather than shares within each day - confirm intent.
tips.groupby(["day", "sex"]).size().div(len(tips)).mul(100)

day   sex   
Thur  Male      12.295082
      Female    13.114754
Fri   Male       4.098361
      Female     3.688525
Sat   Male      24.180328
      Female    11.475410
Sun   Male      23.770492
      Female     7.377049
dtype: float64

In [58]:
# Mean party size per table for each day.
# Select `size` before aggregating: mean() on the whole grouped frame also
# tries to average the categorical columns, which raises TypeError on
# pandas >= 2.0 where numeric_only defaults to False.
tips.groupby(by="day")["size"].mean()

day
Thur    2.451613
Fri     2.105263
Sat     2.517241
Sun     2.842105
Name: size, dtype: float64

In [61]:
# Mean tip ratio for each day.
tips.groupby("day")["tips_pt"].agg("mean")

day
Thur    0.161276
Fri     0.169913
Sat     0.153152
Sun     0.166897
Name: tips_pt, dtype: float64

In [62]:
# Mean total bill and mean tip for each day.
# Select the two numeric columns before aggregating: mean() on the whole
# grouped frame also tries to average the categorical columns, which raises
# TypeError on pandas >= 2.0 where numeric_only defaults to False.
tips.groupby(by="day")[["total_bill", "tip"]].mean()

Unnamed: 0_level_0,total_bill,tip
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,17.682742,2.771452
Fri,17.151579,2.734737
Sat,20.441379,2.993103
Sun,21.41,3.255132
