# (4)집계 함수를 groupby와 함께 사용하기

In [2]:
import pandas as pd
# Load the NLS97 CSV and use the `personid` column as the row index.
nls97 = pd.read_csv('data/nls97b.csv').set_index('personid')

In [4]:
# Review the data structure: dtypes and non-null counts of the first seven columns.
nls97.iloc[:, :7].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8984 non-null   object 
 1   birthmonth             8984 non-null   int64  
 2   birthyear              8984 non-null   int64  
 3   highestgradecompleted  6663 non-null   float64
 4   maritalstatus          6672 non-null   object 
 5   childathome            4791 non-null   float64
 6   childnotathome         4791 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 561.5+ KB


In [6]:
# Review the categorical columns: print each column's value counts,
# ordered by category label rather than by frequency.
catvars = ['gender', 'maritalstatus', 'highestdegree']

for col in catvars:
    counts = nls97[col].value_counts().sort_index()
    print(col, counts, sep='\n\n', end='\n\n\n')

gender

Female    4385
Male      4599
Name: gender, dtype: int64


maritalstatus

Divorced          663
Married          3066
Never-married    2766
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64


highestdegree

0. None             953
1. GED             1146
2. High School     3667
3. Associates       737
4. Bachelors       1673
5. Masters          603
6. PhD               54
7. Professional     120
Name: highestdegree, dtype: int64




In [7]:
# Review descriptive statistics for the continuous columns.
contvars = [
    'satmath',
    'satverbal',
    'weeksworked06',
    'gpaoverall',
    'childathome',
]

nls97.loc[:, contvars].describe()

Unnamed: 0,satmath,satverbal,weeksworked06,gpaoverall,childathome
count,1407.0,1406.0,8340.0,6004.0,4791.0
mean,500.590618,499.72404,38.429976,2.818408,1.85932
std,114.953309,112.166256,18.921281,0.616357,1.259053
min,7.0,14.0,0.0,0.1,0.0
25%,430.0,430.0,27.0,2.43,1.0
50%,500.0,500.0,51.0,2.86,2.0
75%,580.0,570.0,52.0,3.26,3.0
max,800.0,800.0,52.0,4.17,9.0


### 성별에 따른 SAT 수학 점수 확인

In [8]:
# Mean of `satmath` within each `gender` group.
(
    nls97
    .groupby('gender')['satmath']
    .mean()
)

gender
Female    486.647757
Male      516.875193
Name: satmath, dtype: float64

### 성별과 학력에 따른 SAT 수학 점수 확인

In [9]:
# Mean of `satmath` for every (gender, highestdegree) combination.
(
    nls97
    .groupby(['gender', 'highestdegree'])['satmath']
    .mean()
)

gender  highestdegree  
Female  0. None            332.600000
        1. GED             405.000000
        2. High School     430.769231
        3. Associates      458.032787
        4. Bachelors       501.945513
        5. Masters         508.271523
        6. PhD             575.454545
        7. Professional    599.411765
Male    0. None            540.000000
        1. GED             320.000000
        2. High School     467.740586
        3. Associates      481.111111
        4. Bachelors       542.163793
        5. Masters         574.444444
        6. PhD             621.428571
        7. Professional    587.727273
Name: satmath, dtype: float64

### 성별 및 학력별 SAT 수학과 언어 점수 확인

In [10]:
# Mean of `satmath` and `satverbal` for every (gender, highestdegree) combination.
(
    nls97
    .groupby(['gender', 'highestdegree'])[['satmath', 'satverbal']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,satmath,satverbal
gender,highestdegree,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,0. None,332.6,408.8
Female,1. GED,405.0,390.0
Female,2. High School,430.769231,444.314917
Female,3. Associates,458.032787,466.229508
Female,4. Bachelors,501.945513,506.294872
Female,5. Masters,508.271523,533.927152
Female,6. PhD,575.454545,558.181818
Female,7. Professional,599.411765,587.058824
Male,0. None,540.0,483.333333
Male,1. GED,320.0,360.0


### 건수, 평균, 최댓값, 표준편차 열 추가
- agg 함수를 사용해 요약 통계 반환

In [12]:
# Count, mean, maximum and standard deviation of `gpaoverall`
# for every (gender, highestdegree) combination, one column per statistic.
(
    nls97
    .groupby(['gender', 'highestdegree'])['gpaoverall']
    .agg(['count', 'mean', 'max', 'std'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max,std
gender,highestdegree,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0. None,148,2.456419,4.0,0.669446
Female,1. GED,227,2.310132,3.91,0.65809
Female,2. High School,1212,2.77486,4.17,0.530336
Female,3. Associates,290,2.943483,4.0,0.495803
Female,4. Bachelors,734,3.238529,4.07,0.466571
Female,5. Masters,312,3.296186,4.08,0.432133
Female,6. PhD,22,3.46,4.0,0.451885
Female,7. Professional,53,3.537736,4.11,0.413548
Male,0. None,193,2.218342,4.0,0.643481
Male,1. GED,345,2.242145,4.0,0.631721


### 딕셔너리를 사용해 복잡한 집계 수행

In [14]:
# Change how pandas renders floating-point numbers when displaying
# DataFrames and similar objects: one digit after the decimal point,
# with a comma as the thousands separator.
pd.set_option('display.float_format', '{:,.1f}'.format)

In [15]:
# Map each column to the list of summary statistics to compute for it.
# The list literal is evaluated once per column, so each key owns its own list.
aggdict = {
    column: ['count', 'mean', 'max', 'std']
    for column in ('weeksworked06', 'childathome')
}

In [16]:
# Apply the per-column statistics in `aggdict` within each `highestdegree` group.
(
    nls97
    .groupby(['highestdegree'])
    .agg(aggdict)
)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,max,std,count,mean,max,std
highestdegree,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0. None,703,29.7,52.0,21.6,439,1.8,8.0,1.6
1. GED,1104,33.2,52.0,20.6,693,1.7,9.0,1.5
2. High School,3368,39.4,52.0,18.6,1961,1.9,7.0,1.3
3. Associates,722,40.7,52.0,17.7,428,2.0,6.0,1.1
4. Bachelors,1642,42.2,52.0,16.1,827,1.9,8.0,1.0
5. Masters,601,42.2,52.0,16.1,333,1.9,5.0,0.9
6. PhD,53,38.2,52.0,18.6,32,2.1,6.0,1.1
7. Professional,117,27.1,52.0,20.4,57,1.8,4.0,0.8


In [17]:
# Apply the per-column statistics in `aggdict` within each `maritalstatus` group.
(
    nls97
    .groupby(['maritalstatus'])
    .agg(aggdict)
)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,max,std,count,mean,max,std
maritalstatus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Divorced,660,37.5,52.0,19.1,524,1.5,5.0,1.2
Married,3033,40.3,52.0,17.9,2563,2.1,8.0,1.1
Never-married,2734,37.2,52.0,19.1,1502,1.6,9.0,1.3
Separated,153,33.8,52.0,20.2,137,1.5,8.0,1.4
Widowed,23,37.1,52.0,19.3,18,1.8,5.0,1.4
