# 부분집합을 이용해 변수 간 논리적 불일치 찾기
- 데이터 : 미국 종단 조사(NLS), 미국 노동통계국이 주관해 고등학생을 대상으로 벌인 종단 조사(1997년)
- 연도별 근무 주 수(2000~2017), 대학 등록 여부(1997.02 ~ 2017.10, 매년 2월·10월 시점) 기록

In [3]:
import pandas as pd
import numpy as np

# Load the NLS97 extract and index it by respondent id in a single chained step.
nls97 = pd.read_csv('data/nls97.csv').set_index('personid')

In [5]:
# Preview the first rows of the frame (head() defaults to five rows).
nls97.head()

Unnamed: 0_level_0,gender,birthmonth,birthyear,highestgradecompleted,maritalstatus,childathome,childnotathome,wageincome,weeklyhrscomputer,weeklyhrstv,...,colenrfeb13,colenroct13,colenrfeb14,colenroct14,colenrfeb15,colenroct15,colenrfeb16,colenroct16,colenrfeb17,colenroct17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100061,Female,5,1980,13.0,Married,4.0,0.0,12500.0,10 hours or more a week,11 to 20 hours a week,...,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled
100139,Male,9,1983,12.0,Married,2.0,0.0,120000.0,1 to 3 hours a week,3 to 10 hours a week,...,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled
100284,Male,11,1984,7.0,Never-married,1.0,0.0,58000.0,,11 to 20 hours a week,...,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled
100292,Male,4,1982,,,,,,,,...,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,,,,
100583,Male,1,1980,13.0,Married,4.0,0.0,30000.0,Less than 1 hour a week,3 to 10 hours a week,...,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled


In [9]:
# First three respondents, transposed so that each variable becomes a row.
nls97.loc[:, ['wageincome', 'highestgradecompleted', 'highestdegree']].head(3).transpose()

personid,100061,100139,100284
wageincome,12500.0,120000.0,58000.0
highestgradecompleted,13.0,12.0,7.0
highestdegree,2. High School,2. High School,0. None


In [10]:
# Weeks worked for 2012-2017 (label-based column slice, both ends inclusive)
# for the first three respondents, one variable per row.
nls97.head(3).loc[:, 'weeksworked12':'weeksworked17'].transpose()

personid,100061,100139,100284
weeksworked12,40.0,52.0,0.0
weeksworked13,52.0,52.0,
weeksworked14,52.0,52.0,11.0
weeksworked15,52.0,52.0,52.0
weeksworked16,48.0,53.0,47.0
weeksworked17,48.0,52.0,0.0


In [11]:
# College-enrollment status from October 2009 through February 2014
# for the first three respondents, one variable per row.
nls97.head(3).loc[:, 'colenroct09':'colenrfeb14'].transpose()

personid,100061,100139,100284
colenroct09,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb10,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct10,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb11,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct11,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb12,3. 4-year college,1. Not enrolled,1. Not enrolled
colenroct12,3. 4-year college,1. Not enrolled,1. Not enrolled
colenrfeb13,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct13,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb14,1. Not enrolled,1. Not enrolled,1. Not enrolled


#### (1) 임금 소득이 있지만 2016년 근무 주 수가 0인 사람

In [12]:
# Respondents reporting zero weeks worked in 2016 yet a positive wage income.
noweeks16 = nls97['weeksworked16'].eq(0)
haswage = nls97['wageincome'].gt(0)
nls97.loc[noweeks16 & haswage, ['weeksworked16', 'wageincome']]

Unnamed: 0_level_0,weeksworked16,wageincome
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
102625,0.0,1200.0
109403,0.0,5000.0
118704,0.0,25000.0
130701,0.0,12000.0
131151,0.0,65000.0
...,...,...
957344,0.0,90000.0
966697,0.0,65000.0
969334,0.0,5000.0
991756,0.0,9000.0


#### (2) 4년제 대학에 등록한 적이 있는 사람

In [24]:
# Keep the columns whose names contain "colenr", take the first character of
# every value ('3' marks "3. 4-year college"), then reduce across columns with
# any(axis=1) to get one True/False per person: ever enrolled in a 4-year college.
colenrcodes = nls97.filter(like='colenr').apply(lambda col: col.str[0:1])
colenrcodes.eq('3').any(axis=1)

personid
100061     True
100139    False
100284    False
100292     True
100583     True
          ...  
999291     True
999406     True
999543    False
999698    False
999963     True
Length: 8984, dtype: bool

#### (3) 대학원에 등록했지만 학사 과정 등록 데이터는 없는 사람
- 등록 상태 값의 첫 글자가 4인 기록은 있지만 3(4년제 대학)인 기록은 없는 개인

In [27]:
# Enrollment columns reduced to their leading status code (first character).
colenrcodes = nls97.filter(like='colenr').apply(lambda col: col.str[0:1])
# Code '4' is graduate enrollment according to this section's heading;
# code '3' is "3. 4-year college".
hascode4 = colenrcodes.eq('4').any(axis=1)
hascode3 = colenrcodes.eq('3').any(axis=1)
# People with at least one code-4 record but no code-3 record, restricted to
# the enrollment columns from February 1997 through October 2017.
nobach = nls97.loc[hascode4 & ~hascode3, 'colenrfeb97':'colenroct17']

In [29]:
# Number of respondents selected into nobach (row count).
nobach.shape[0]

22

In [30]:
# Enrollment history of the first three such respondents, one period per row.
nobach.head(3).transpose()

personid,153051,154535,184721
colenrfeb97,,,
colenroct97,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct00,2. 2-year college,1. Not enrolled,1. Not enrolled
colenrfeb01,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct01,2. 2-year college,1. Not enrolled,1. Not enrolled


#### (4) 학사 학위 이상을 가졌지만 4년제 대학에 등록한 기록이 없는 사람

In [35]:
# Frequency of each highest-degree category, without sorting by count.
nls97['highestdegree'].value_counts(sort=False)

2. High School     3667
0. None             953
4. Bachelors       1673
3. Associates       737
5. Masters          603
1. GED             1146
7. Professional     120
6. PhD               54
Name: highestdegree, dtype: int64

In [41]:
# Highest degree is Bachelors or above (leading codes 4-7 in highestdegree).
bachplus = nls97['highestdegree'].str[0:1].isin(['4', '5', '6', '7'])
# No enrollment column ever carries the leading code '3' ("3. 4-year college").
colenrcodes = nls97.filter(like='colenr').apply(lambda col: col.str[0:1])
never4year = ~colenrcodes.eq('3').any(axis=1)
# Keep only the enrollment columns (February 1997 - October 2017) for those people.
no4yearsenrollment = nls97.loc[bachplus & never4year, 'colenrfeb97':'colenroct17']

In [42]:
# Number of respondents with a bachelor's-or-higher degree but no 4-year enrollment record.
no4yearsenrollment.shape[0]

39

In [43]:
# Enrollment history of the first three such respondents, one period per row.
no4yearsenrollment.head(3).transpose()

personid,113486,118749,124616
colenrfeb97,1. Not enrolled,,1. Not enrolled
colenroct97,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct98,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct99,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct00,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenrfeb01,1. Not enrolled,1. Not enrolled,1. Not enrolled
colenroct01,2. 2-year college,1. Not enrolled,1. Not enrolled


#### (5) 임금소득이 높은 사람 (평균 + 3 표준편차 초과)

In [48]:
# Threshold for "high" wages: more than three standard deviations above the mean.
wagecutoff = nls97['wageincome'].mean() + 3 * nls97['wageincome'].std()
highwages = nls97.loc[nls97['wageincome'] > wagecutoff, ['wageincome']]

In [49]:
# Display the respondents whose wage income exceeds the mean + 3 std cutoff.
highwages

Unnamed: 0_level_0,wageincome
personid,Unnamed: 1_level_1
131858,235884.0
133619,235884.0
151863,235884.0
164058,235884.0
164897,235884.0
...,...
964406,235884.0
966024,235884.0
976141,235884.0
983819,235884.0


#### (6) 최근 연도에 근무 주 수의 변동이 큰 개인
- 1. 개인별로 2012년에서 2016년까지 근무주 수의 평균값을 계산
- 2. 개인마다 열 평균 구해야하므로 axis = 1
- 3. 평균이 2017 근무 주 수의 50% 미만이거나 두 배를 초과하는지 확인
- 4. 2017 근무 주 수가 null인 행은 결과에서 제외

In [54]:
# Per-person mean of weeks worked over 2012-2016 (row-wise, hence axis=1).
meanweeks1216 = nls97.loc[:, 'weeksworked12':'weeksworked16'].mean(axis=1)
weeks17 = nls97['weeksworked17']
# "Stable" means the 2012-2016 mean lies within [50%, 200%] of the 2017 value.
stable = meanweeks1216.between(weeks17 * 0.5, weeks17 * 2)
# Keep unstable respondents whose 2017 value is present.
# NOTE(review): a respondent whose 2012-2016 mean is NaN fails between() and is
# therefore kept here as well - confirm that this is intended.
workchanges = nls97.loc[~stable & weeks17.notnull(), 'weeksworked12':'weeksworked17']

In [56]:
# Number of respondents flagged with a large change in weeks worked.
workchanges.shape[0]

1160

In [57]:
# Weeks worked 2012-2017 for the first seven flagged respondents, one year per row.
workchanges.head(7).transpose()

personid,100284,101526,101718,101724,102228,102454,102625
weeksworked12,0.0,0.0,52.0,52.0,52.0,52.0,14.0
weeksworked13,,0.0,9.0,52.0,52.0,52.0,3.0
weeksworked14,11.0,0.0,0.0,52.0,17.0,7.0,52.0
weeksworked15,52.0,0.0,32.0,17.0,0.0,0.0,44.0
weeksworked16,47.0,0.0,0.0,0.0,0.0,0.0,0.0
weeksworked17,0.0,45.0,0.0,17.0,0.0,0.0,0.0


#### (7) 이수 학년과 최종 학위 간 불일치
- 미국에서 12학년을 이수하지 못한 학생이 고등학교 졸업하는 것은 흔치 않음
- 이수 마지막 학년이 12미만인 사람의 최종학력을 확인해야함

In [59]:
# Respondents whose highest completed grade is below 12, together with the
# highest degree they report.
belowgrade12 = nls97['highestgradecompleted'].lt(12)
ltgrade12 = nls97.loc[belowgrade12, ['highestgradecompleted', 'highestdegree']]
ltgrade12.head()

Unnamed: 0_level_0,highestgradecompleted,highestdegree
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100284,7.0,0. None
101132,11.0,0. None
101230,11.0,0. None
101718,9.0,1. GED
101997,8.0,2. High School


In [60]:
# Cross-tabulate highest completed grade (rows) against highest degree (columns)
# for the below-grade-12 subset.
pd.crosstab(index=ltgrade12['highestgradecompleted'], columns=ltgrade12['highestdegree'])

highestdegree,0. None,1. GED,2. High School
highestgradecompleted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5.0,0,0,1
6.0,11,5,0
7.0,24,6,1
8.0,113,78,7
9.0,112,169,8
10.0,111,204,13
11.0,120,200,41


#### >> 여기까지 NLS 데이터에 여러가지 논리적 비일관성이 있음을 확인