# 누락 데이터 식별과 정제
---
- 데이터 : NLS, 주요 인구통계 열과 학교 성적 열의 누락값
- 누락값 채우기 전략 : 열 전체 평균 / 그룹 평균 / 가장 가까운 비누락값 할당

In [1]:
import pandas as pd
nls97 = pd.read_csv('data/nls97c.csv')
nls97.set_index('personid', inplace=True)

In [3]:
nls97.columns

Index(['gender', 'birthmonth', 'birthyear', 'highestgradecompleted',
       'maritalstatus', 'childathome', 'childnotathome', 'wageincome',
       'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep', 'satverbal',
       'satmath', 'gpaoverall', 'gpaenglish', 'gpamath', 'gpascience',
       'highestdegree', 'govprovidejobs', 'govpricecontrols', 'govhealthcare',
       'govelderliving', 'govindhelp', 'govunemp', 'govincomediff',
       'govcollegefinance', 'govdecenthousing', 'govprotectenvironment',
       'weeksworked00', 'weeksworked01', 'weeksworked02', 'weeksworked03',
       'weeksworked04', 'weeksworked05', 'weeksworked06', 'weeksworked07',
       'weeksworked08', 'weeksworked09', 'weeksworked10', 'weeksworked11',
       'weeksworked12', 'weeksworked13', 'weeksworked14', 'weeksworked15',
       'weeksworked16', 'weeksworked17', 'colenrfeb97', 'colenroct97',
       'colenrfeb98', 'colenroct98', 'colenrfeb99', 'colenroct99',
       'colenrfeb00', 'colenroct00', 'colenrfeb01', 'col

In [6]:
# 학교 레코드와 인구통계 데이터프레임 추출
schoolrecordlist = ['satverbal', 'satmath', 'gpaoverall', 'gpaenglish', 'gpamath',
                   'gpascience', 'highestdegree', 'highestgradecompleted']

In [7]:
demolist = ['maritalstatus', 'childathome', 'childnotathome', 'wageincome', 'weeklyhrscomputer',
           'weeklyhrstv', 'nightlyhrssleep']

In [9]:
schoolrecord = nls97[schoolrecordlist]
demo = nls97[demolist]

In [10]:
schoolrecord.shape

(8984, 8)

In [11]:
demo.shape

(8984, 7)

### 누락값 확인

In [12]:
schoolrecord.isnull().sum(axis=0)

satverbal                7578
satmath                  7577
gpaoverall               2980
gpaenglish               3186
gpamath                  3218
gpascience               3300
highestdegree              31
highestgradecompleted    2321
dtype: int64

In [15]:
misscnt = schoolrecord.isnull().sum(axis=1)
misscnt

personid
100061    2
100139    6
100284    6
100292    3
100583    2
         ..
999291    0
999406    0
999543    6
999698    6
999963    0
Length: 8984, dtype: int64

In [17]:
misscnt.value_counts().sort_index()

0    1087
1     312
2    3210
3    1102
4     176
5     101
6    2039
7     946
8      11
dtype: int64

In [19]:
schoolrecord.loc[misscnt >= 7].head(4).T

personid,101705,102061,102648,104627
satverbal,,,,
satmath,,,,
gpaoverall,,,,
gpaenglish,,,,
gpamath,,,,
gpascience,,,,
highestdegree,1. GED,0. None,1. GED,0. None
highestgradecompleted,,,,


### 데이터가 거의 없는 행 제거
- dropna 메서드, thresh=2는 비누락값이 2개 미만인 행(누락값이 7~8개인 행)이 삭제됨

In [21]:
schoolrecord = schoolrecord.dropna(thresh=2)

In [22]:
schoolrecord.shape

(8027, 8)

In [23]:
schoolrecord.isnull().sum(axis=1).value_counts().sort_index()

0    1087
1     312
2    3210
3    1102
4     176
5     101
6    2039
dtype: int64

### GPA 누락값을 평균으로 채우기

In [24]:
int(schoolrecord.gpaoverall.mean())

2

In [25]:
schoolrecord.gpaoverall.isnull().sum()

2023

In [26]:
schoolrecord.gpaoverall.fillna(int(schoolrecord.gpaoverall.mean()), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schoolrecord.gpaoverall.fillna(int(schoolrecord.gpaoverall.mean()), inplace=True)


In [27]:
schoolrecord.gpaoverall.isnull().sum()

0

### 정방향 채우기로 누락값을 대체
- fillna의 ffill옵션 : 누락값을 이전에 나온 가장 가까운 비누락값으로 대체
- 누락값이 거의 없고, 데이터에 무작위로 분포할 때도 정방향 채우기가 적합할 수 있음

In [28]:
demo.wageincome.head().T

personid
100061     12500.0
100139    120000.0
100284     58000.0
100292         NaN
100583     30000.0
Name: wageincome, dtype: float64

In [29]:
demo.wageincome.isnull().sum()

3893

In [31]:
nls97.wageincome.fillna(method='ffill', inplace=True)

In [32]:
demo = nls97[demolist]

In [33]:
demo.wageincome.head().T

personid
100061     12500.0
100139    120000.0
100284     58000.0
100292     58000.0
100583     30000.0
Name: wageincome, dtype: float64

In [34]:
demo.wageincome.isnull().sum()

0

### 누락값을 그룹별 평균으로 채우기
- 학력별 2017년 근무 주 수 평균값을 포함하는 데이터프레임 생성 ->> NLS 데이터와 병합
- 근무 주 수의 누락값을 fillna를 사용해 학력별 평균으로 대체

In [36]:
nls97[['highestdegree', 'weeksworked17']].head()

Unnamed: 0_level_0,highestdegree,weeksworked17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,2. High School,48.0
100139,2. High School,52.0
100284,0. None,0.0
100292,4. Bachelors,
100583,2. High School,52.0


In [39]:
workbydegree = nls97.groupby(['highestdegree'])['weeksworked17'].mean().reset_index().rename(columns={'weeksworked17':'meanweeksworked17'})

In [41]:
nls97 = nls97.reset_index().merge(workbydegree, on = ['highestdegree'], how='left').set_index('personid')

In [43]:
nls97.weeksworked17.fillna(nls97.meanweeksworked17, inplace = True)

In [44]:
nls97[['highestdegree', 'weeksworked17', 'meanweeksworked17']].head()

Unnamed: 0_level_0,highestdegree,weeksworked17,meanweeksworked17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,2. High School,48.0,38.150469
100139,2. High School,52.0,38.150469
100284,0. None,0.0,28.719608
100292,4. Bachelors,43.565574,43.565574
100583,2. High School,52.0,38.150469
