In [2]:
import pandas as pd
import numpy as np
# Small example frame with missing values, built column by column:
# A has three NaNs, B has one, C is entirely NaN, and D has none.
df = pd.DataFrame({
    'A': [np.nan, 3, np.nan, np.nan],
    'B': [2, 4, np.nan, 3],
    'C': [np.nan, np.nan, np.nan, np.nan],
    'D': [0, 1, 5, 4],
})
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


# 결측치 확인하기
* df.isnull().sum() : 컬럼별 null의 개수 확인 가능
* df.info() : 전체 샘플의 개수와 컬럼별 null이 아닌 값의 개수 확인 가능

In [4]:
# isnull(): flag missing values, then sum() to count the NaNs in each column

df.isnull().sum()

A    3
B    1
C    4
D    0
dtype: int64

In [5]:
# info(): prints the index range plus each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       1 non-null      float64
 1   B       3 non-null      float64
 2   C       0 non-null      float64
 3   D       4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 260.0 bytes


# 결측치 삭제하기
* df.dropna() : 결측치가 존재하는 모든 행 삭제
* df.dropna(axis=1) : 결측치가 존재하는 모든 열 삭제

In [6]:
# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame and leaves df untouched; to keep the result,
# assign it (e.g. cleaned = df.dropna()) or pass inplace=True.
# Column C is entirely NaN here, so every row is dropped and the result is empty.
df.dropna()

Unnamed: 0,A,B,C,D


In [8]:
# Drop every column that contains at least one missing value (axis=1 -> columns).
# Only column D has no NaN, so it is the only column kept.
df.dropna(axis=1)

Unnamed: 0,D
0,0
1,1
2,5
3,4


# 결측치 대치하기
* df.fillna(특정값) : 특정값으로 채우기
* df.ffill() : 이전값으로 채우기 (구버전 표기: df.fillna(method='ffill'), pandas 2.1부터 deprecated)
* df.bfill() : 다음값으로 채우기 (구버전 표기: df.fillna(method='bfill'), pandas 2.1부터 deprecated)
* df.fillna({컬럼명:값,...}) : 컬럼별로 값을 지정하여 채우기

## 특정값으로 채우기

In [9]:
# Replace every NaN with 0 (returns a new frame; df itself is not modified)
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [13]:
# Show df again: it still contains NaN because the fillna(0) result above was not assigned back
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [11]:
# Fill NaNs with the column means (each column is filled with its own mean).
# Column C has no valid values, so its mean is NaN and the column stays unfilled.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,3.0,3.0,,5
3,3.0,3.0,,4


## 이전값으로 채우기

In [12]:
# Forward-fill: propagate the last valid value downwards within each column.
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated since pandas 2.1 and emits a FutureWarning.
# Values with nothing valid above them (row 0 of A, all of C) remain NaN.
df.ffill()

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,3.0,3.0,,4


## 다음값으로 채우기

In [14]:
# Backward-fill: propagate the next valid value upwards within each column.
# DataFrame.bfill() replaces fillna(method='bfill'); the `method` argument of
# fillna is deprecated since pandas 2.1 and emits a FutureWarning.
# Values with nothing valid below them (rows 2-3 of A, all of C) remain NaN.
df.bfill()

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,,3.0,,5
3,,3.0,,4


## 컬럼별로 값을 지정하여 채우기

In [16]:
# Show the unmodified frame before filling each column with its own value
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [17]:
# Per-column fill values: NaN in A -> 0, B -> 1, C -> 2 (D -> 3 is given, but D has no NaN)
df.fillna({'A':0,'B':1,'C':2,'D':3}) 

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5
3,0.0,3.0,2.0,4


# 결측치가 포함된 데이터의 통계값
* 결측치는 없는 데이터로 간주한다. 즉 mean() 등의 통계 함수는 결측치를 제외하고 계산한다.

In [18]:
# Show the frame whose column statistics are computed in the following cells
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [20]:
# mean() skips NaN: A has a single valid value (3.0), so the mean is 3.0
df['A'].mean()

3.0

In [22]:
# mean() skips NaN: B's valid values are 2, 4 and 3, so the mean is 9 / 3 = 3.0
df['B'].mean()

3.0

In [24]:
# Missing-value handling on the scores data.
# NOTE: this rebinds df, so the small example frame used above is no longer reachable.
# The CSV path is relative to the notebook's working directory.
df = pd.read_csv('data/scores.csv')
df.head()

Unnamed: 0,name,kor,eng,math
0,1,5.0,5.0,10.0
1,2,0.0,0.0,10.0
2,3,10.0,10.0,10.0
3,4,6.0,6.0,10.0
4,5,10.0,10.0,10.0


In [26]:
# Check missing values: number of NaNs in each column
df.isnull().sum()

name    0
kor     5
eng     7
math    5
dtype: int64

In [27]:
# Total row count plus each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   name    1048575 non-null  int64  
 1   kor     1048570 non-null  float64
 2   eng     1048568 non-null  float64
 3   math    1048570 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 32.0 MB


In [28]:
# Preview filling missing values with 0 (the result is only displayed; df is unchanged)
df.fillna(0)

Unnamed: 0,name,kor,eng,math
0,1,5.0,5.0,10.0
1,2,0.0,0.0,10.0
2,3,10.0,10.0,10.0
3,4,6.0,6.0,10.0
4,5,10.0,10.0,10.0
...,...,...,...,...
1048570,120800,10.0,10.0,0.0
1048571,120801,10.0,10.0,0.0
1048572,120802,1.0,1.0,0.0
1048573,120803,10.0,10.0,0.0


In [29]:
# Apply the fill to df itself: inplace=True mutates df and returns None, so this cell shows no output
df.fillna(0,inplace=True)

In [30]:
# Verify that no missing values remain in any column after the in-place fill
df.isnull().sum()

name    0
kor     0
eng     0
math    0
dtype: int64