# DataFrame 정제
: NaN으로 빠진 값(결측치, missing value)이나 정상적이지 않은 값(이상치)의 정제

In [1]:
import numpy  as np
import pandas as pd

In [3]:
# Build a 6-row by 4-column DataFrame filled with uniform random numbers from NumPy
df = pd.DataFrame(data=np.random.rand(6, 4))
df

Unnamed: 0,0,1,2,3
0,0.678939,0.977138,0.881249,0.293891
1,0.163931,0.470898,0.80613,0.097861
2,0.16843,0.019579,0.010619,0.068347
3,0.721077,0.529059,0.117972,0.207069
4,0.905967,0.779815,0.409105,0.271275
5,0.066464,0.073004,0.530041,0.276184


In [6]:
# Label the columns A-D and index the rows by six consecutive days
# (date_range takes the start date and the number of periods to generate)
df.columns = list("ABCD")
df.index = pd.date_range(start="20220701", periods=6)
df

Unnamed: 0,A,B,C,D
2022-07-01,0.678939,0.977138,0.881249,0.293891
2022-07-02,0.163931,0.470898,0.80613,0.097861
2022-07-03,0.16843,0.019579,0.010619,0.068347
2022-07-04,0.721077,0.529059,0.117972,0.207069
2022-07-05,0.905967,0.779815,0.409105,0.271275
2022-07-06,0.066464,0.073004,0.530041,0.276184


In [7]:
# index type 확인 
df.index
# datetime 
# string이 아님 

DatetimeIndex(['2022-07-01', '2022-07-02', '2022-07-03', '2022-07-04',
               '2022-07-05', '2022-07-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Create a new column F and fill in its values; np.nan marks the missing entries
df['F'] =[1.0,np.nan,3.5,6.1,np.nan,7.0]
df

Unnamed: 0,A,B,C,D,F
2022-07-01,0.678939,0.977138,0.881249,0.293891,1.0
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [11]:
# Drop every row that contains at least one NaN
df.dropna(how='any')
# how='any' is the default, so leaving the parentheses empty gives the same result
# The non-NaN values in a dropped row are removed along with it

Unnamed: 0,A,B,C,D,F
2022-07-01,0.678939,0.977138,0.881249,0.293891,1.0
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [13]:
# Drop a row only when every column in that row is NaN
# (no row here is entirely NaN, so all six rows remain in the result)
df.dropna(how='all')
# The original note recommends this option when there is a lot of data

Unnamed: 0,A,B,C,D,F
2022-07-01,0.678939,0.977138,0.881249,0.293891,1.0
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [16]:
# NaN을 특정 값으로 대체(평균,중앙값..)
df.fillna(value=5.0)

Unnamed: 0,A,B,C,D,F
2022-07-01,0.678939,0.977138,0.881249,0.293891,1.0
2022-07-02,0.163931,0.470898,0.80613,0.097861,5.0
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,5.0
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [17]:
# Boolean Index을 통해 검색 및 변경
df.isnull()

Unnamed: 0,A,B,C,D,F
2022-07-01,False,False,False,False,False
2022-07-02,False,False,False,False,True
2022-07-03,False,False,False,False,False
2022-07-04,False,False,False,False,False
2022-07-05,False,False,False,False,True
2022-07-06,False,False,False,False,False


In [22]:
# F열에서 NaN을 포함하고 있는 행 찾기
df.loc[df.F.isnull(),:]

Unnamed: 0,A,B,C,D,F
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-05,0.905967,0.779815,0.409105,0.271275,


In [26]:
df.loc[df.isnull()['F'],:] #2

Unnamed: 0,A,B,C,D,F
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-05,0.905967,0.779815,0.409105,0.271275,


In [25]:
# index 이용 행제거 
df.drop("2022-07-01")

Unnamed: 0,A,B,C,D,F
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [27]:
df.drop(pd.to_datetime('20220701'))

Unnamed: 0,A,B,C,D,F
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [28]:
df

Unnamed: 0,A,B,C,D,F
2022-07-01,0.678939,0.977138,0.881249,0.293891,1.0
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [31]:
# Remove two or more dates at once by passing a list of index labels
df.drop([pd.to_datetime("20220701"),pd.to_datetime("20220702")])

Unnamed: 0,A,B,C,D,F
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [32]:
# 열 삭제 
df.drop("F",axis=1) # 1방향 column
#axis='columns' 도 가능

Unnamed: 0,A,B,C,D
2022-07-01,0.678939,0.977138,0.881249,0.293891
2022-07-02,0.163931,0.470898,0.80613,0.097861
2022-07-03,0.16843,0.019579,0.010619,0.068347
2022-07-04,0.721077,0.529059,0.117972,0.207069
2022-07-05,0.905967,0.779815,0.409105,0.271275
2022-07-06,0.066464,0.073004,0.530041,0.276184


In [34]:
# 두개 이상의 열 삭제 
df.drop(['B','F'],axis=1)

Unnamed: 0,A,C,D
2022-07-01,0.678939,0.881249,0.293891
2022-07-02,0.163931,0.80613,0.097861
2022-07-03,0.16843,0.010619,0.068347
2022-07-04,0.721077,0.117972,0.207069
2022-07-05,0.905967,0.409105,0.271275
2022-07-06,0.066464,0.530041,0.276184


In [35]:
df

Unnamed: 0,A,B,C,D,F
2022-07-01,0.678939,0.977138,0.881249,0.293891,1.0
2022-07-02,0.163931,0.470898,0.80613,0.097861,
2022-07-03,0.16843,0.019579,0.010619,0.068347,3.5
2022-07-04,0.721077,0.529059,0.117972,0.207069,6.1
2022-07-05,0.905967,0.779815,0.409105,0.271275,
2022-07-06,0.066464,0.073004,0.530041,0.276184,7.0


In [55]:
# 새로운 데이터 프레임 만들기
data = [
    [1.4,np.nan],
    [7.1,-4.5],
    [np.nan,np.nan],
    [0.75,-1.3]
]


In [56]:
df = pd.DataFrame(data=data,index=['a','b','c','d'],columns=['one','two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [180]:
# Approach 2: build the same frame from a dict that maps column name -> values
df = pd.DataFrame(
    data={
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)

df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [57]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [58]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [64]:
df.loc['b'].sum()

2.5999999999999996

In [62]:
# Read the cell at row position 1, column position 1 in a single positional lookup.
# A chained df.iloc[1][1] would index the intermediate row Series by integer position,
# which pandas deprecates when the Series index holds labels ('one', 'two').
df.iloc[1, 1]

-4.5

In [65]:
df.iloc[1].sum()

2.5999999999999996

In [66]:
# 열의 방향
df['one'].sum()

9.25

In [67]:
# axis=0 aggregates down the rows, producing one mean per column
df.mean(axis=0)

one    3.083333
two   -2.900000
dtype: float64

In [68]:
# axis=1 aggregates across the columns, producing one mean per row
# NaN is skipped: a row with a single NaN is averaged over its remaining value,
# while a row that is entirely NaN (row c) yields NaN
df.mean(axis=1)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [69]:
# With skipna=False, any row containing even one NaN produces NaN
df.mean(axis=1,skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

:one 열의 NaN은 남은 값들의 평균으로 대체, two 열의 NaN은 가장 작은값으로 대체 

In [70]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [78]:
round(df.one.mean(axis=0),1)

3.1

In [76]:
df.two.min(axis=0)

-4.5

In [85]:
# Approach 1: locate the missing cells with a boolean mask and assign in place.
# 'one' gets the mean of its remaining values (rounded to 1 decimal place),
# 'two' gets its smallest value.
# A single df.loc[row_mask, column] assignment is used instead of the chained
# df.one.loc[...] = ..., which can write to a temporary copy and leave df unchanged.
# Using isnull() masks also avoids hard-coding which row labels hold NaN.
df.loc[df['one'].isnull(), 'one'] = round(df['one'].mean(), 1)
df.loc[df['two'].isnull(), 'two'] = df['two'].min()
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.1,-4.5
d,0.75,-1.3


In [86]:
# Approach 2: let fillna replace every NaN in column 'two' with the column minimum
df['two'] = df['two'].fillna(df['two'].min())
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.1,-4.5
d,0.75,-1.3


---
# DataFrame Merging(병합)

In [93]:
# Left frame for the merge examples: seven rows keyed by a repeated letter
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'd'],
    'data1': range(7),
})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,d,6


In [91]:
# Right frame for the merge examples: one row per key a, b, d.
# Its value column is also named 'data1', so a merge with another frame that has
# a 'data1' column tells the two apart with the _x / _y suffixes.
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data1': range(3),
})
df2

Unnamed: 0,key,data1
0,a,0
1,b,1
2,d,2


In [94]:
# 병합하기 
pd.merge(df1,df2, on='key')

Unnamed: 0,key,data1_x,data1_y
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,d,6,2


> key c는 df2에 없어 match되지 않으므로 출력되지 않음 (d는 양쪽에 모두 있으므로 출력됨)

In [95]:
# 모두 보이기 
pd.merge(df1,df2,on='key',how='outer')

Unnamed: 0,key,data1_x,data1_y
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,a,4,0.0
4,a,5,0.0
5,c,3,
6,d,6,2.0


In [96]:
#df1 기준 
pd.merge(df1,df2,on='key',how='left')

Unnamed: 0,key,data1_x,data1_y
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,d,6,2.0


In [97]:
#df2 기준 
#df2 에 없는 c는 안보임 
pd.merge(df1,df2,on='key',how='right')

Unnamed: 0,key,data1_x,data1_y
0,a,2,0
1,a,4,0
2,a,5,0
3,b,0,1
4,b,1,1
5,d,6,2


## 두개의 DataFrame에 중복된 값이 있을 경우 

In [98]:
df1 = pd.DataFrame({'key':list('bbacab'),'data1':range(6)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [99]:
df2 = pd.DataFrame({'key':list('ababd'),'data2':range(5)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [100]:
pd.merge(df1,df2,on='key',how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [None]:
### key name이 다를 경우 

In [101]:
df1 = pd.DataFrame({'lkey':list('bbacab'),'data1':range(6)})
df2 = pd.DataFrame({'rkey':list('ababd'),'data2':range(5)})

In [103]:
pd.merge(df1,df2, left_on='lkey',right_on='rkey',how='inner')


Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,0,b,3
2,b,1,b,1
3,b,1,b,3
4,b,5,b,1
5,b,5,b,3
6,a,2,a,0
7,a,2,a,2
8,a,4,a,0
9,a,4,a,2


### 하나의 key값으로 병합하는 경우

In [105]:
# Left frame carries the key as a column; right frame carries it as the index
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})
df2 = pd.DataFrame({'group_val': [3.5, 7]}, index=list('ab'))
df2


Unnamed: 0,group_val
a,3.5
b,7.0


In [108]:
pd.merge(df1,df2,left_on='key', right_index=True)
# df2는 key가 없고 인덱스만 있어서 병합기준이 df1의 key

Unnamed: 0,key,data1,group_val
0,b,0,7.0
1,b,1,7.0
5,b,5,7.0
2,a,2,3.5
4,a,4,3.5


---
### Data Concatenating(연결)

In [109]:
s1 = pd.Series([0,1],index=['a','b'])
s1

a    0
b    1
dtype: int64

In [110]:
s2 = pd.Series([2,3,4],index=['c','d','e'])

In [111]:
s3 = pd.Series([5,6],index=['f','g'])

In [112]:
#s1,s2,s3 합치기
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [113]:
# Series을 합치면서 DataFrame 만들기
pd.concat([s1,s2,s3],axis=1, sort=True)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [115]:
# 컬럼에 이름 넣기 
pd.concat([s1,s2,s3],axis=1,sort=True,keys=['s1','s2','s3'])

Unnamed: 0,s1,s2,s3
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


--- 
# DataFrame의 Concatenation

In [128]:
# First frame: values 0..5 laid out as 3 rows (a, b, c) by 2 columns (one, two)
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [122]:
# Second frame: values 5..8 in a 2x2 grid, rows a and c only, columns three/four
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=['a', 'c'],
    columns=['three', 'four'],
)
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [129]:
pd.concat([df1,df2],axis=1,sort=True,ignore_index=True)

Unnamed: 0,0,1,2,3
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [130]:
pd.concat([df1,df2],axis=0,sort=True,ignore_index=True)

Unnamed: 0,four,one,three,two
0,,0.0,,1.0
1,,2.0,,3.0
2,,4.0,,5.0
3,6.0,,5.0,
4,8.0,,7.0,


---
# 데이터프레임 중복값 제거

In [172]:
# Sample frame with repeated (k1, k2) pairs for the duplicate-handling examples
df = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [173]:
# 중복값 확인
df.duplicated()
# 앞의 데이터와 중복이면 true

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [174]:
# 중복값 제거 
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [175]:
# 새로운 열 추가 
df['v1'] = np.arange(7)
df

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [176]:
# Drop rows that are duplicated across the full combination of columns
# (with the v1 column present every row is unique, so nothing is removed)
df.drop_duplicates()

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [177]:
# k1 기준 중복값 제거 
df['k1'].drop_duplicates()

0    one
3    two
Name: k1, dtype: object

In [178]:
# k1의 값을 중복값 제거 
# 해당 열 칼럼도 보여줌 ,기본값 첫번째
df.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [179]:
# k1의 값을 마지막 값으로 출력 
df.drop_duplicates(['k1'],keep='last')

Unnamed: 0,k1,k2,v1
2,one,2,2
6,two,4,6


---
# Category 사용하기

In [156]:
# Six records with a raw letter grade each, used for the category examples
df3 = pd.DataFrame({
    'id': list(range(1, 7)),
    'raw_grade': list('abbaae'),
})
df3

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [157]:
# category 자료형으로 전환하기 
df3['grade'] = df3['raw_grade'].astype('category')
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [158]:
df3['raw_grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: raw_grade, dtype: object

In [159]:
df3['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [160]:
# Relabel the categories in order: a -> very good, b -> good, e -> very bad.
# rename_categories returns a new Series, so the result is assigned back to the
# column; assigning to .cat.categories directly is not supported by current pandas.
df3['grade'] = df3['grade'].cat.rename_categories(['very good','good','very bad'])
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [164]:
df3.sort_values(by='grade')


Unnamed: 0,id,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bad


In [162]:
df3['grade']

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bad']

# 데이터 범위 정하기

In [165]:
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
# The five bin edges define four ranges: 18-25, 25-35, 35-60 and 60-100
# (pd.cut reports them as left-open, right-closed intervals such as (18, 25],
# so an age of exactly 25 falls in the first range)

In [167]:
cats = pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [168]:
# 범주 관련 code 보기
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [169]:
#빈도수 
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [171]:
# category 이름 정하기 
group_names = ['Youth','YoungAdult','Adult','Senior']
cat2 = pd.cut(ages,bins,labels=group_names)
cat2.value_counts()

Youth         5
YoungAdult    3
Adult         3
Senior        1
dtype: int64