## 데이터전처리 P.269
 - https://github.com/wesm/pydata-book/blob/2nd-edition/ch07.ipynb

### 7.1.1 누락된 데이터 처리하기

1. dropna()
 - NA 값을 하나라도 포함하는 로우를 삭제한다
2. fillna()
 - 원하는 값으로 결측치를 채운다

In [16]:
import pandas as pd
import numpy as np

In [23]:
# A string Series with one missing entry; np.nan is how the gap is marked.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado']
)
print(string_data)
string_data.isna()  # isna() is the same check as isnull()
# Python's None is treated as a missing value as well.
string_data[0] = None
string_data.isna()

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


0     True
1    False
2     True
3    False
dtype: bool

---
#### 7.1.2 누락된 데이터 골라내기
---

In [61]:
from numpy import nan as NA

# Float Series with two missing entries.
data = pd.Series([1, NA, 3.5, NA, 7])
# Print the raw Series, then two equivalent ways of keeping only the
# non-missing values: dropna() and boolean indexing with notna().
print(data, data.dropna(), data[data.notna()], sep='\n')

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
0    1.0
2    3.5
4    7.0
dtype: float64
0    1.0
2    3.5
4    7.0
dtype: float64


In [62]:
# 4x3 frame whose rows contain zero, two, three and one missing values.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
# Row-wise drop: any row holding at least one NA is excluded.
cleaned = data.dropna(axis=0, how='any')
# Original frame, a blank line, then the cleaned frame.
print(data, cleaned, sep='\n\n')

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0

     0    1    2
0  1.0  6.5  3.0


In [63]:
data.dropna(how='all') # drop only the rows in which every value is NA


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [64]:
# Add a column labelled 4 that holds NA in every row.
data[4] = NA
data
# axis=1 with how='all' drops only the columns whose values are all NA.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [65]:
# 7x3 frame of random normals with NA written into the top of two columns.
df = pd.DataFrame(np.random.randn(7, 3))
# Label-based slices are inclusive: rows 0..3 of column 1, rows 0..1 of column 2.
df.loc[:3, 1] = NA
df.loc[:1, 2] = NA
df
# Drop every row that has any NA ...
df.dropna()
# ... versus keeping the rows that have at least two non-NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.098661,,0.439349
3,0.139121,,-0.98582
4,-0.48681,-0.500433,-0.872683
5,0.549699,1.513889,0.286756
6,-2.926551,-0.526706,-1.238468


---
#### 7.1.3 결측치 채우기 p274
---

In [66]:
# Return a copy of df in which every NA is replaced by the constant 0.
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.348834,0.0,0.0
1,-0.10779,0.0,0.0
2,0.098661,0.0,0.439349
3,0.139121,0.0,-0.98582
4,-0.48681,-0.500433,-0.872683
5,0.549699,1.513889,0.286756
6,-2.926551,-0.526706,-1.238468


In [67]:
df.fillna({1: 0.5, 2: 0}) # dict keys are column labels: column 1 is filled with 0.5, column 2 with 0

Unnamed: 0,0,1,2
0,-1.348834,0.5,0.0
1,-0.10779,0.5,0.0
2,0.098661,0.5,0.439349
3,0.139121,0.5,-0.98582
4,-0.48681,-0.500433,-0.872683
5,0.549699,1.513889,0.286756
6,-2.926551,-0.526706,-1.238468


In [68]:
_ = df.fillna(0, inplace=True) # fill NA with 0 in place, so df itself is modified
df

Unnamed: 0,0,1,2
0,-1.348834,0.0,0.0
1,-0.10779,0.0,0.0
2,0.098661,0.0,0.439349
3,0.139121,0.0,-0.98582
4,-0.48681,-0.500433,-0.872683
5,0.549699,1.513889,0.286756
6,-2.926551,-0.526706,-1.238468


In [69]:
# 6x3 frame of random normals whose trailing rows are missing:
# column 1 from row 2 onwards, column 2 from row 4 onwards.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
# fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill()
# is the supported spelling. It carries the last valid value downwards.
filled = df.ffill()
# limit=2 fills at most two consecutive NA values per gap.
filled_limited = df.ffill(limit=2)
filled_limited

Unnamed: 0,0,1,2
0,-0.382826,-1.279451,0.335231
1,-1.272679,1.553981,0.860533
2,-0.996953,1.553981,1.20991
3,0.808169,1.553981,-0.582268
4,-0.886447,,-0.582268
5,-0.203934,,-0.582268


In [70]:
# Replace the missing entries with the mean of the observed values.
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### 7.2 데이터 변형

#### 7.2.1. 중복 제거하기

In [40]:
# Seven (k1, k2) rows; the final row repeats the one directly above it.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [42]:
data.duplicated() # True for a row that repeats an earlier row, False otherwise

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [44]:
data.drop_duplicates() # return the frame without the rows that duplicated() flags

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [46]:
data['v1'] = range(7) ## add a v1 column holding 0..6
print(data)
# Deduplicate on the k1 column only.
data.drop_duplicates(['k1'])

    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [48]:
data.drop_duplicates(['k1', 'k2'], keep='last') # compare on k1 and k2; keep the last row of each duplicated group

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


#### 7.2.2. 함수나 매핑을 이용한 데이터 변형

In [49]:
# Meat names (with inconsistent capitalisation) and their weights in ounces.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [50]:

# Lookup table from a lower-case meat name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])


In [51]:
lowercased = data['food'].str.lower() # lower-case every value in data['food']
print(lowercased)
# Look each lower-cased name up in meat_to_animal and store the result
# as a new 'animal' column.
data['animal'] = lowercased.map(meat_to_animal)
data

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [53]:
data['food'].map(lambda x: meat_to_animal[x.lower()]) # same lookup in one step: lower-case each name, then index meat_to_animal

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

#### 7.2.3. 값 치환

In [54]:
# Sample Series; the replace() cells below treat -999 and -1000 as values to swap out.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [55]:
# Replace every -999 with NaN; the result is a new Series and data is unchanged.
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [56]:
# A list of targets: each listed value is replaced by the same NaN.
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [57]:
data.replace([-999, -1000], [np.nan, 0]) # pairwise: -999 becomes NaN, -1000 becomes 0

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [59]:
data.replace({-999: np.nan, -1000: 0}) # dict form of the same mapping: -999 becomes NaN, -1000 becomes 0

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 타이타닉 데이터를 통한 누락 데이터 처리

In [1]:
import seaborn as sns

In [71]:
# Find missing data: count the values of the 'deck' column, NaN included.
df = sns.load_dataset('titanic')
# dropna=False keeps the NaN bucket in the counts. The count is computed
# once and reused, rather than evaluated a second time and discarded.
nan_deck_count = df['deck'].value_counts(dropna=False)
print(nan_deck_count)


NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64


In [72]:
# Element-wise NA check on the DataFrame with isnull() and notnull()
print(df.head())
print(df.head().isnull())
print(df.head().notnull())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
   survived  pclass    sex    age  sibsp  parch   fare  embarked  class  \
0     False   False  False  False  False  False  False     False  False   
1     False   False  False  False  False  False  False     False  Fal

In [74]:
# Count the missing values
print(df.isnull().sum(axis=0)) # number of NA cells in each column
print(df.isnull())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
     survived  pclass    sex    age  sibsp  parch   fare  embarked  class  \
0       False   False  False  False  False  False  False     False  False   
1       False   False  False  False  False  False  False     False  False   
2       False   False  False  False  False  False  False     False  False   
3       False   False  False  False  False  False  False     False  False   
4       False   False  False  False  False  False  False     False  False   
..        ...     ...    ...    ...    ...    ...    ...       ...    ...   
886     False   False  False  False  False  False  False     False  False   
887     False   False  False  False  False  False  False     False  False   
888     F

In [80]:
# Replacing missing data
# Per the counts printed above, NA occurs in age, embarked, deck and embark_town.
# Their dtypes are float64 (age), category (deck) and object (embarked, embark_town).
print(df.dtypes)

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object


In [79]:
# Print the whole frame before any replacement (pandas elides the middle rows).
print(df)

     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

       who  adult_male deck  embark_town alive  alo

In [82]:
# Replace every NaN in the frame with 0.0, modifying df in place.
# NOTE(review): this applies to all columns, so the non-numeric 'deck' column
# also receives 0 (see the output below) — confirm that is intended.
df.replace([np.nan], [0.0], inplace=True)
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,0,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,0,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,0,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,0,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,0.0,1,2,23.4500,S,Third,woman,False,0,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True
