# ~2.4 데이터 준비하기

In [1]:
import pandas as pd

In [2]:
# Load the housing dataset from the local CSV file and preview the first rows
housing = pd.read_csv('housing.csv')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Stratified sampling
import numpy as np

# Bucket median_income into discrete categories: divide by 1.5 and round up
housing['income_cat'] = np.ceil(housing.median_income / 1.5)
# Merge every category of 5 or more into category 5.
# Assign the result back instead of calling .where(..., inplace=True) on the
# column selection: with pandas Copy-on-Write the selection is a temporary
# object, so an in-place call on it would leave `housing` unchanged.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5.0)

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

In [7]:
# One stratified 80/20 split on income_cat, with a fixed random seed
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=42)
# n_splits is 1, so the generator yields exactly one (train, test) index pair
train_index, test_index = next(split.split(housing, housing.income_cat))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# 2.5 머신러닝 준비하기

In [8]:
# Separate the training set into the target (labels) and the predictors (features).
# Note: `housing` is rebound to the training features from here on.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns=['median_house_value'])

In [9]:
housing.info()  # total_bedrooms contains missing (NA) values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16354 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
ocean_proximity       16512 non-null object
income_cat            16512 non-null float64
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


In [20]:
# Show the rows where total_bedrooms is missing
housing[housing.total_bedrooms.isna()]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
4629,-118.30,34.07,18.0,3759.0,,3296.0,1462.0,2.2708,<1H OCEAN,2.0
6068,-117.86,34.01,16.0,4632.0,,3038.0,727.0,5.1762,<1H OCEAN,4.0
17923,-121.97,37.35,30.0,1955.0,,999.0,386.0,4.6328,<1H OCEAN,4.0
13656,-117.30,34.05,6.0,2155.0,,1039.0,391.0,1.6675,INLAND,2.0
19252,-122.79,38.48,7.0,6837.0,,3468.0,1405.0,3.1662,<1H OCEAN,3.0
20267,-119.19,34.20,18.0,3620.0,,3171.0,779.0,3.3409,NEAR OCEAN,3.0
1606,-122.08,37.88,26.0,2947.0,,825.0,626.0,2.9330,NEAR BAY,2.0
4496,-118.22,34.05,34.0,1113.0,,928.0,290.0,3.1654,<1H OCEAN,3.0
16879,-122.39,37.60,34.0,707.0,,381.0,156.0,4.3750,NEAR OCEAN,3.0
2412,-119.45,36.61,24.0,1302.0,,693.0,243.0,3.7917,INLAND,3.0


### NA 데이터 정리하기

 - (교재) Imputer 활용

In [21]:
# 1. (Textbook) median imputation with scikit-learn
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its replacement,
# `sklearn.impute.SimpleImputer`, has been available since 0.20. Import the new class
# under the old name and fall back to the legacy one on older installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='median')

In [22]:
housing_num = housing.drop(columns=['ocean_proximity'])  # drop the nominal (text) column

In [26]:
imputer.fit(housing_num)  # computes the median of every column
imputer.statistics_   # the learned medians; note the trailing underscore in the attribute name

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409,    3.    ])

In [28]:
# Preview the numeric-only feature frame that is passed to the imputer
housing_num.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,2.0
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,5.0
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,2.0
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,2.0
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,3.0


In [27]:
X = imputer.transform(housing_num)   # NA values replaced by the fitted medians; the result is a NumPy array
X

array([[-121.89  ,   37.29  ,   38.    , ...,  339.    ,    2.7042,
           2.    ],
       [-121.93  ,   37.05  ,   14.    , ...,  113.    ,    6.4214,
           5.    ],
       [-117.2   ,   32.77  ,   31.    , ...,  462.    ,    2.8621,
           2.    ],
       ...,
       [-116.4   ,   34.09  ,    9.    , ...,  765.    ,    3.2723,
           3.    ],
       [-118.01  ,   33.82  ,   31.    , ...,  356.    ,    4.0625,
           3.    ],
       [-122.45  ,   37.77  ,   52.    , ...,  639.    ,    3.575 ,
           3.    ]])

In [29]:
# Wrap the imputed array back into a DataFrame, restoring the column names
# and the row labels of the training set
housing_tr = pd.DataFrame(
    data=X,
    index=housing.index,
    columns=housing_num.columns,
)

In [30]:
# Preview the imputed frame
housing_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,2.0
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,5.0
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,2.0
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,2.0
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,3.0


In [31]:
# Check the non-null counts after imputation
housing_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 9 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16512 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
income_cat            16512 non-null float64
dtypes: float64(9)
memory usage: 1.3 MB


 - fillna() 활용 방법

In [33]:
# Per-column medians of the numeric features.
# numeric_only=True skips the string column `ocean_proximity`; without it,
# pandas >= 2.0 raises a TypeError instead of silently dropping that column.
housing.median(numeric_only=True)
# For comparison, imputer.statistics_ was:
# array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
#         408.    ,    3.5409,    3.    ])

longitude             -118.5100
latitude                34.2600
housing_median_age      29.0000
total_rooms           2119.5000
total_bedrooms         433.0000
population            1164.0000
households             408.0000
median_income            3.5409
income_cat               3.0000
dtype: float64

In [39]:
# Fill each numeric column's missing values with that column's median.
# numeric_only=True keeps median() from failing on the string column
# `ocean_proximity` (pandas >= 2.0 raises a TypeError otherwise); that column
# is simply left out of the fill values.
housing_tr2 = housing.fillna(housing.median(numeric_only=True))
housing_tr2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16512 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
ocean_proximity       16512 non-null object
income_cat            16512 non-null float64
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


### 범주형 데이터 --> one-hot encoding

In [40]:
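# Encoding categories as arbitrary integers --> be careful: the codes can then be compared
# quantitatively (e.g. 0 < 1 < 2), which is only meaningful if the data is actually ordinal.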

In [None]:
# For categorical data that cannot meaningfully be compared quantitatively,
# give each category value its own column (holding 1 or 0).

In [47]:
# Map each category string to an integer code; `housing_categories` holds the
# category labels, positioned by their code
housing_cat = housing['ocean_proximity']
housing_cat_encoded, housing_categories = housing_cat.factorize()
print(housing_cat_encoded, housing_categories, sep='\n')

[0 0 1 ... 2 0 3]
Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')


In [48]:
# Turn the integer codes into one-hot vectors
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# The encoder takes 2-D input, so add a trailing axis: shape (n,) -> (n, 1).
# This is the same view as reshape(-1, 1), where -1 means "infer the row count".
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded[:, None])

In [58]:
# Show the codes as a single column; -1 lets NumPy infer the number of rows
housing_cat_encoded.reshape(-1,1)

array([[0],
       [0],
       [1],
       ...,
       [2],
       [0],
       [3]], dtype=int64)

In [49]:
housing_cat_1hot   # sparse matrix

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [59]:
housing_cat_1hot.toarray()   # convert the sparse matrix to a dense array

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

 - pd.get_dummies 활용하기

In [62]:
# One-hot encode directly with pd.get_dummies (one indicator column per category)
# NOTE(review): pandas >= 2.0 returns bool columns by default; pass dtype=int if
# 0/1 integers like the ones displayed below are required.
housing_cat_1hot = pd.get_dummies(housing.ocean_proximity)
housing_cat_1hot.head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
17606,1,0,0,0,0
18632,1,0,0,0,0
14650,0,0,0,0,1
3230,0,1,0,0,0
3555,1,0,0,0,0


### 나만의 변환기??

 - (교재)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions of total_rooms, total_bedrooms, population and households in `housing`
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D array.

    Always adds rooms-per-household and population-per-household, and adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    Column positions are read from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix`` constants.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks the original array and each new 1-D column side by side
        return np.c_[tuple([X] + extra_columns)]

# Append only the two per-household ratios (bedrooms_per_room is switched off)
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# transform() indexes columns by position, so pass the underlying array, not the DataFrame
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
# Rebuild a DataFrame from the extended array, naming the two appended columns
housing_extra_attribs = pd.DataFrame(
    data=housing_extra_attribs,
    columns=[*housing.columns, "rooms_per_household", "population_per_household"],
)
housing_extra_attribs.head()

 - pandas 에서 특성 추가하기

In [None]:
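# TODO: needs re-checking before enabling
# (population_per_household divides by households, as in CombinedAttributesAdder, not by total_rooms)
# housing['rooms_per_household'] = housing.total_rooms / housing.households
# housing['population_per_household'] = housing.population / housing.households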

### 스케일링(Scaling)
 - min-max 스케일링 : (d - min) / (max-min).   0~1 사이 값으로.  MinMaxScaler 
 - 표준화(Standardization) :   (d - mean) / 표준편차.   평균이 0, 분산이 1이 되게.   StandardScaler

# 그냥... 제 맘대로... 