### Как собрать конвейер обработки данных:

In [1]:
from datetime import datetime

start = datetime.now()

import pandas as pd
import numpy as np
import os

HOUSING_PATH = './data/'

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.dropna()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [2]:
# испортируем базовый класс и примесь
from sklearn.base import BaseEstimator, TransformerMixin

# [индекс колонки (переведенный в list) для колонки в списке колонок]
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Делаем свой класс"""
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs
        """Init создает атрибут add_bedrooms_per_room -
        делать или нет новую фичу (количество спален на комнату)"""
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        """Не пользуемся, но метод должен быть"""
        return self  # nothing else to do
    def transform(self, X, y=None):
        """Трансформ тоже должен быть, в нем создаем новые фичи"""
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        # если параметр True, то в вывод попадет колонка bedrooms_per_room
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        # иначе - не попадет
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [3]:
# создаем инстанс класса
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# добавляем с помощью него 2 новые числовые фичи
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs

array([[-122.23, 37.88, 41.0, ..., 'NEAR BAY', 6.984126984126984,
        2.5555555555555554],
       [-122.22, 37.86, 21.0, ..., 'NEAR BAY', 6.238137082601054,
        2.109841827768014],
       [-122.24, 37.85, 52.0, ..., 'NEAR BAY', 8.288135593220339,
        2.8022598870056497],
       ...,
       [-121.22, 39.43, 17.0, ..., 'INLAND', 5.20554272517321,
        2.325635103926097],
       [-121.32, 39.43, 18.0, ..., 'INLAND', 5.329512893982808,
        2.1232091690544412],
       [-121.24, 39.37, 16.0, ..., 'INLAND', 5.254716981132075,
        2.616981132075472]], dtype=object)

In [4]:
# Можно еще проще - написать просто функцию, а потом создать на ее основе нужный класс!
from sklearn.preprocessing import FunctionTransformer

def add_extra_features(X, add_bedrooms_per_room=True):
    rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
    else:
        return np.c_[X, rooms_per_household, population_per_household]

attr_adder = FunctionTransformer(add_extra_features, validate=False,
                                 kw_args={"add_bedrooms_per_room": False})
housing_extra_attribs = attr_adder.fit_transform(housing.values)

In [5]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household
0,-122.23,37.88,41,880,129,322,126,8.3252,452600,NEAR BAY,6.98413,2.55556
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500,NEAR BAY,6.23814,2.10984
2,-122.24,37.85,52,1467,190,496,177,7.2574,352100,NEAR BAY,8.28814,2.80226
3,-122.25,37.85,52,1274,235,558,219,5.6431,341300,NEAR BAY,5.81735,2.54795
4,-122.25,37.85,52,1627,280,565,259,3.8462,342200,NEAR BAY,6.28185,2.18147


In [6]:
housing_extra_attribs.shape

(20640, 12)

In [7]:
# Создаем пайплайн обработки числовых фич
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
        ('std_scaler', StandardScaler()),
    ])

housing_num = housing[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']]
housing_num_np = housing_num.to_numpy()
housing_num_tr = num_pipeline.fit_transform(housing_num_np)

In [8]:
housing_num_tr

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.62855945,
        -0.04959654, -1.1460242 ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.32704136,
        -0.09251223, -0.98725423],
       [-1.33282653,  1.03850269,  1.85618152, ...,  1.15562047,
        -0.02584253, -1.44051403],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.09031802,
        -0.0717345 ,  0.03680837],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.04021111,
        -0.09122515,  0.11820445],
       [-0.83369581,  1.75014627, -1.00430931, ..., -0.07044252,
        -0.04368215,  0.14049521]])

In [9]:
housing_num_tr.shape

(20640, 12)

In [10]:
# Теперь создадим класс для создания array из списка фичей

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [11]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat = housing[['ocean_proximity']]
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
# разреженная матрица
housing_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

In [12]:
# используем метод .toarray() для перехода к привычному массиву numpy
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [13]:
housing_cat_1hot.toarray().shape

(20640, 5)

In [14]:
# пайплайн для категориальных переменных

cat_attrs = ['ocean_proximity']

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attrs)),
        ('encoder', OneHotEncoder()),
    ])

housing_cat_tr = cat_pipeline.fit_transform(housing)

In [15]:
# пайплайн для числовых переменных

num_attrs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attrs)),
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
        ('std_scaler', StandardScaler()),    
    ])

num_pipeline.fit_transform(housing)

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.62855945,
        -0.04959654, -1.1460242 ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.32704136,
        -0.09251223, -0.98725423],
       [-1.33282653,  1.03850269,  1.85618152, ...,  1.15562047,
        -0.02584253, -1.44051403],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.09031802,
        -0.0717345 ,  0.03680837],
       [-0.87362627,  1.77823747, -0.84539315, ..., -0.04021111,
        -0.09122515,  0.11820445],
       [-0.83369581,  1.75014627, -1.00430931, ..., -0.07044252,
        -0.04368215,  0.14049521]])

In [16]:
# Теперь можно объединить пайплайны для категориальных и числовых переменных!
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [17]:
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

<20640x17 sparse matrix of type '<class 'numpy.float64'>'
	with 268320 stored elements in Compressed Sparse Row format>

In [18]:
housing_prepared.toarray().shape

(20640, 17)

In [19]:
pd.DataFrame(housing_prepared.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-1.327835,1.052548,0.982143,-0.804819,-0.970325,-0.974429,-0.977033,2.344766,2.129631,0.628559,-0.049597,-1.146024,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.045890,1.348276,0.861439,1.669961,2.332238,1.314156,0.327041,-0.092512,-0.987254,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.825561,-0.820777,-0.843637,1.782699,1.258693,1.155620,-0.025843,-1.440514,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.718768,-0.766028,-0.733781,0.932968,1.165100,0.156966,-0.050329,-0.492925,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.611974,-0.759847,-0.629157,-0.012881,1.172900,0.344711,-0.085616,-0.706141,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.388895,-0.512592,-0.443449,-1.216128,-1.115804,-0.155023,-0.049110,0.199820,0.0,1.0,0.0,0.0,0.0
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.920488,-0.944405,-1.008420,-0.691593,-1.124470,0.276881,0.005021,0.037412,0.0,1.0,0.0,0.0,0.0
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.125472,-0.369537,-0.174042,-1.142593,-0.992746,-0.090318,-0.071735,0.036808,0.0,1.0,0.0,0.0,0.0
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.305834,-0.604429,-0.393753,-1.054583,-1.058608,-0.040211,-0.091225,0.118204,0.0,1.0,0.0,0.0,0.0


In [20]:
len(housing_extra_attribs.columns)

12

In [21]:
housing_extra_attribs.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'population_per_household'],
      dtype='object')

In [22]:
len(housing.ocean_proximity.unique())

5

In [23]:
list(housing_extra_attribs.columns) + list(housing.ocean_proximity.unique())

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity',
 'rooms_per_household',
 'population_per_household',
 'NEAR BAY',
 '<1H OCEAN',
 'INLAND',
 'NEAR OCEAN',
 'ISLAND']

In [72]:
housing_full = pd.DataFrame(housing_prepared.toarray(), 
                            columns=list(housing_extra_attribs.columns) + 
                            list(housing.ocean_proximity.unique())).dropna()
housing_full.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,NEAR BAY,<1H OCEAN,INLAND,NEAR OCEAN,ISLAND
0,-1.327835,1.052548,0.982143,-0.804819,-0.970325,-0.974429,-0.977033,2.344766,2.129631,0.628559,-0.049597,-1.146024,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.348276,0.861439,1.669961,2.332238,1.314156,0.327041,-0.092512,-0.987254,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.825561,-0.820777,-0.843637,1.782699,1.258693,1.15562,-0.025843,-1.440514,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.718768,-0.766028,-0.733781,0.932968,1.1651,0.156966,-0.050329,-0.492925,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.611974,-0.759847,-0.629157,-0.012881,1.1729,0.344711,-0.085616,-0.706141,0.0,0.0,0.0,1.0,0.0


In [74]:
end = datetime.now()
end - start

datetime.timedelta(seconds=1151, microseconds=544763)

### Как сохранить обученную модель:

In [75]:
col_list_num = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 
                'population', 'households', 'median_income', 'rooms_per_household', 
                'population_per_household', 'NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND']
col_list_y = ['median_house_value']

col_list_cat = ['ocean_proximity']

In [78]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_full[col_list_num].iloc[:15000,:], housing_full[col_list_y].iloc[:15000,:])
housing_predictions = svm_reg.predict(housing_full[col_list_num].iloc[15000:,:])
housing_labels = housing_full[col_list_y].iloc[15000:,:]
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

  y = column_or_1d(y, warn=True)


0.7186257711511951

In [79]:
# from sklearn.externals import joblib # deprecated
import joblib
joblib.dump(svm_reg, 'my_model.pkl')

['my_model.pkl']

In [80]:
my_model = joblib.load('my_model.pkl')

In [82]:
svm_mse_ = mean_squared_error(housing_labels, my_model.predict(housing_full[col_list_num].iloc[15000:,:]))
svm_rmse_ = np.sqrt(svm_mse_)
svm_rmse_

0.7186257711511951