### Обучение пайплайна

1. Загрузим данные https://www.kaggle.com/amaanafif/chennai-house-price
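2. Соберем пайплайн с простейшим препроцессингом табличных данных: выбор колонок и стандартизация числового признака `area` (категориальные признаки передаются в модель как есть)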
3. Обучим регрессионную модель CatBoost (CatBoostRegressor) и сохраним обученный пайплайн на диск

In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

#normalizing data
from sklearn.preprocessing import StandardScaler
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

import sklearn.datasets



In [2]:
# Load the data set from the working directory.
df = pd.read_csv('clean_data.csv')
# Preview the first five rows to sanity-check the columns.
df.head(5)

Unnamed: 0,price,area,status,bhk,bathroom,age,location,builder
0,37.49,872,Ready to move,2,,1.0,Sembakkam,MP Developers
1,93.54,1346,Under Construction,3,2.0,,Selaiyur,DAC Promoters
2,151.0,2225,Under Construction,3,,0.0,Mogappair,Casagrand Builder Private Limited
3,49.0,1028,Ready to move,2,2.0,3.0,Ambattur,Dugar Housing Builders
4,42.28,588,Under Construction,2,1.0,0.0,Pallavaram,Radiance Realty Developers India Ltd


In [3]:
# Summary statistics of the numeric columns; the `count` row exposes columns with missing values.
df.describe()

Unnamed: 0,price,area,bhk,bathroom,age
count,2620.0,2620.0,2620.0,1403.0,1729.0
mean,93.834683,1282.925191,2.443893,2.35923,1.355119
std,113.609349,692.566319,0.811984,0.844951,2.102682
min,12.83,300.0,1.0,1.0,0.0
25%,42.0,877.0,2.0,2.0,0.0
50%,61.735,1091.5,2.0,2.0,0.0
75%,90.0,1471.25,3.0,3.0,3.0
max,1422.0,6700.0,8.0,7.0,32.0


In [4]:
# Shape that would remain if every row containing a missing value were dropped (df itself is not modified).
df.dropna().shape

(580, 8)

In [5]:
# Column dtypes: `object` columns are the string-valued (categorical) candidates.
df.dtypes

price       float64
area          int64
status       object
bhk           int64
bathroom    float64
age         float64
location     object
builder      object
dtype: object

In [6]:
# Hold out 25% of the rows for evaluation; random_state pins the shuffle so
# the saved CSV files are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    # Keyword form: passing the axis positionally (`df.drop('price', 1)`) was
    # deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
    df.drop(columns='price'),
    df['price'],
    test_size=0.25,
    random_state=42,
)
# Persist both splits without the index column.
# save test
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)
# save train
X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False)

In [7]:
# Display the training features (the target column `price` has been removed).
X_train

Unnamed: 0,area,status,bhk,bathroom,age,location,builder
192,2228,Under Construction,4,,0.0,Mogappair,Casagrand Builder Private Limited
174,1390,Ready to move,3,,1.0,Kundrathur,MP Developers
1691,645,Under Construction,2,2.0,,Royapettah,seller
1521,1850,Ready to move,3,3.0,,Perumbakkam,smartassetsindia
1598,957,Ready to move,2,,1.0,Keelkattalai,Bala
...,...,...,...,...,...,...,...
1638,2650,Ready to move,5,5.0,,West Mambalam,seller
1095,1259,Under Construction,2,2.0,0.0,Sholinganallur,Prestige Estates Projects Ltd
1130,2280,Ready to move,4,3.0,5.0,Iyyappanthangal,seller
1294,1550,Ready to move,3,3.0,,Ayanavaram,MEHTA REAL ESTATE CHENNAI LLP


In [8]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that pulls one column out of the input by key.

    ``transform`` returns ``X[key]`` (single-bracket indexing), so the result
    is whatever ``X`` yields for that key -- presumably a 1-D pandas Series
    when ``X`` is a DataFrame.
    """

    def __init__(self, key):
        # Key (column label) to extract at transform time.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the configured column of ``X``."""
        column = self.key
        return X[column]
    
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that pulls one column out of the input by key,
    keeping it two-dimensional.

    ``transform`` indexes with a one-element list (``X[[key]]``), so the
    result keeps a column axis -- presumably a one-column DataFrame when ``X``
    is a DataFrame. Intended for numeric columns.
    """

    def __init__(self, key):
        # Key (column label) to extract at transform time.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the configured column."""
        wanted = [self.key]
        return X[wanted]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder whose output columns are fixed at fit time.

    ``fit`` records the dummy columns produced by ``pd.get_dummies`` for the
    training data. ``transform`` encodes new data and aligns it to exactly
    those columns, in the same order:

    * a category seen at fit time but absent from the new data becomes an
      all-zero column;
    * a category that appears only in the new data is dropped.

    Parameters
    ----------
    key : str
        Prefix passed to ``pd.get_dummies`` for the generated column names.
    """

    def __init__(self, key):
        self.key = key
        # Dummy column names learned in fit(); empty until then.
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy columns generated for the training data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Return the one-hot encoding of ``X`` aligned to the fitted columns."""
        encoded = pd.get_dummies(X, prefix=self.key)
        # The previous loop zero-filled columns that exist only in the new
        # data, so a fit-time category missing here raised KeyError on
        # selection. reindex adds the missing fit-time columns as zeros and
        # drops the unseen ones.
        return encoded.reindex(columns=self.columns, fill_value=0)


In [9]:
# Column groups used by the model.
# Currently disabled candidates: 'status' (categorical), 'bhk' and 'bathroom' (pass-through).
cat_cols = ['location', 'builder']
base_cols = ['age']
continuos_cols = ['area']

# Continuous columns: select the column as a 2-D frame, then standardize it.
continuos_transformers = [
    (
        name,
        Pipeline([
            ('selector', NumberSelector(key=name)),
            ('standard', StandardScaler()),
        ]),
    )
    for name in continuos_cols
]

# Categorical columns: selection only. The one-hot step (OHEEncoder) is left
# out, so the raw values are passed downstream unchanged.
cat_transformers = [
    (name, Pipeline([('selector', NumberSelector(key=name))]))
    for name in cat_cols
]

# Remaining columns are passed through untouched.
base_transformers = [
    (name, Pipeline([('selector', NumberSelector(key=name))]))
    for name in base_cols
]

In [10]:
# Concatenate the per-column outputs side by side. The categorical
# transformers come first, so their columns occupy the leading positions of
# the combined feature matrix.
feats = FeatureUnion(
    transformer_list=cat_transformers + continuos_transformers + base_transformers
)
feature_processing = Pipeline(steps=[('feats', feats)])

# Fit the preprocessing on the training split only, then apply it to both splits.
x_train_ = feature_processing.fit_transform(X_train)
x_test_ = feature_processing.transform(X_test)

In [11]:
%%time
from catboost import CatBoostRegressor

# CatBoost settings kept fixed for this run.
# NOTE(review): no eval_set is passed to fit(), so 'early_stopping_rounds'
# presumably never triggers -- confirm against the CatBoost docs.
frozen_params = {
    'silent': True,
    'one_hot_max_size': 20,
    'early_stopping_rounds': 50,
    'boosting_type': 'Ordered',
    'allow_writing_files': False,
}

# `feats` places the categorical transformers first, so the categorical
# features are the leading len(cat_cols) columns. Deriving the indices keeps
# them correct if cat_cols changes (previously hard-coded as range(2)).
cat_feature_indices = list(range(len(cat_cols)))

pipeline = Pipeline([
    ('features', feats),
    # The step keeps its historical name 'classifier' even though the model is
    # a regressor: renaming it would change the key used to reach the step.
    ('classifier', CatBoostRegressor(random_state=12,
                                     cat_features=cat_feature_indices,
                                     **frozen_params)),
])

pipeline.fit(X_train, y_train)

CPU times: user 12.2 s, sys: 1.29 s, total: 13.5 s
Wall time: 3.22 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('location',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='location'))])),
                                                ('builder',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='builder'))])),
                                                ('area',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='area')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('age',
                                       

In [12]:
# Importances of the fitted model, one value per column produced by the
# 'features' step (in that step's output order).
pipeline.named_steps['classifier'].get_feature_importance()

array([17.40419806, 11.06512202, 64.52812537,  7.00255455])

In [13]:
# Serialize the whole fitted pipeline (preprocessing + model) to disk with dill.
with open("catboost_pipeline.dill", "wb") as out_file:
    dill.dump(pipeline, out_file)