In [None]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.externals import joblib


import load_transform_pipeline

## Text pipeline

In [None]:
# Free-text columns routed through the text branch of the feature pipeline.
text_cols = [
    "name",
    "summary",
    "space",
    "description",
    "neighborhood_overview",
    "notes",
    "transit",
    "access",
    "interaction",
    "house_rules",
    "host_about",
]

# English stop-word list from NLTK, reused inside the vectorizer settings below.
stopwords_en = stopwords.words('english')

# Settings handed to custom_Tfidf as its first argument
# (presumably forwarded to a TF-IDF vectorizer -- confirm in load_transform_pipeline).
textVectSettings = dict(
    stop_words=stopwords_en,
    max_df=0.95,
    min_df=.05,
    ngram_range=(1, 2),
    max_features=300,
)

# Steps of the text pipeline: select the text columns, transform them,
# then vectorize with the project's custom TF-IDF step.
text_pipeline = Pipeline([
    ('text_selector', load_transform_pipeline.FeatureSelector(text_cols)),
    ('text_transformer', load_transform_pipeline.TextTransformer()),
    ('text_vectorize', load_transform_pipeline.custom_Tfidf(textVectSettings, dict(fitSample=1))),
])

## Dummy explotable pipeline

In [None]:
# Column(s) sent through the two Dummy_explotable transformers and then scaled.
dummy_explotable_cols = ["host_verifications"]

# Branch order: column selection -> first transformer -> second transformer -> scaling.
dummy_explotable_pipeline = Pipeline([
    (
        'dummy_selector',
        load_transform_pipeline.FeatureSelector(dummy_explotable_cols),
    ),
    (
        'dummy_transformer',
        load_transform_pipeline.Dummy_explotable_transformer(),
    ),
    (
        'dummy_transformer2',
        load_transform_pipeline.Dummy_explotable_transformer_2(),
    ),
    (
        'std_scaler',
        load_transform_pipeline.StandardScaler(),
    ),
])

## Dummy pipeline

In [None]:
# Categorical columns that are imputed with a constant and then one-hot encoded.
# NOTE(review): "host_verifications" is also listed in dummy_explotable_cols --
# confirm that encoding it in both branches is intended.
dummy_cols = [
    "instant_bookable",
    "is_business_travel_ready",
    "cancellation_policy",
    "require_guest_phone_verification",
    "require_guest_profile_picture",
    "host_response_time",
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "city",
    "state",
    "property_type",
    "room_type",
    "bed_type",
    "host_verifications",
]

# Branch order: column selection -> fill gaps with the literal 'missing' -> one-hot encode,
# with handle_unknown='ignore' passed to the encoder.
dummy_pipeline = Pipeline([
    ('dummy_selector', load_transform_pipeline.FeatureSelector(dummy_cols)),
    (
        'imputer',
        load_transform_pipeline.SimpleImputer(strategy='constant', fill_value='missing'),
    ),
    (
        'onehot',
        load_transform_pipeline.OneHotEncoder(handle_unknown='ignore'),
    ),
])

## Numeric pipeline

In [None]:
# Numeric columns handled by the numerical branch (order preserved from the original list).
continuos_cols = [
    "reviews_per_month",
    "host_response_rate",
    "host_acceptance_rate",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "number_of_reviews_ltm",
    "review_scores_rating",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "review_scores_accuracy",
    "minimum_nights",
    "maximum_nights",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "host_listings_count",
    "number_of_reviews",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "guests_included",
]

# Branch order: column selection -> imputation (default strategy 'median')
# -> project-specific numeric transform -> scaling.
numerical_pipeline = Pipeline([
    ('num_selector', load_transform_pipeline.FeatureSelector(continuos_cols)),
    ('num_imputer', load_transform_pipeline.NumericalImputer(default_strategy='median')),
    ('num_transformer', load_transform_pipeline.NumericalTransformer()),
    ('std_scaler', load_transform_pipeline.StandardScaler()),
])

In [None]:
# Combine the four feature branches into a single transformer.
# n_jobs=-1 asks scikit-learn to run the branches using all available processors.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('numerical_pipeline', numerical_pipeline),
        ('dummy_pipeline', dummy_pipeline),
        ('text_pipeline', text_pipeline),
        ('dummy_explotable', dummy_explotable_pipeline),
    ],
    n_jobs=-1,
)

# Destination of the serialized, not-yet-fitted pipeline (relative to the notebook's directory).
filename = '../models/transformDataPipeline__not_fitted.pkl'

import pickle

# Use a context manager so the file handle is flushed and closed deterministically;
# the previous bare open() left the handle open until garbage collection.
with open(filename, 'wb') as pipeline_file:
    pickle.dump(full_pipeline, pipeline_file)