In [1]:
# some imports
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rc('font', size=12) 
plt.rc('figure', figsize = (12, 5))

# Settings for the visualizations
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2,'font.family': [u'times']})

import pandas as pd
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# create output folder
if not os.path.exists('output'):
    os.makedirs('output')
if not os.path.exists('output/session1'):
    os.makedirs('output/session1')
    
    
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import neighbors
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures

import time
## lets comare all of them 

def display_scores(scores,model_name = None):
    if(model_name):
        print("----",model_name,"----")
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def plot_results(results,param_grid,variable_name):
        #plot the results
    plt.figure(figsize=(7, 7))
    plt.title("GridSearchCV",
              fontsize=16)

    values = param_grid[variable_name]
    
    min_v = min(values)
    max_v = max(values)
    
    
    plt.xlabel(variable_name)
    plt.ylabel("Score")

    ax = plt.gca()
    #ax.set_xlim(min_v, max_v)


    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results['param_'+variable_name].data, dtype=float)


    for sample, style in (('train', '--'), ('test', '-')):
        sample_score_mean = (-results['mean_%s_score' % (sample)])
        sample_score_std = (results['std_%s_score' % (sample)])
        ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                        sample_score_mean + sample_score_std,
                        alpha=0.1 if sample == 'test' else 0)
        ax.plot(X_axis, sample_score_mean, style,
                alpha=1 if sample == 'test' else 0.7,
                label="(%s)" % ( sample))

    best_index = np.nonzero(results['rank_test_score' ] == 1)[0][0]
    best_score =  (-results['mean_test_score' ][best_index])

    # Plot a dotted vertical line at the best score for that scorer marked by x
    ax.plot([X_axis[best_index], ] * 2, [best_score, best_score],
            linestyle='-.',  marker='x', markeredgewidth=3, ms=8)

    # Annotate the best score for that scorer
    ax.annotate("%0.2f" % best_score,
                (X_axis[best_index], best_score + 0.005))

    plt.legend(loc="best")
    plt.grid(False)
    plt.show()




In [2]:
# to make this notebook's output identical at every run
np.random.seed(42)

In [3]:
housing = pd.read_csv('dataset/housing-snapshot/train_set.csv',index_col=0) 
test_housing = pd.read_csv('dataset/housing-snapshot/test_set.csv',index_col=0) 
housing['Postcode'] = pd.Categorical(housing.Postcode)

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2)
len(train_set),len(test_set)

## Create a function that divides the data with an id
## checks that id is not train and test set
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

housing_with_id = housing.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

## create an id base on latitude and longitude
housing_with_id["id"] = housing["Longtitude"] * 1000 + housing["Lattitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

## divide using the scikit learn function
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)


housing["price_cat"] = pd.cut(housing["Price"],
                               bins=[0., 500000, 1000000, 1500000, 2000000., np.inf],
                               labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["price_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
def price_cat_proportions(data):
    return data["price_cat"].value_counts() / len(data)

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

compare_props = pd.DataFrame({
    "Overall": price_cat_proportions(housing),
    "Stratified": price_cat_proportions(strat_test_set),
    "Random": price_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

for set_ in (strat_train_set, strat_test_set):
    set_.drop("price_cat", axis=1, inplace=True)

## Para cuando quiero el dataset nuevo

In [4]:
X_train = strat_train_set.drop("Price", axis=1).copy()
y_train = strat_train_set["Price"].copy()

housing = strat_train_set.drop("Price", axis=1).copy()
housing_labels = strat_train_set["Price"].copy()

## Pipe del proyecto base

In [5]:


housing_num = housing.select_dtypes(include=[np.number])

## create a function to replace 0 by NaN
def replace_0_2_NaN(data):
    data[data == 0] = np.nan
    return data


# column index
Rooms_ix, Bedroom2_ix, Bathroom_ix, BuildingArea_ix = 0, 2, 3, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        rooms_per_building_area = X[:, Rooms_ix] / (1.0 +X[:, BuildingArea_ix])# add 1 to avoid 0 division
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, Bedroom2_ix] / (1.0 + X[:, Bathroom_ix]) # add 1 to avoid 0 division
            return np.c_[X, rooms_per_building_area, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_building_area]

num0_pipeline = Pipeline([
        ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
        ('imputer', SimpleImputer(strategy="median")),
        ('log',FunctionTransformer(np.log1p, validate=True)),
        ('std_scaler', StandardScaler()),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

num_attribs0 = ['Landsize','BuildingArea']
num_attribs1 = list(housing_num)
cat_attribs = ["CouncilArea",'Type','Suburb','Postcode']


full_pipeline = ColumnTransformer([
        ("num0", num0_pipeline, num_attribs0),
        ("num1", num_pipeline, num_attribs1),
        ("cat", cat_pipeline, cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing,housing_labels)
housing_prepared

<4345x517 sparse matrix of type '<class 'numpy.float64'>'
	with 82555 stored elements in Compressed Sparse Row format>

## Diferentes modelos de base

In [6]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)
n_neighbors = 3
knn_reg = neighbors.KNeighborsRegressor(n_neighbors)
knn_reg.fit(housing_prepared, housing_labels)
forest_reg = RandomForestRegressor(n_estimators=20, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
models = [(lin_reg,"lin_reg")]
for model in models:
    scores = cross_val_score(model[0], housing_prepared, housing_labels, scoring="neg_root_mean_squared_error", cv=5, n_jobs=-1)
    display_scores(-scores, model[1])

---- lin_reg ----
Mean: 376591.4243453588
Standard deviation: 39181.97082275836


## Mi pipeline

In [7]:


class DividedAtributes(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        proportion1 = X[:, 0] / (1.0 +X[:, 1])# add 1 to avoid 0 division
        proportion2 = np.ones(proportion1.shape, dtype=np.float)/proportion1
        return np.c_[proportion1,proportion1]

    
class invert_variable(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        data=np.zeros(X.shape,dtype=np.float)
        for i in range(X.shape[1]):
            data[:, 0]=1/X[:, 0]
        return np.c_[X,data]
class Categorical_2_mean(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        self.medias_por_columnas=[]
        self.media_y=0
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        self.media_y = y.mean()
        self.medias_por_columnas=[]
        #Para cada columna
        for i in range(X.shape[1]):
            medias = {}
            columna = X[:,i]
            unicos = np.unique(columna)
            #Para cada valor guardo la media
            for u in unicos:
                medias[u]=y[columna==u].mean()
                
            self.medias_por_columnas.append(medias)
            
        return self  # nothing else to do
    
    def transform(self, X):
        data = np.ones((X.shape[0]))
        
        for i in range(X.shape[1]):
            columna = X[:,i]
            media = self.medias_por_columnas[i]
            nueva_columna = np.zeros(X.shape[0])+self.media_y
            
            #Para cada valor distinto dentro de la categoría
            for k in media.keys():
                nueva_columna[columna == k] = media[k]
            
            #Si es la primera vez
            if i == 0:
                data = nueva_columna
            else:
                data = np.c_[data,nueva_columna]
        return data

class Clean_Outlayers_Quantile(BaseEstimator, TransformerMixin):
    def __init__(self,q=0.01): # no *args or **kargs
        self.q=q
        self.low_q_col=[]
        self.high_q_col=[]
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        #Para cada columna
        for i in range(X.shape[1]):
            columna = X[:,i]
            self.high_q_col.append(np.quantile(a=columna , q=1-self.q))
            self.low_q_col.append(np.quantile(a=columna ,q=self.q))
        return self  # nothing else to do
    
    def transform(self, X):
        for i in range(X.shape[1]):
            q_high=self.high_q_col[i]
            q_low =self.low_q_col[i]
            columna = X[:,i]
            columna[columna>q_high]=q_high
            columna[columna<q_low]=q_low
        return X

# Primero el preprocesamiento
rf_pipe_num0 = Pipeline([
    ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
    ("fill_nan",SimpleImputer(strategy="mean")),
    ("clean_outlayer",Clean_Outlayers_Quantile()),
    ('log',FunctionTransformer(np.log1p, validate=True)),
    ("std",StandardScaler()),
])

rf_pipe_num1 = Pipeline([
    ("fill_nan",SimpleImputer(strategy="mean")),
    ("clean_outlayer",Clean_Outlayers_Quantile()),
    ("std",StandardScaler()),
])

rf_pipe_poli = Pipeline([
    ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
    ("fill_nan",SimpleImputer(strategy="mean")),
    ('zeros2NaN_2',FunctionTransformer(func = replace_0_2_NaN,validate=False)),# por si acaso hay algun 0
    #("divided",DividedAtributes()),
    ("invert_1/var",invert_variable()),
    ("poly_interact_2",PolynomialFeatures(interaction_only=True,degree=2)),
    ("std",StandardScaler()),
])

rf_cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

#cat2mean = Categorical_2_mean(minim_instances=10)
rf_cat2mean_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('cat2mean', Categorical_2_mean()),
        ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
        ("std",StandardScaler()),
])
"""
NUMERICAL:    ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
CATEGORICAL:  ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']
"""
num_attribs0 = ['Landsize','BuildingArea']
num_attribs1 = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car',
'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']

num_attribs_poli = ['Rooms','BuildingArea']
#list(housing_num)
cat_attribs = []#'Type',"CouncilArea",'Suburb','Regionname','Postcode']

rf_full_pipe = ColumnTransformer([
    ("num0", rf_pipe_num0, num_attribs0),
    ("num1", rf_pipe_num1, num_attribs1),
    ("poli", rf_pipe_poli, num_attribs_poli),
    ("cat",  rf_cat_pipe, cat_attribs),
    ("cat_2_mean",  rf_cat2mean_pipe, cat_attribs)
])

X_train_prepared = rf_full_pipe.fit_transform(X_train,y_train)

X_test_prepared = rf_full_pipe.transform(test_housing)
print("Prepared: ",X_train_prepared.shape)

Prepared:  (4345, 22)


In [8]:
scores1 = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_root_mean_squared_error", cv=10,n_jobs=-1)

In [9]:
housing_prepared_mio = X_train_prepared

scores2 = cross_val_score(tree_reg, housing_prepared_mio, housing_labels,
                         scoring="neg_root_mean_squared_error", cv=10,n_jobs=-1)
print("El base: ")
display_scores(-scores1)
print("El mío: ")
display_scores(-scores2)

El base: 
Mean: 417320.9395217064
Standard deviation: 31111.63494074384
El mío: 
Mean: 443484.79720145726
Standard deviation: 42876.75489063647


In [10]:
"""
El base: 
Mean: 417320.9395217064
Standard deviation: 31111.63494074384
El mío: 
Mean: 403739.0195635212
Standard deviation: 40121.749009224426
"""

'\nEl base: \nMean: 417320.9395217064\nStandard deviation: 31111.63494074384\nEl mío: \nMean: 403739.0195635212\nStandard deviation: 40121.749009224426\n'

In [11]:
housing_prepared = full_pipeline.fit_transform(housing,housing_labels)

In [12]:

"""
NUMERICAL:    ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
CATEGORICAL:  ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']
"""
num_log_att = ['Landsize','BuildingArea']
num_att = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car',
'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']

num_poli_att = ['Rooms','BuildingArea','Bathroom']




#list(housing_num)
cat_att = ['Type',"CouncilArea",'Suburb','Regionname','Postcode']
cat2mean_att = ['Type',"CouncilArea",'Suburb','Regionname','Postcode']

initial_atr = {
    "num_log_att":num_log_att.copy() ,
    "num_att":num_att.copy() ,
    "num_poli_att":num_poli_att.copy() ,
    "cat_att":cat_att.copy() ,
    "cat2mean_att":cat2mean_att.copy() ,
}

initial_atr = {'num_log_att': ['Landsize', 'BuildingArea'],
               'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'],
               'num_poli_att': ['Rooms', 'BuildingArea'], 
               'cat_att': ['Type', 'Regionname', 'Postcode'], 
               'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']
              }

initial_full_pipe = ColumnTransformer([
            ("num0", rf_pipe_num0, initial_atr["num_log_att"]),
            ("num1", rf_pipe_num1, initial_atr["num_att"]),
            ("poli", rf_pipe_poli, initial_atr["num_poli_att"]),
            ("cat",  rf_cat_pipe, initial_atr["cat_att"]),
            ("cat_2_mean",rf_cat2mean_pipe, initial_atr["cat2mean_att"])
])


#model = RandomForestRegressor(random_state=42,n_estimators=30,max_features="sqrt",max_depth=20)
model = LinearRegression()

start = time.time()

X_train_initial = initial_full_pipe.fit_transform(X_train,y_train)
previous_score_winner = -cross_val_score(model, X_train_initial, housing_labels,scoring="neg_root_mean_squared_error", cv=5).mean()

actual_list_atts = []    
for k in initial_atr.keys():
    for v in initial_atr[k]:
        actual_list_atts.append((k,v))


def create_att_dic(list_att):
    dic_att={}
    for key in initial_atr.keys():
        dic_att[key]=[]
    for (k,v) in list_att:
        dic_att[k].append(v)
    return dic_att
        

buscando = True


cont=0
while buscando and len(actual_list_atts)!=0:
    
    end = time.time()
    tiempo=end - start
    start = end
    
    dic_=create_att_dic(actual_list_atts)
    print(cont,". Score: ",previous_score_winner)
    print("Tiempo (s) :" ,tiempo )
    print(len(dic_)," Att. ",previous_score_winner, dic_)

    scores_ronda=[]
    for at in actual_list_atts:
        actual_list_without_at = actual_list_atts.copy()
        actual_list_without_at.remove(at)
        #Aqui hemos quitado la feature
        dict_att=create_att_dic(actual_list_without_at)
        #Creamos la nueva pipe sin dicho atributo
        actual_full_pipe = ColumnTransformer([
            ("num0", rf_pipe_num0, dict_att["num_log_att"]),
            ("num1", rf_pipe_num1, dict_att["num_att"]),
            ("poli", rf_pipe_poli, dict_att["num_poli_att"]),
            ("cat",  rf_cat_pipe, dict_att["cat_att"]),
            ("cat_2_mean",rf_cat2mean_pipe, dict_att["cat2mean_att"])
        ])
        #Procesamos los datos con la nueva pipe
        X_train_actual = actual_full_pipe.fit_transform(X_train,y_train)
        #Calculamos con el modelo y guardamos como de bueno es
        actual_score = - cross_val_score(model, X_train_actual, housing_labels,scoring="neg_root_mean_squared_error", cv=5, n_jobs=-1).mean()
        scores_ronda.append((actual_score,at))
    
    winner = min(scores_ronda, key= lambda x : x[0])
    #Aquí tenemos el mejor score y el atributo que hay que quitar para tenerlo
    score_winner, att_winner = winner
    
    #Si este resultado es mejor que el mejor de la ronda anterior
    if score_winner < previous_score_winner:
        actual_list_atts.remove(att_winner)
        previous_score_winner = score_winner
        
    #Si no es mejor hemos acabado
    else:
        buscando=False
    
final_att = create_att_dic(actual_list_atts)
print(final_att)


    




0 . Score:  365278.50828061753
Tiempo (s) : 0.663611888885498
5  Att.  365278.50828061753 {'num_log_att': ['Landsize', 'BuildingArea'], 'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'], 'num_poli_att': ['Rooms', 'BuildingArea'], 'cat_att': ['Type', 'Regionname', 'Postcode'], 'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']}
{'num_log_att': ['Landsize', 'BuildingArea'], 'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'], 'num_poli_att': ['Rooms', 'BuildingArea'], 'cat_att': ['Type', 'Regionname', 'Postcode'], 'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']}
