In [139]:
import numpy as np
import pandas as pd

from scipy.stats import skew

from sklearn.linear_model import Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [140]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/train.csv')
# Concatenando dados de treino e teste para facilitar as operações
# de pré processamento.
all_data = pd.concat((train_data.loc[:,'MSSubClass':'SaleCondition'],
                  test_data.loc[:,'MSSubClass':'SaleCondition']))

In [21]:
# Funções interessantes
# all_data.head()
# all_data.tail()
# all_data.info()
# all_data.describe()

In [141]:
# Removing columns that the number of nans values is greater than 0.3 

col_upper_bound = 0.3
df = all_data
for column_name in df.columns:
    column = df[column_name]
    nan_percentage = column.isnull().sum()/column.size
    if(nan_percentage > upper_bound):
        df = df.drop(columns=[column_name])
all_data = df

In [136]:
all_data.shape

(2920, 73)

In [145]:
# Removing columns that the number of nans values is greater than 0.1 (more than 10 values null)
df = all_data
row_upper_bound = 0.05
row_size = df.shape[1]
for index, row in df.iterrows():
    rownan_percentage = row.isnull().sum()/row_size
    if (rownan_percentage >= row_upper_bound ):
        df = df.drop([index])

all_data = df

In [143]:
all_data.shape

(2698, 73)

In [137]:
# Aplicando Log nos preços de venda (valores em distribuição normal).
train_data["SalePrice"] = np.log1p(train_data["SalePrice"])

numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Calcula o skewness
# For normally distributed data, the skewness should be about 0.
# A skewness value > 0 means that there is more weight in the left tail of the distribution.
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))

# Seleciona os valores com skewness > 0.75
skewed_feats = skewed_feats[skewed_feats > 0.75].index

# Aplicando log nos valores com skewness maior que 0.75
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# Converte dados categorigos em dummy indicators
all_data = pd.get_dummies(all_data)

# Preenche os valores em branco com a média.
all_data = all_data.fillna(all_data.mean())

# Dados para treinamento e teste após pré processamento.
train_x = all_data[:train_data.shape[0]]
test_x = all_data[train_data.shape[0]:]
train_y = train_data.SalePrice

In [144]:
def rmse_cv(model, x_train, y_train, k_folds=10):
    return np.sqrt(-cross_val_score(model, x_train, y_train,
        scoring="neg_mean_squared_error", cv=k_folds))

print('Lasso(alpha=0.0005):')
print(rmse_cv(Lasso(alpha=0.0005), train_x, train_y).mean())
model_lasso = Lasso(alpha=0.0005).fit(train_x, train_y)
print()
# Modelo XBG
print('XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):')
# print(rmse_cv(XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1), train_x, train_y).mean())


Lasso(alpha=0.0005):
0.12081504846821753

XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):


# Results
## Default

Lasso(alpha=0.0005):
0.1211865670995667

XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):
0.12249088341435015

## Removing nans values per col (upper_bound = 0.3)

Lasso(alpha=0.0005):
0.12082472393696742

XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):
0.12304465867881136

## Removing nans values per row (upper_boud = 0.05)
Lasso(alpha=0.0005):
0.12081504846821753

