In [2]:
import numpy as np
import pandas as pd

from scipy.stats import skew

from sklearn.linear_model import Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [3]:
# Paths to the training and test CSV files.
DTRAIN_PATH = "data/train.csv"
DTEST_PATH = "data/test.csv"

# Helper functions.
def reading_data(path):
    """Load the CSV file at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame

def rmse_cv(model, x_train, y_train, k_folds=10):
    """Return the per-fold RMSE of ``model`` under ``k_folds``-fold cross-validation.

    The scorer used is "neg_mean_squared_error", so each fold's score is
    negated before the square root is taken. The result holds one value per
    fold; callers typically reduce it with ``.mean()``.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        x_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=k_folds,
    )
    return np.sqrt(-neg_mse_per_fold)

def grid_search(model, params, train_x, train_y):
    """Fit a grid search for ``model`` over the parameter grid ``params``.

    The search is created with ``n_jobs=-1``; the scoring metric and the
    cross-validation splitter are left at GridSearchCV's defaults.

    Returns the fitted search object (the value returned by ``fit``).
    """
    searcher = GridSearchCV(model, param_grid=params, n_jobs=-1)
    return searcher.fit(train_x, train_y)

def print_grid_result(name, grid_result):
    """Print a summary of a fitted grid search.

    Output, in order: a ``name`` header line, the best score with its
    parameters, one line per candidate (mean test score, standard deviation,
    parameters), and a trailing blank separator.
    """
    print(name + ":")

    # Best candidate first.
    best_line = "Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)
    print(best_line)

    # Then every candidate, in the order the search recorded them.
    results = grid_result.cv_results_
    candidates = zip(
        results['mean_test_score'],
        results['std_test_score'],
        results['params'],
    )
    for avg_score, score_std, candidate in candidates:
        print("%f (%f) with: %r" % (avg_score, score_std, candidate))
    print("\n")


def grid_search_lasso(train_x, train_y):
    """Grid-search the Lasso ``alpha`` penalty and print every candidate's score."""
    alpha_grid = {'alpha': (1, 0.1, 0.001, 0.0005)}
    print_grid_result("Lasso", grid_search(Lasso(), alpha_grid, train_x, train_y))

def grid_search_ridge(train_x, train_y):
    """Grid-search the Ridge ``alpha`` penalty and print every candidate's score."""
    alpha_grid = {'alpha': (1, 3, 5, 10, 15, 30, 50, 75)}
    print_grid_result("Ridge", grid_search(Ridge(), alpha_grid, train_x, train_y))


In [117]:
# Useful DataFrame inspection calls
# all_data.head()
# all_data.tail()
# all_data.info()
# all_data.describe()

In [4]:
# Remove columns whose fraction of NaN values is greater than `bound` (default 0.3).
def remove_nan_columns(df, bound=0.3):
    """Return ``df`` without the columns whose NaN fraction exceeds ``bound``.

    A column is dropped when (number of nulls / number of entries) is strictly
    greater than ``bound``. The input frame is never modified; when no column
    qualifies, ``df`` itself is returned.
    """
    sparse_columns = [
        name
        for name in df.columns
        if df[name].isnull().sum() / df[name].size > bound
    ]
    if not sparse_columns:
        return df
    return df.drop(columns=sparse_columns)

In [5]:
# Remove rows whose fraction of NaN values is at least `bound` (default 0.1, i.e. 10% of the columns).
def remove_nan_rows(df, bound=0.1):
    """Return a copy of ``df`` without the rows whose NaN fraction is at least ``bound``.

    The fraction of a row is (number of nulls in the row) / (number of columns
    of ``df``). Rows with a fraction >= ``bound`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    bound : float, default 0.1
        Inclusive threshold on the per-row NaN fraction.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows, with their original index
        labels.
    """
    # Normalise by df's own column count so the function does not depend on
    # any notebook-global variable.
    nan_fraction = df.isnull().sum(axis=1) / df.shape[1]

    # Written as a negated ">=" so that an undefined fraction (a frame with
    # zero columns gives NaN) keeps the row.
    keep = ~(nan_fraction >= bound)

    # Positional boolean mask: rows are selected by position, so duplicate
    # index labels cannot drag extra rows out. The explicit copy lets callers
    # assign into the result without touching ``df``.
    return df.loc[keep.values].copy()

In [6]:
# Remove redundancy: drop duplicate rows, then drop columns selected by a correlation threshold.
def remove_redundance(df, bound=0.0005):
    """Drop duplicate rows, then drop columns flagged by a correlation threshold.

    After de-duplicating rows, the absolute pairwise correlation matrix of the
    numeric (and boolean) columns is computed. A column is removed when its
    absolute correlation with at least one other column is strictly below
    ``bound``. Non-numeric columns take no part in the correlation and are
    always kept.

    Side effects: prints, per numeric column, whether its smallest correlation
    is below ``bound``, followed by the frame's shape before and after the
    column removal.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    bound : float, default 0.0005
        Exclusive upper threshold on the absolute correlation.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated frame without the flagged columns.
    """
    df = df.drop_duplicates()

    # Correlate numeric columns only. Selecting them explicitly keeps the call
    # working on frames that also hold string columns, regardless of what
    # DataFrame.corr does with non-numeric data by default.
    numeric = df.select_dtypes(include=["number", "bool"])
    corr = numeric.corr().abs()

    # Blank out entries equal to 1 (e.g. each column against itself).
    corr = corr[corr != 1]
    print(corr.min() < bound)

    # Flag a column as soon as any of its pairwise correlations falls below
    # the threshold; masked (NaN) entries compare False and never flag.
    flagged = (corr < bound).any(axis=0)
    remove = corr.columns[flagged.values]

    print(df.shape)
    df = df.drop(columns=remove)
    print(df.shape)

    return df

In [7]:
def pre_processing_data(train_data, test_data):
    """Build the model-ready feature matrices and the log-transformed target.

    Steps, in order:
      1. drop training rows with too many missing values (``remove_nan_rows``
         with its default threshold);
      2. stack the 'MSSubClass'..'SaleCondition' columns of the train and
         test frames so both receive the same transformations;
      3. drop columns with too many missing values (``remove_nan_columns``
         with its default threshold);
      4. apply ``log1p`` to the non-object columns whose skewness, measured
         on the training rows, is greater than 0.75;
      5. convert categorical columns to dummy indicators;
      6. fill the remaining missing values with each column's median.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame; must contain 'SalePrice' and the feature columns.
    test_data : pandas.DataFrame
        Test frame; must contain the feature columns.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test)`` where ``y_train`` is
        ``log1p(SalePrice)`` of the retained training rows.

    This function does not write into ``train_data`` or ``test_data``, so it
    can be called repeatedly on the same frames.
    """
    # Drop the training rows with too many missing values.
    train_data = remove_nan_rows(train_data)

    # Stack train and test features so preprocessing is applied uniformly.
    all_data = pd.concat((train_data.loc[:, 'MSSubClass':'SaleCondition'],
                          test_data.loc[:, 'MSSubClass':'SaleCondition']))

    # Drop the columns with too many missing values.
    all_data = remove_nan_columns(all_data)

    # Log-transform the sale price. The result goes into a new Series rather
    # than back into train_data, which may be the caller's own frame.
    y_train = np.log1p(train_data["SalePrice"])

    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

    # Skewness is about 0 for normally distributed data; a positive value
    # means more weight in the right tail of the distribution.
    skewed_feats = train_data[numeric_feats].apply(lambda x: skew(x.dropna()))

    # Keep the features with skewness > 0.75 and log-transform them.
    skewed_feats = skewed_feats[skewed_feats > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    # Convert categorical data into dummy indicators.
    all_data = pd.get_dummies(all_data)

    # Fill missing values with the median (mean and mode were the
    # alternatives explored).
    all_data = all_data.fillna(all_data.median())

    # Split back positionally: the training rows were stacked first.
    n_train = train_data.shape[0]
    x_train = all_data[:n_train]
    x_test = all_data[n_train:]

    return (x_train, y_train, x_test)


# Load the raw train/test CSVs and build the model matrices.
train_data = reading_data(DTRAIN_PATH)
test_data = reading_data(DTEST_PATH)

train_x, train_y, test_x = pre_processing_data(train_data, test_data)


# Mean cross-validated RMSE of Lasso; train_y is log1p(SalePrice), so the
# error is measured on the log scale.
print('Lasso(alpha=0.0005):')
print(rmse_cv(Lasso(alpha=0.0005), train_x, train_y).mean())
# print(rmse_cv(Ridge(alpha=5), train_x, train_y).mean())
# grid_search_lasso(train_x, train_y)
# grid_search_ridge(train_x, train_y)
print()
# XGB model
# print('XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):')
# print(rmse_cv(XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1), train_x, train_y).mean())


Lasso(alpha=0.0005):
0.1180499078769596



In [12]:
# Reload the raw data so this cell starts from untouched frames.
train_data = reading_data(DTRAIN_PATH)
test_data = reading_data(DTEST_PATH)

# NOTE: plain assignment binds a second name to the same DataFrame object;
# it does not copy the data.
cp_train_data = train_data

train_x, train_y, test_x = pre_processing_data(train_data, test_data)

# Prints the score of every Lasso alpha candidate (GridSearchCV's default
# scorer is used, not the RMSE from rmse_cv).
grid_search_lasso(train_x, train_y)



Lasso:
Best: 0.897062 using {'alpha': 0.0005}
0.376606 (0.012845) with: {'alpha': 1}
0.691365 (0.003917) with: {'alpha': 0.1}
0.894183 (0.019403) with: {'alpha': 0.001}
0.897062 (0.020630) with: {'alpha': 0.0005}




In [275]:
# Reload the raw data so this cell starts from untouched frames.
train_data = reading_data(DTRAIN_PATH)
test_data = reading_data(DTEST_PATH)

# NOTE(review): the recorded output of this cell includes remove_redundance's
# prints, so it appears to have been produced with that step enabled inside
# pre_processing_data -- confirm before comparing numbers.
train_x, train_y, test_x = pre_processing_data(train_data, test_data)


# Mean cross-validated RMSE (log scale) for Lasso, then Ridge.
print('Lasso(alpha=0.0005):')
print(rmse_cv(Lasso(alpha=0.0005), train_x, train_y).mean())
print(rmse_cv(Ridge(alpha=5), train_x, train_y).mean())
# grid_search_lasso(train_x, train_y)
# grid_search_ridge(train_x, train_y)
print()
# XGB model
print('XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):')
print(rmse_cv(XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1), train_x, train_y).mean())


MSSubClass        True
LotFrontage      False
LotArea          False
OverallQual      False
OverallCond      False
YearBuilt         True
YearRemodAdd      True
MasVnrArea       False
BsmtFinSF1       False
BsmtFinSF2        True
BsmtUnfSF        False
TotalBsmtSF      False
1stFlrSF         False
2ndFlrSF         False
LowQualFinSF      True
GrLivArea        False
BsmtFullBath      True
BsmtHalfBath      True
FullBath         False
HalfBath          True
BedroomAbvGr      True
KitchenAbvGr      True
TotRmsAbvGrd     False
Fireplaces       False
GarageYrBlt      False
GarageCars       False
GarageArea       False
WoodDeckSF       False
OpenPorchSF      False
EnclosedPorch     True
3SsnPorch        False
ScreenPorch      False
PoolArea          True
MiscVal           True
MoSold            True
YrSold            True
dtype: bool
(2811, 74)
(2811, 59)
Lasso(alpha=0.0005):
0.11790325613246747
0.120754448103974

XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):
0.117324535425

In [38]:
import operator

# Count the missing values of every column of the training frame and keep
# only the columns that have at least one.
df = train_data
nans_cols = {
    column_name: nans
    for column_name, nans in df.isnull().sum().items()
    if nans > 0
}

# Shown as (column, count) pairs ordered from fewest to most missing values.
sorted(nans_cols.items(), key=operator.itemgetter(1))



[('Electrical', 1),
 ('MasVnrType', 8),
 ('MasVnrArea', 8),
 ('BsmtQual', 37),
 ('BsmtCond', 37),
 ('BsmtFinType1', 37),
 ('BsmtExposure', 38),
 ('BsmtFinType2', 38),
 ('GarageType', 81),
 ('GarageYrBlt', 81),
 ('GarageFinish', 81),
 ('GarageQual', 81),
 ('GarageCond', 81),
 ('LotFrontage', 259),
 ('FireplaceQu', 690),
 ('Fence', 1179),
 ('Alley', 1369),
 ('MiscFeature', 1406),
 ('PoolQC', 1453)]

# Results
## Default

Lasso(alpha=0.0005):
0.1211865670995667

XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):
0.12249088341435015

## Removing columns with too many NaN values (upper bound = 0.3)

Lasso(alpha=0.0005):
0.12082472393696742

XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):
0.12304465867881136

## Removing rows and columns with too many NaN values
Lasso(alpha=0.0005):
0.11805243912299515

## Using Median
Lasso(alpha=0.0005):
0.1180499078769596

XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1):
0.116734983343264

## Using Mode
