# Загрузка библиотек и файлов

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import lightgbm as lgb
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Stats
from scipy import stats
from scipy.stats import skew, norm, boxcox_normmax
from scipy.special import boxcox1p

# Misc
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, scale, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

import warnings
# Silence all warnings for the rest of the notebook.
# NOTE(review): this also hides pandas/NumPy deprecation and chained-assignment
# warnings — consider narrowing the filter while developing.
warnings.filterwarnings(action="ignore")
# Raise the display limits so long sequences and long frames are printed in full.
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the competition's training and test tables.
train = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)

# Пропущенные данные

In [None]:
def show_missing(df):
    """Print `DataFrame.info()` for the columns of `df` that contain missing values.

    The summary lists each incomplete column with its non-null count and dtype.
    Returns whatever `DataFrame.info()` returns (None); the summary itself is
    printed as a side effect.
    """
    has_nulls = df.isnull().any()
    incomplete_columns = df.columns[has_nulls].tolist()
    return df[incomplete_columns].info()

In [None]:
# Combine the training and test sets so that every feature transformation
# is applied to both of them in the same way.
train_labels = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(columns=['SalePrice'])
test_features = test

all_features = pd.concat([train_features, test_features], ignore_index=True)
all_features.shape

In [None]:
# Use the square root of the lot area as a stand-in for the lot frontage.
all_features['SqrtLotArea'] = np.sqrt(all_features['LotArea'])
sns.pairplot(all_features, vars=["LotFrontage", "SqrtLotArea"])

# Fill missing LotFrontage with SqrtLotArea from the same row.
# A single .loc assignment is used instead of chained indexing
# (`df.col[mask] = ...`), which may write to a temporary copy and silently
# leave the frame unchanged.
cond = all_features['LotFrontage'].isnull()
all_features.loc[cond, 'LotFrontage'] = all_features.loc[cond, 'SqrtLotArea']

In [None]:
# Helpers for working with categorical variables
def cat_exploration(df, column):
    """Return the frequency of each distinct value in `df[column]`."""
    values = df[column]
    return values.value_counts()


def cat_imputation(df, column, value):
    """Overwrite the missing entries of `df[column]` with `value`, in place.

    Nothing is returned; `df` itself is modified.
    """
    missing_rows = df[column].isnull()
    df.loc[missing_rows, column] = value

In [None]:
# Alley: look at the category counts, then mark missing entries as 'None'.
# The counts are not displayed because this is not the cell's last expression.
cat_exploration(df=all_features, column='Alley')
cat_imputation(df=all_features, column='Alley', value='None')

In [None]:
# Masonry veneer: missing type becomes 'None', missing area becomes 0.0.
for column, fill_value in (('MasVnrType', 'None'), ('MasVnrArea', 0.0)):
    cat_imputation(all_features, column, fill_value)

In [None]:
basement_cols = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2',
    'BsmtFinSF1', 'BsmtFinSF2',
]
# Inspect the basement columns for rows with a missing BsmtQual
# (the result is not displayed: it is not the cell's last expression).
all_features.loc[all_features['BsmtQual'].isnull(), basement_cols]

# Fill every basement column except the 'FinSF' ones with 'None'.
for column in basement_cols:
    if 'FinSF' in column:
        continue
    cat_imputation(all_features, column, 'None')

In [None]:
# Electrical: fill with 'SBrkr', the most frequent category.
cat_imputation(all_features, 'Electrical', 'SBrkr')
# FireplaceQu: missing entries become 'None'.
cat_imputation(all_features, 'FireplaceQu', 'None')

# Cross-tabulate the number of fireplaces against the imputed quality.
pd.crosstab(all_features['Fireplaces'], all_features['FireplaceQu'])

In [None]:
garage_cols = ['GarageType', 'GarageQual', 'GarageCond', 'GarageYrBlt',
               'GarageFinish', 'GarageCars', 'GarageArea']
# Inspect the garage columns for rows with a missing GarageType
# (the result is not displayed: it is not the cell's last expression).
all_features.loc[all_features['GarageType'].isnull(), garage_cols]

# Numeric garage columns are filled with 0, all others with 'None'.
# The dtype test uses pandas' own helper: the previous `dtype == np.object`
# check raises AttributeError because the `np.object` alias was removed
# from NumPy.
for cols in garage_cols:
    if pd.api.types.is_numeric_dtype(all_features[cols]):
        cat_imputation(all_features, cols, 0)
    else:
        cat_imputation(all_features, cols, 'None')

In [None]:
# Mark missing PoolQC / Fence / MiscFeature entries as 'None'.
for column in ('PoolQC', 'Fence', 'MiscFeature'):
    cat_imputation(all_features, column, 'None')

In [None]:
#all_data.var.value_counts()

In [None]:
# Fill the remaining categorical gaps with one fixed category per column.
# NOTE(review): these look like the most frequent category of each column — verify.
categorical_fill_values = {
    'MSZoning': 'RL',
    'Utilities': 'AllPub',
    'Exterior1st': 'VinylSd',
    'Exterior2nd': 'VinylSd',
    'KitchenQual': 'TA',
    'SaleType': 'WD',
    'Functional': 'Typ',
}
for column, fill_value in categorical_fill_values.items():
    cat_imputation(all_features, column, fill_value)


In [None]:
#all_data.TotalBsmtSF.describe()

In [None]:
# Fill the few missing numeric basement values with fixed constants.
# NOTE(review): the constants are presumably rounded column means — verify.
# The result is assigned back to the column: calling
# `df[col].fillna(..., inplace=True)` acts on a column selection and is not
# guaranteed to update the parent frame.
basement_numeric_fill = {
    'BsmtFinSF1': 441,
    'BsmtFinSF2': 50,
    'BsmtUnfSF': 560,
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'TotalBsmtSF': 1051,
}
for column, fill_value in basement_numeric_fill.items():
    all_features[column] = all_features[column].fillna(fill_value)



In [None]:
# List any columns that still contain missing values after imputation.
show_missing(df=all_features)

Пропущенные данные были заменены: на 0 или None — в случаях, когда NaN обозначало отсутствие признака; на среднее или наиболее частое значение — когда пропущенных наблюдений были единицы; либо значением, полученным из других связанных переменных.

In [None]:
# Add a few important engineered features.
# Total living area: basement plus first and second floors.
all_features['TotalSF'] = (
    all_features['TotalBsmtSF']
    + all_features['1stFlrSF']
    + all_features['2ndFlrSF']
)

# 0/1 indicator flags: 1 when the source column is strictly positive.
indicator_sources = {
    'haspool': 'PoolArea',
    'has2ndfloor': '2ndFlrSF',
    'hasgarage': 'GarageArea',
    'hasbsmt': 'TotalBsmtSF',
    'hasfireplace': 'Fireplaces',
}
for flag_name, source_column in indicator_sources.items():
    all_features[flag_name] = (all_features[source_column] > 0).astype('int64')

# Распределение целевой переменной — SalePrice

In [None]:
from scipy import stats

# Histogram of the raw target with a fitted normal curve overlaid.
sns.distplot(train['SalePrice'], fit=norm);


# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# The label is a raw string: `\m` and `\s` are invalid escape sequences in a
# normal string literal. The resulting text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

In [None]:
# The distribution is right-skewed, so log-transform the target to bring it
# closer to normal.
# The transform is computed from `train_labels`, the copy of the raw prices
# taken when the features were split off. Re-running this cell therefore gives
# the same result, whereas transforming train["SalePrice"] in place would apply
# the logarithm again on every re-run.
train["SalePrice"] = np.log1p(train_labels)

# Plot the new distribution
sns.distplot(train['SalePrice'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# The label is a raw string: `\m` and `\s` are invalid escape sequences in a
# normal string literal. The resulting text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# Ищем переменные с сильно скошенным распределением (skewed variables)

In [None]:
# Keep only the quantitative (non-object) columns
column_dtypes = all_features.dtypes
numeric_feats = column_dtypes[column_dtypes != "object"].index


def _column_skew(values):
    """Sample skewness of one column, ignoring missing entries."""
    return skew(values.dropna())


# Measure how skewed each numeric column is, most positive first
skewed_feats = (
    all_features[numeric_feats]
    .apply(_column_skew)
    .sort_values(ascending=False)
)
print("\nSkew in numerical features: \n")
skewness = skewed_feats.to_frame(name='Skew')
skewness.head(10)

Для сильно скошенных переменных применим преобразование Бокса — Кокса (Box-Cox transformation).

For each variable, a Box-Cox transformation estimates the value of lambda (typically searched between -5 and 5) that maximizes the normality of the data. For negative values of lambda, the transformation performs a variant of the reciprocal of the variable. At a lambda of zero, the variable is log-transformed, and for positive lambda values, the variable is raised to the power of lambda. In this notebook lambda is not estimated per variable: a fixed value of 0.15 is used for every skewed feature.

In [None]:
# Keep only the features whose absolute skewness exceeds the threshold.
# The filter is a row mask built from the 'Skew' column. Masking the frame
# with another frame (`skewness[abs(skewness) > 0.75]`) only turns the
# non-matching cells into NaN and keeps every row, so every numeric feature
# would be transformed and the count below would be wrong.
skewness = skewness[skewness['Skew'].abs() > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

skewed_features = skewness.index
lam = 0.15  # fixed Box-Cox lambda applied to every skewed feature
# NOTE: this loop overwrites the columns, so re-running the cell transforms
# the already-transformed values again.
for feat in skewed_features:
    all_features[feat] = boxcox1p(all_features[feat], lam)

# Alternative kept for reference: a plain log1p transform of the same columns.
#all_data[skewed_features] = np.log1p(all_data[skewed_features])

In [None]:
# Replace the categorical columns with one-hot indicator columns
encoded_features = pd.get_dummies(all_features)
all_features = encoded_features.reset_index(drop=True)
all_features.shape

In [None]:
# Split the combined feature table back into the training and test parts.
# The first len(y) rows are the training rows; the rest is the test set.
# NOTE(review): all columns are kept as features, including any identifier
# column — confirm this is intended.
y = train['SalePrice'].reset_index(drop=True)
n_train = len(y)
X = all_features.iloc[:n_train, :]
X_sub = all_features.iloc[n_train:, :]
X.shape, y.shape, X_sub.shape

# Построение моделей

In [None]:
# Use 10-fold cross-validation with shuffling and a fixed seed
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# Error metrics
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between `y` and `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=X):
    """Return the per-fold cross-validated RMSE of `model` on `X`.

    The target is the notebook-level `y` and the folds come from `kfolds`.
    The default for `X` is bound when this function is defined.
    """
    neg_mse = cross_val_score(
        model, X, y,
        scoring="neg_mean_squared_error",
        cv=kfolds,
    )
    rmse = np.sqrt(-neg_mse)
    return rmse

In [None]:
# Candidate regularisation strengths passed to RidgeCV
alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9,
    15, 15.1, 15.2, 15.3, 15.4, 15.5,
]
# Candidate regularisation strengths passed to LassoCV
alphas2 = [
    5e-05, 0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007, 0.0008,
]
# Candidate alphas and L1 ratios passed to ElasticNetCV
e_alphas = [
    0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007,
]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# Далее обозначаем модели и их гиперпараметры

In [None]:
# Linear models are sensitive to outliers, so each one is preceded by a
# RobustScaler in its pipeline.
# `max_iter` is given as an integer: a float such as 1e7 is rejected by
# scikit-learn's parameter validation in recent releases.
MAX_ITER = 10_000_000

ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=MAX_ITER, alphas=alphas2, random_state=42, cv=kfolds),
)
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=MAX_ITER, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio),
)
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

In [None]:
# Gradient boosting with the Huber loss and a fixed seed
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)

In [None]:
# LightGBM regressor: many small trees with row and feature subsampling
lightgbm_params = {
    'objective': 'regression',
    'num_leaves': 4,
    'learning_rate': 0.01,
    'n_estimators': 5000,
    'max_bin': 200,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'bagging_seed': 7,
    'feature_fraction': 0.2,
    'feature_fraction_seed': 7,
    'verbose': -1,
}
lightgbm = LGBMRegressor(**lightgbm_params)

In [None]:
# XGBoost regressor.
# The objective uses the current name 'reg:squarederror'; 'reg:linear' is the
# deprecated alias for the same squared-error objective.
xgboost = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)

In [None]:
# Stacked ensemble: six base regressors, with XGBoost as the meta-regressor.
# The meta-regressor also sees the original features
# (use_features_in_secondary=True).
# `random_state` is fixed so the internal cross-validation splits, and hence
# the stacked predictions, are reproducible between runs.
stack_gen = StackingCVRegressor(
    regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
    random_state=42,
)

Считаем средние ошибки моделей 

In [None]:
# Cross-validated RMSE (mean and standard deviation over the folds) for every
# stand-alone model. Ridge is reported too: previously its score was computed
# and then overwritten by the LASSO score without being printed.
models_to_score = [
    ("ridge", ridge),
    ("LASSO", lasso),
    ("elastic net", elasticnet),
    ("SVR", svr),
    ("lightgbm", lightgbm),
    ("gbr", gbr),
    ("xgboost", xgboost),
]
for label, model in models_to_score:
    score = cv_rmse(model)
    print("{}: {:.4f} ({:.4f})\n".format(label, score.mean(), score.std()))

In [None]:
print('START Fit')

# The stacked ensemble is fitted on plain NumPy arrays.
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

# Fit every stand-alone model on the full training data, printing its label
# first as a progress marker.
fitted_models = {}
for label, estimator in (
    ('elasticnet', elasticnet),
    ('Lasso', lasso),
    ('Ridge', ridge),
    ('Svr', svr),
    ('GradientBoosting', gbr),
    ('xgboost', xgboost),
    ('lightgbm', lightgbm),
):
    print(label)
    fitted_models[label] = estimator.fit(X, y)

# Expose the fitted models under the names used by the blending step.
elastic_model_full_data = fitted_models['elasticnet']
lasso_model_full_data = fitted_models['Lasso']
ridge_model_full_data = fitted_models['Ridge']
svr_model_full_data = fitted_models['Svr']
gbr_model_full_data = fitted_models['GradientBoosting']
xgb_model_full_data = fitted_models['xgboost']
lgb_model_full_data = fitted_models['lightgbm']

Ансамбль из моделей склонен к переобучению, поэтому используем блендинг. Основываясь на качестве моделей, придаём их предсказаниям веса: моделям с меньшей ошибкой присваиваем больший вес. Сумма весов равна 1, чтобы итоговое предсказание не было смещено по масштабу.

In [None]:
def blend_models_predict(X):
    """Return the weighted blend of all fitted models' predictions for `X`.

    Each stand-alone model predicts on `X` directly; the stacked model
    predicts on `np.array(X)`. The weights (0.1, 0.05, 0.1, 0.1, 0.1, 0.15,
    0.1 and 0.3 for the stack) add up to 1.
    """
    weighted_models = (
        (0.1, elastic_model_full_data),
        (0.05, lasso_model_full_data),
        (0.1, ridge_model_full_data),
        (0.1, svr_model_full_data),
        (0.1, gbr_model_full_data),
        (0.15, xgb_model_full_data),
        (0.1, lgb_model_full_data),
    )
    # Accumulate left to right so the floating-point sum is formed in the
    # same order as a single chained expression would form it.
    first_weight, first_model = weighted_models[0]
    blended = first_weight * first_model.predict(X)
    for weight, model in weighted_models[1:]:
        blended = blended + (weight * model.predict(X))
    return blended + (0.3 * stack_gen_model.predict(np.array(X)))

In [None]:
# Score the blended prediction on the training data.
print('RMSLE score on train data:')
train_blend_score = rmsle(y, blend_models_predict(X))
print(train_blend_score)
# The blend scores much better than each individual model did.
# NOTE(review): this score is measured on the data the models were fitted on,
# while the individual scores are cross-validated, so the two are not directly
# comparable.

# Сохраняем результат

In [None]:
# Load the sample submission as a template for the output file.
# The path is absolute, like the train/test paths loaded earlier, so the read
# does not depend on the current working directory.
submission = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
submission.shape

In [None]:
# Predictions are on the log1p scale: invert with expm1, then round down.
log_price_predictions = blend_models_predict(X_sub)
submission.iloc[:, 1] = np.floor(np.expm1(log_price_predictions))

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="submission_1.csv", index=False)