In [None]:
import numpy as np
import pandas as pd
%config Completer.use_jedi = False
# Show every column when a DataFrame is displayed, but cap the number of rows.
# The fully-qualified "display." prefix is used because the bare names are
# ambiguous in recent pandas releases (they also match "styler.render.max_columns"
# / "styler.render.max_rows") and raise an OptionError there.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 90)


In [None]:
# import sklearn
# print(sklearn.__version__)
# !pip uninstall scikit-learn==1.0.2 -y
# !pip install --upgrade scikit-learn==0.23.2
# !pip install -q pycaret 

In [None]:
# Read the three competition CSV files from the shared input directory.
INPUT_DIR = '../input/house-prices-advanced-regression-techniques'
train0 = pd.read_csv(f'{INPUT_DIR}/train.csv')
test0 = pd.read_csv(f'{INPUT_DIR}/test.csv')
sample_submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

In [None]:
train0.head()

In [None]:
test0.head()

In [None]:
sample_submission.head()

# data cleaning


In [None]:
test0.isna().sum()

In [None]:
# Stack the train and test features into a single frame (train rows first) so
# that both go through the same preprocessing. The target and the test ids are
# set aside before the identifying columns are dropped.
target = train0['SalePrice']
test_ids = test0['Id']
train1 = train0.drop(columns=['Id', 'SalePrice'])
test1 = test0.drop(columns='Id')
data1 = pd.concat([train1, test1], axis=0, ignore_index=True)
data1.head()

In [None]:
target

**Cleaning**

In [None]:
data1.select_dtypes(np.number)

In [None]:
data2 =data1.copy()

In [None]:
# Cast MSSubClass to string so it is no longer picked up as a numeric column
# (presumably it is a categorical code rather than a quantity — confirm against
# the data description).
data2['MSSubClass']=data2['MSSubClass'].astype(str)

In [None]:
#Fill Categorical Missing Values

In [None]:
# Impute with the constant string 'None'. For these columns a missing value
# presumably means the feature is absent (no alley, no basement, no garage, ...)
# — confirm against the data description.
none_fill_columns = [
    'Alley', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
for column in none_fill_columns:
    data2[column] = data2[column].fillna('None')

# Impute with the column mode (the most frequent value of each column).
mode_fill_columns = [
    'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Electrical', 'KitchenQual', 'Functional',
    'SaleType',
]
for column in mode_fill_columns:
    data2[column] = data2[column].fillna(data2[column].mode()[0])
    

In [None]:
data2.select_dtypes('object').loc[:,data2.isna().sum()>0].columns

In [None]:
data2['Electrical'].unique()

In [None]:
data2.select_dtypes(np.number).isna().sum()

In [None]:
data3 = data2.copy()

# Numeric Missing Values

In [None]:
from sklearn.neighbors import KNeighborsRegressor
data3.select_dtypes(np.number).isna().sum()

In [None]:
def knn_impute(df, na_target):
    """Fill the missing values of one numeric column with KNN predictions.

    A ``KNeighborsRegressor`` (default settings) is fitted on the rows where
    ``na_target`` is present, using every numeric column that has no missing
    values as a predictor, and its predictions replace the missing entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    na_target : str
        Name of the numeric column whose missing values are imputed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing values of ``na_target`` filled in.
    """
    df = df.copy()
    missing_mask = df[na_target].isna()

    # Nothing to impute: return the copy untouched instead of asking the
    # regressor to predict on an empty frame, which raises an error.
    if not missing_mask.any():
        return df

    numeric_df = df.select_dtypes(np.number)
    # Predictors are the numeric columns that are complete in every row.
    predictor_columns = numeric_df.loc[:, numeric_df.notna().all()].columns

    X_train = numeric_df.loc[~missing_mask, predictor_columns]
    y_train = numeric_df.loc[~missing_mask, na_target]
    X_test = numeric_df.loc[missing_mask, predictor_columns]

    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)
    df.loc[missing_mask, na_target] = knn.predict(X_test)

    return df

In [None]:

# Impute the numeric columns one after another. The order is kept as listed:
# each call re-derives its predictors from the columns that are complete at
# that point, so earlier imputations feed the later ones.
numeric_columns_with_na = (
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
    'GarageCars', 'GarageArea',
)
for na_column in numeric_columns_with_na:
    data3 = knn_impute(data3, na_column)

In [None]:
data4 =data3.copy()


# Feature Engineering

In [None]:
# Living area divided by the count of rooms, bathrooms and kitchens above grade.
room_count = (
    data4["TotRmsAbvGrd"]
    + data4["FullBath"]
    + data4["HalfBath"]
    + data4["KitchenAbvGr"]
)
data4["SqFtPerRoom"] = data4["GrLivArea"] / room_count

# Overall quality and condition ratings added together.
data4['Total_Home_Quality'] = data4['OverallQual'] + data4['OverallCond']

# Bathroom count with half baths weighted 0.5, basement baths included.
above_grade_baths = data4['FullBath'] + (0.5 * data4['HalfBath'])
data4['Total_Bathrooms'] = (
    above_grade_baths
    + data4['BsmtFullBath']
    + (0.5 * data4['BsmtHalfBath'])
)

# First- plus second-floor square footage.
data4["HighQualSF"] = data4["1stFlrSF"] + data4["2ndFlrSF"]

In [None]:
data5=data4.copy()

# Feature Transformation

In [None]:
import scipy.stats
# Table with one row per numeric feature: its skewness, the absolute value of
# the skewness, and a flag set when that absolute value is at least 0.5.
numeric_features = data4.select_dtypes(np.number).columns
skew_df = pd.DataFrame({'Feature': numeric_features})
skew_df['Skew'] = [scipy.stats.skew(data4[feature]) for feature in numeric_features]
skew_df['Absolute'] = skew_df['Skew'].abs()
skew_df['Skewed'] = skew_df['Absolute'] >= 0.5
skew_df

In [None]:
data4[skew_df.query("Skewed == True")['Feature'].values].describe()


In [None]:
# Apply log(1 + x) to every feature flagged as skewed in skew_df.
for skewed_feature in skew_df.loc[skew_df['Skewed'], 'Feature']:
    data4[skewed_feature] = np.log1p(data4[skewed_feature])

In [None]:
import scipy.stats
# Rebuild the skew table from the current state of data4 (i.e. after the
# log1p transform) so the effect of the transform can be inspected.
numeric_features = data4.select_dtypes(np.number).columns
skew_df = pd.DataFrame({'Feature': numeric_features})
skew_df['Skew'] = [scipy.stats.skew(data4[feature]) for feature in numeric_features]
skew_df['Absolute'] = skew_df['Skew'].abs()
skew_df['Skewed'] = skew_df['Absolute'] >= 0.5
skew_df

In [None]:
# Cyclical encoding of the sale month: -cos(2*pi*month/12).
# The previous np.max(...) wrapper reduced the whole series to one scalar, which
# turned MoSold into a constant column; the transform is now kept element-wise.
# Assumes MoSold still holds the raw 1-12 month number at this point (i.e. it
# was not flagged as skewed and log-transformed above) — confirm.
data4['MoSold'] = -np.cos((2 * np.pi / 12) * data4['MoSold'])

In [None]:
data5 = data4.copy()

# Encoding Categorical

In [None]:
data5 = pd.get_dummies(data5)

# Data Scaling


In [None]:
from sklearn.preprocessing import StandardScaler
data6 = data5.copy()

In [None]:
# Standardise every column of the combined (train + test) frame, keeping the
# original index and column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data6)
data6 = pd.DataFrame(scaled_values, index=data6.index, columns=data6.columns)

In [None]:
data6

# Target Transformation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
target.hist()

In [None]:
# Side-by-side distribution of the raw and the log-transformed target, each
# with a fitted normal curve for comparison.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

sns.distplot(target, kde=True, fit=scipy.stats.norm, ax=ax_raw)
ax_raw.set_title("Without log transform")

sns.distplot(np.log(target), kde=True, fit=scipy.stats.norm, ax=ax_log)
ax_log.set_xlabel('Log sale price')
ax_log.set_title('With log transform')

plt.show()

In [None]:
log_target = np.log(target)
log_target

# Model Selection

In [None]:
# from pycaret.regression import setup,compare_models
# !pip install pyyaml==5.4.1

In [None]:
train_final = data6.loc[:train0.index.max(),:]

In [None]:
test_final=data6.loc[train0.index.max()+1:,:].reset_index(drop=True)

In [None]:
data7=data6.copy()

In [None]:
#setup(data=pd.concat([train_final,log_target],axis = 1),target='SalePrice')

In [None]:
#compare_models()

# Import all the models suggested by PyCaret

In [None]:
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge,HuberRegressor,Ridge,OrthogonalMatchingPursuit
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score




## Baseline model

In [None]:
baseline_model = CatBoostRegressor(verbose=0)

In [None]:
baseline_model.fit(train_final,log_target)

# Hyperparameter optimisation

In [None]:
import optuna
def br_objective(trial):
    """Optuna objective for tuning a ``BayesianRidge`` model.

    Samples the hyperparameters from ``trial``, scores the model with 10-fold
    cross-validation on ``train_final`` / ``log_target`` and returns the mean
    over folds of ``exp(RMSE)``, where the RMSE is computed on the log target.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validated score; lower is better.
    """
    params = {
        'n_iter': trial.suggest_int('n_iter', 50, 600),
        'tol': trial.suggest_loguniform('tol', 1e-8, 10),
        'alpha_1': trial.suggest_loguniform('alpha_1', 1e-8, 10),
        'alpha_2': trial.suggest_loguniform('alpha_2', 1e-8, 10),
        'lambda_1': trial.suggest_loguniform('lambda_1', 1e-8, 10),
        'lambda_2': trial.suggest_loguniform('lambda_2', 1e-8, 10),
    }
    model = BayesianRidge(**params)

    # Build the splitter locally: the notebook-level `kf` is only created in the
    # evaluation section further down, so relying on it here fails with a
    # NameError when the notebook is run top to bottom on a fresh kernel.
    cv = KFold(n_splits=10)

    # No explicit model.fit() beforehand: cross_val_score fits a fresh clone of
    # the estimator on every fold, so a prior fit has no effect on the score.
    neg_mse = cross_val_score(
        model,
        train_final,
        log_target,
        scoring='neg_mean_squared_error',
        cv=cv,
    )
    cv_score = np.exp(np.sqrt(-neg_mse))
    return np.mean(cv_score)

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(br_objective,n_trials=100)

In [None]:
study.best_params

# Bagging Ensemble

In [None]:
# Keyword arguments for CatBoostRegressor.
# NOTE(review): early_stopping_rounds presumably only has an effect when an
# eval_set is passed to fit(); the fit call in this notebook passes none — confirm.
catboost_params = {
    'iterations': 6000,
    'learning_rate': 0.005,
    'depth': 4,
    'l2_leaf_reg': 1,
    'eval_metric':'RMSE',
    'early_stopping_rounds': 200,
    'random_seed': 42
}

# Keyword arguments for BayesianRidge (same parameter names as the ones tuned
# in br_objective; presumably copied from an Optuna run — confirm).
br_params = {
    'n_iter': 323,
    'tol': 0.3899846038464359,
    'alpha_1': 0.000676617884149607,
    'alpha_2': 9.787731824756413,
    'lambda_1': 0.05406241772591694,
    'lambda_2': 3.7122435882708177e-07
}

# Keyword arguments for LGBMRegressor.
lightgbm_params = {
    'num_leaves': 39,
    'max_depth': 2,
    'learning_rate': 0.13705339989856127,
    'n_estimators': 273
}

# Keyword arguments for Ridge.
ridge_params = {
    'alpha': 631.1412445239156
}

In [None]:
models = {
    'catboost':CatBoostRegressor(**catboost_params,verbose=0),
    'br':BayesianRidge(**br_params),
    'lgbm':LGBMRegressor(**lightgbm_params),
    'ridge':Ridge(**ridge_params),
    'omp':OrthogonalMatchingPursuit()
}

In [None]:
# Fit every ensemble member on the full training set, reporting progress.
for model_name in models:
    models[model_name].fit(train_final, log_target)
    print(f'{model_name} trained.')

## Evaluate

In [None]:
# 10-fold cross-validation of every model; each entry of `results` holds the
# per-fold exp(RMSE) values, with the RMSE computed on the log target.
kf = KFold(n_splits=10)
results = {}
for model_name, estimator in models.items():
    neg_mse = cross_val_score(
        estimator,
        train_final,
        log_target,
        scoring='neg_mean_squared_error',
        cv=kf,
    )
    results[model_name] = np.exp(np.sqrt(-neg_mse))

In [None]:
results


In [None]:
# Overlay the per-fold CV error distribution of every model on ONE axis.
# sns.displot is a figure-level function: each call opened its own figure, so
# the figure created up front stayed empty and the title only reached the last
# plot. The axes-level sns.histplot draws onto the shared axis instead.
fig, ax = plt.subplots(figsize=(16, 10))
for name, result in results.items():
    sns.histplot(result, kde=True, bins=10, label=name, ax=ax)
ax.set_title('CV error Distributions')
ax.legend()  # the labels passed above are only shown once a legend is drawn
plt.show()

In [None]:
# Print, per model, a banner with its name followed by the mean and the
# standard deviation of its per-fold CV scores.
for model_name, fold_scores in results.items():
    banner = '___________\n' + model_name + '\n___________\n'
    print(banner)
    print(np.mean(fold_scores))
    print(np.std(fold_scores))

In [None]:
#np.exp(np.sqrt(np.mean(-results)))

# Combining predictions

In [None]:
# Weighted blend of the models' predictions. Each model predicts the log
# target, so predictions are exponentiated before being combined.
blend_weights = {
    'catboost': 0.4,
    'br': 0.2,
    'lgbm': 0.2,
    'ridge': 0.1,
    'omp': 0.1,
}
final_predictions = sum(
    weight * np.exp(models[model_name].predict(test_final))
    for model_name, weight in blend_weights.items()
)


In [None]:
submission=pd.concat([test_ids,pd.Series(final_predictions,name='SalePrice')],axis=1)
submission

In [None]:
       submission.to_csv('./submission004.csv',index=False,header=True)     
# Score noted by the author for a run of this blended submission: .12395
# (presumably a leaderboard score — confirm which run it belongs to).

# Make Submissions

In [None]:
# Make Submissions

In [None]:
predictions=np.exp(baseline_model.predict(test_final))

In [None]:
submission=pd.concat([test_ids,pd.Series(predictions,name='SalePrice')],axis=1)
submission


In [None]:
submission.to_csv('./submission.csv',index=False,header=True)
#001: .12395
#002: .12489
#003: .1226
#004: .1211