<a href="https://www.kaggle.com/code/mnik55/housing-prediction?scriptVersionId=202977937" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# description = "/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt"
# with open(description, 'r') as f:
#     content = f.read()
#     print(content)

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 105)
pd.set_option('display.max_rows', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Both competition files live under the same Kaggle input directory.
DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"
train_csv = f"{DATA_DIR}/train.csv"
test_csv = f"{DATA_DIR}/test.csv"

# Load both splits up front; later cells work on these two frames.
train_df, test_df = (pd.read_csv(path) for path in (train_csv, test_csv))

In [None]:
# Column dtypes and non-null counts for the training split.
train_df.info()

In [None]:
# First rows, to eyeball the raw feature values.
train_df.head()

In [None]:
# Distribution of the raw target with a KDE overlay.
sns.displot(train_df['SalePrice'], kde=True)

In [None]:
# Skewness of the raw target.
train_df['SalePrice'].skew()

In [None]:
# Distribution of the target after log1p.
sns.displot(np.log1p(train_df['SalePrice']), kde=True)

In [None]:
# Skewness of the target after log1p.
np.log1p(train_df['SalePrice']).skew()

In [None]:
# Split column names by dtype: 'object' columns are treated as categorical,
# everything else as numerical.
is_object_col = (train_df.dtypes == 'object').to_numpy()
numerical_cols = train_df.columns[~is_object_col].to_list()
categorical_cols = train_df.columns[is_object_col].to_list()

In [None]:
# The target and the row identifier are not predictors.
for non_feature in ('SalePrice', 'Id'):
    numerical_cols.remove(non_feature)

print(numerical_cols)
print(categorical_cols)

In [None]:
train_df.isna().sum().sort_values(ascending=False)

In [None]:
# Categorical columns whose missing values are replaced by an explicit 'None'
# level (e.g. no pool) in both splits.
cols_fillna = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu',
    'GarageQual', 'GarageCond', 'GarageFinish', 'GarageType', 'Electrical',
    'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
    'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2',
    'MSZoning', 'Utilities',
]

for frame in (train_df, test_df):
    for col in cols_fillna:
        frame.loc[:, col] = frame[col].fillna('None')

In [None]:
from math import ceil

# Grid of SalePrice-vs-feature scatter plots, one panel per numerical column.
nc = 3
nr = ceil(len(numerical_cols) / nc)  # rows follow nc, so changing nc keeps the grid consistent
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(18, 6 * nr), squeeze=False)

axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    sns.scatterplot(x=col, y='SalePrice', data=train_df, ax=ax)

# Hide trailing panels that have no column to show.
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
from math import ceil

# Same grid as a regression plot: each panel adds a fitted line to the scatter.
nc = 3
nr = ceil(len(numerical_cols) / nc)  # rows follow nc, so changing nc keeps the grid consistent
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(18, 6 * nr), squeeze=False)

axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    sns.regplot(x=col, y='SalePrice', data=train_df, ax=ax)

# Hide trailing panels that have no column to show.
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Cap extreme living-area values in the training split: anything above
# mean + 4 standard deviations is pulled down to that ceiling.
mean_grl = train_df['GrLivArea'].mean()
std_grl = train_df['GrLivArea'].std()

lower_cap = mean_grl - 3*std_grl  # computed for reference only; no lower cap is applied
upper_cap = mean_grl + 4*std_grl

# clip() returns a new float column, so the fractional cap is stored without
# writing a float into the integer column in place (which pandas deprecates).
train_df['GrLivArea'] = train_df['GrLivArea'].clip(upper=upper_cap)

In [None]:
sns.regplot(x='GrLivArea', y='SalePrice', data=train_df)

In [None]:
# Fill gaps in the numerical features with each column's training mean.
train_num_means = train_df[numerical_cols].mean()
train_df.loc[:, numerical_cols] = train_df[numerical_cols].fillna(train_num_means)

In [None]:
# The test split is filled with the training means, not its own.
train_num_means = train_df[numerical_cols].mean()
test_df.loc[:, numerical_cols] = test_df[numerical_cols].fillna(train_num_means)

In [None]:
# Remaining missing values per test column, largest first.
test_df.isna().sum().sort_values(ascending=False)

In [None]:
# Total missing values left in the categorical columns of the training split.
train_df[categorical_cols].isna().sum().sum()

In [None]:
# Total missing values left in the categorical columns of the test split.
test_df[categorical_cols].isna().sum().sum()

In [None]:
# train_df.loc[:, 'SalePrice'] = np.log1p(train_df['SalePrice'])

In [None]:
# train_df.loc[:, 'GrLivArea'] = np.log1p(train_df[])

In [None]:
# Grid of SalePrice box plots, one panel per categorical column.
nc = 3
nr = ceil(len(categorical_cols) / nc)  # rows follow nc, so changing nc keeps the grid consistent

fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(18, 6 * nr), squeeze=False)

axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    sns.boxplot(x=col, y='SalePrice', data=train_df, ax=ax)

# The column count need not be a multiple of nc; hide the leftover panels.
for ax in axes[len(categorical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Grid of SalePrice point plots, one panel per categorical column.
nc = 3
nr = ceil(len(categorical_cols) / nc)  # rows follow nc, so changing nc keeps the grid consistent

fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(18, 6 * nr), squeeze=False)

axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    sns.pointplot(x=col, y='SalePrice', data=train_df, ax=ax)

# The column count need not be a multiple of nc; hide the leftover panels.
for ax in axes[len(categorical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# raise SystemExit()

In [None]:
# (rows, columns) of the training split.
train_df.shape

In [None]:
# (rows, columns) of the test split.
test_df.shape

In [None]:
# Three random training rows.
train_df.sample(3)

In [None]:
# Dtypes and non-null counts of the training split at this point in the notebook.
train_df.info()

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
sns.heatmap(train_df.corr(numeric_only=True))

In [None]:
# Non-null counts for training columns with more than 500 missing values.
train_null = train_df.isna().sum()
# Use the actual row count rather than a hard-coded 1460.
len(train_df) - train_null[train_null > 500]

In [None]:
train_df.columns.values

In [None]:
# Non-null counts for test columns with more than 500 missing values.
test_null = test_df.isna().sum()
# Use the actual row count rather than a hard-coded 1459.
len(test_df) - test_null[test_null > 500]

In [None]:
# Hand-written column lists; the train list is the test list plus 'Id'.
# NOTE(review): neither list is referenced again in the visible notebook — confirm they are still needed.
test_null_col = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'MasVnrType']
train_null_col = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'MasVnrType']

In [None]:
# Correlation of every numeric column with the target; keep those whose
# absolute correlation exceeds 0.01 and show them in ascending order.
train_corr = train_df.corr(numeric_only=True)['SalePrice']
train_high_corr = train_corr[train_corr.abs() > 0.01]
train_high_corr.sort_values()

In [None]:
# Living area against sale price.
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train_df)

In [None]:
# MSZoning: price distribution per zone, shown four ways (violin, KDE, box, counts).
sns.violinplot(x='MSZoning', y='SalePrice', data=train_df)

In [None]:
sns.kdeplot(x='SalePrice', data=train_df, hue='MSZoning')

In [None]:
sns.boxplot(x='MSZoning', y='SalePrice', data=train_df)

In [None]:
sns.countplot(x='MSZoning', data=train_df)

In [None]:
# LotShape: price spread per level, then how many rows each level has.
sns.boxplot(x='LotShape', y='SalePrice', data=train_df)

In [None]:
sns.countplot(x='LotShape', data=train_df)

In [None]:
# Row counts per level for LandContour, Utilities, LandSlope and BldgType.
sns.countplot(x='LandContour', data=train_df)

In [None]:
sns.countplot(x='Utilities', data=train_df)

In [None]:
sns.countplot(x='LandSlope', data=train_df)

In [None]:
sns.countplot(x='BldgType', data=train_df)

In [None]:
# BldgType: price spread per level.
sns.boxplot(x='BldgType', y='SalePrice', data=train_df)

In [None]:
# Foundation: price spread per level, then row counts.
sns.boxplot(x='Foundation', y='SalePrice', data=train_df)

In [None]:
sns.countplot(x='Foundation', data=train_df)

In [None]:
# GarageCond: price spread per level, then row counts.
sns.boxplot(x='GarageCond', y='SalePrice', data=train_df)

In [None]:
sns.countplot(x='GarageCond', data=train_df)

In [None]:
# Fence: row counts, then price spread per level.
sns.countplot(x='Fence', data=train_df)

In [None]:
sns.boxplot(x='Fence', y='SalePrice', data=train_df)

In [None]:
# KitchenQual: price spread per level, then row counts.
sns.boxplot(x='KitchenQual', y='SalePrice', data=train_df)

In [None]:
sns.countplot(x='KitchenQual', data=train_df)

In [None]:
from sklearn.preprocessing import FunctionTransformer
# Wrap log1p in a scikit-learn transformer and apply it to the target column.
log_trf = FunctionTransformer(func=np.log1p)
saleprice = log_trf.transform(train_df['SalePrice'])

In [None]:
sns.kdeplot(x=saleprice)
plt.show()
sns.kdeplot(x='SalePrice', data=train_df)

In [None]:
# Replace the target with its log1p version. Assign the whole column instead of
# writing through .loc, so the integer column is swapped for a float one rather
# than having floats forced into it in place (deprecated by pandas).
train_df['SalePrice'] = saleprice.astype(float)

In [None]:
sns.kdeplot(x='GrLivArea', data=train_df)

In [None]:
# train_imp_cols = ['MSZoning', 'KitchenQual', 'LotShape', 'Foundation'] + train_high_corr.index.tolist()
# Selected columns: every categorical column plus the numeric columns kept by
# the correlation filter, minus the row identifier.
train_imp_cols = [
    name
    for name in categorical_cols + train_high_corr.index.tolist()
    if name != 'Id'
]
train_imp_cols

In [None]:
# Take an explicit copy: later cells assign into train_df_new through .loc, and
# doing that on a slice of train_df triggers SettingWithCopyWarning.
train_df_new = train_df[train_imp_cols].copy()

In [None]:
train_df_new.isna().sum()

In [None]:
# train_df_new.loc[:, 'LotFrontage'] = train_df_new['LotFrontage'].fillna(train_df_new['LotFrontage'].mean())
# Fill missing garage build years with the most frequent year in this frame.
garage_year_mode = train_df_new['GarageYrBlt'].mode().iloc[0]
train_df_new.loc[:, 'GarageYrBlt'] = train_df_new['GarageYrBlt'].fillna(garage_year_mode)
# train_df_new.loc[:, 'MasVnrArea'] = train_df_new['MasVnrArea'].fillna(train_df_new['MasVnrArea'].mean())

In [None]:
# Mean-impute the listed numeric columns; the imputer is fitted on the training
# frame here and reused for the test frame later.
imputer = SimpleImputer(strategy='mean')
imputer_cols = ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'GarageArea', 'GarageCars', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtUnfSF', 'BsmtHalfBath', 'BsmtFinSF2']
# Reuse the frame's own index: .loc assignment aligns on labels, so a default
# RangeIndex would only line up if train_df_new also happens to have one.
imputed_train = pd.DataFrame(
    imputer.fit_transform(train_df_new[imputer_cols]),
    columns=imputer_cols,
    index=train_df_new.index,
)
train_df_new.loc[:, imputer_cols] = imputed_train

In [None]:
train_df_new.isna().sum().sum()

In [None]:
# Test features are the training features without the target, plus the Id
# needed for the submission file. Filter by name instead of assuming that
# 'SalePrice' is the last entry of train_imp_cols.
test_imp_cols = ['Id'] + [name for name in train_imp_cols if name != 'SalePrice']
# Copy so later .loc assignments modify this frame rather than a slice of test_df.
test_df_new = test_df[test_imp_cols].copy()

In [None]:
# test_df_new.loc[:, 'TotalBsmtSF'] = test_df_new['TotalBsmtSF'].fillna(test_df_new['TotalBsmtSF'].mean())
# test_df_new.loc[:, 'GarageCars'] = test_df_new['GarageCars'].fillna(test_df_new['GarageCars'].mean())
# test_df_new.loc[:, 'GarageArea'] = test_df_new['GarageArea'].fillna(test_df_new['GarageArea'].mean())

# test_df_new.loc[:, 'LotFrontage'] = test_df_new['LotFrontage'].fillna(test_df_new['LotFrontage'].mean())
# test_df_new.loc[:, 'MasVnrArea'] = test_df_new['MasVnrArea'].fillna(test_df_new['MasVnrArea'].mean())
# Fill missing garage build years with the most frequent year in the test frame.
garage_year_mode_test = test_df_new['GarageYrBlt'].mode().iloc[0]
test_df_new.loc[:, 'GarageYrBlt'] = test_df_new['GarageYrBlt'].fillna(garage_year_mode_test)
# test_df_new.loc[:, 'BsmtFinSF1'] = test_df_new['BsmtFinSF1'].fillna(test_df_new['BsmtFinSF1'].mean())

In [None]:
test_df_new['GarageYrBlt'].mode().to_list()[0]

In [None]:
# Apply the already-fitted imputer to the test frame. Reuse the frame's own
# index so the label-aligned .loc assignment does not depend on test_df_new
# having a default RangeIndex.
imputed_test = pd.DataFrame(
    imputer.transform(test_df_new[imputer_cols]),
    columns=imputer_cols,
    index=test_df_new.index,
)
test_df_new.loc[:, imputer_cols] = imputed_test

In [None]:
# test_df_new['MSZoning'].value_counts()
train_df_new['KitchenQual'].value_counts()

In [None]:
# NOTE(review): 'MSZoning' and 'KitchenQual' are both listed in cols_fillna, so
# their NaNs in test_df were already replaced with 'None' before test_df_new
# was built; these two fills look like no-ops — confirm which default is intended.
test_df_new.loc[:, 'MSZoning'] = test_df_new['MSZoning'].fillna('RL')
test_df_new.loc[:, 'KitchenQual'] = test_df_new['KitchenQual'].fillna('TA')

In [None]:
test_df_new.isna().sum().sum()

In [None]:
train_df_new.sample(3)

In [None]:
# Scaler and the continuous columns it will be applied to.
scaler = StandardScaler()
scaler_cols = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
]

# Coerce those columns to numeric in both frames before scaling.
for frame in (train_df_new, test_df_new):
    for col in scaler_cols:
        frame.loc[:, col] = pd.to_numeric(frame[col])

In [None]:
# import warnings
# warnings.filterwarnings('ignore')

# Fit the scaler on the training frame and reuse it unchanged for the test frame.
# Assigning the ndarray to whole columns replaces integer columns with float
# ones (no in-place dtype clash) and fills positionally, so the result does not
# depend on the frames' index labels.
train_df_new[scaler_cols] = scaler.fit_transform(train_df_new[scaler_cols])
test_df_new[scaler_cols] = scaler.transform(test_df_new[scaler_cols])

In [None]:
train_df_new.info()

In [None]:
# Log-transform the living area in both frames. A constant 10 is added before
# log1p — presumably to keep the argument positive after scaling; TODO confirm.
for frame in (train_df_new, test_df_new):
    frame.loc[:, 'GrLivArea'] = np.log1p(10 + frame['GrLivArea'])

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# one = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# # one_cols = ['LotShape', 'Foundation', 'MSZoning']
# one_cols = categorical_cols
# train_df_new = pd.concat([train_df_new, pd.DataFrame(one.fit_transform(train_df_new[one_cols]), columns=one.get_feature_names_out(one_cols))], axis=1)
# test_df_new = pd.concat([test_df_new, pd.DataFrame(one.transform(test_df_new[one_cols]), columns=one.get_feature_names_out(one_cols))], axis=1)

In [None]:
# train_df_new = train_df_new.drop(one_cols, axis=1)

In [None]:
# test_df_new = test_df_new.drop(one_cols, axis=1)

In [None]:
# Ordinal-encode every categorical column into a new "<name>_e" column, then
# drop the original string columns. Categories not seen during fit map to 40.
ond_cols = categorical_cols
encoded_cols = [f"{col}_e" for col in ond_cols]
ond = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=40)
# Assign the encoder output as an ndarray: it is written positionally, so the
# result does not depend on the frames having a default RangeIndex.
train_df_new[encoded_cols] = ond.fit_transform(train_df_new[ond_cols])
test_df_new[encoded_cols] = ond.transform(test_df_new[ond_cols])

train_df_new = train_df_new.drop(ond_cols, axis=1)
test_df_new = test_df_new.drop(ond_cols, axis=1)

In [None]:
test_df_new.tail(3)

In [None]:
# ond_cols = ['KitchenQual']
# ond = OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']])
# train_df_new[ond_cols] = pd.DataFrame(ond.fit_transform(train_df_new[ond_cols]), columns=ond_cols)

In [None]:
# test_df_new[ond_cols] = pd.DataFrame(ond.transform(test_df_new[ond_cols]), columns=ond_cols)

In [None]:
# from sklearn.decomposition import PCA
# pc = PCA(n_components=50)
# train_df_pca = pd.concat([pd.DataFrame(pc.fit_transform(train_df_new.drop(['SalePrice'], axis=1))), train_df_new['SalePrice']], axis=1)
# test_df_pca = pd.concat([pd.DataFrame(pc.transform(test_df_new.drop(['Id'], axis=1))), test_df_new['Id']], axis=1)

In [None]:
# train_df_new = train_df_pca

In [None]:
# test_df_new = test_df_pca

In [None]:
from sklearn.metrics import mean_squared_error
def log_rmse(y, y_pred):
    """Return the RMSE between the natural logs of ``y`` and ``y_pred``.

    Both arguments are passed through ``np.log`` before the squared error is
    computed, so they are expected to be strictly positive.
    """
    log_true = np.log(y)
    log_pred = np.log(y_pred)
    mse = mean_squared_error(log_true, log_pred)
    return np.sqrt(mse)

In [None]:
# Separate the target column from the predictors.
target_name = 'SalePrice'
y = train_df_new[target_name]
x = train_df_new.drop(columns=[target_name])

In [None]:
x.shape

In [None]:
y.describe()

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=2)

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares on the log1p-transformed target.
lg = LinearRegression()
lg.fit(x_train, y_train)
y_pred = lg.predict(x_test)
# y_test and y_pred are log1p(SalePrice). Undo that before calling log_rmse,
# which applies the log itself; otherwise the score is a log-of-log error.
log_rmse(np.expm1(y_test), np.expm1(y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with default hyper-parameters; the seed is fixed so the
# hold-out score is reproducible across runs.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train, y_train)
y_pred = gbr.predict(x_test)
# y_test and y_pred are log1p(SalePrice). Undo that before calling log_rmse,
# which applies the log itself; otherwise the score is a log-of-log error.
log_rmse(np.expm1(y_test), np.expm1(y_pred))

In [None]:
y_test.describe()

In [None]:
y_test.shape

In [None]:
best_model = gbr

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyper-parameters; the seed is fixed so the
# hold-out score is reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
# y_test and y_pred are log1p(SalePrice). Undo that before calling log_rmse,
# which applies the log itself; otherwise the score is a log-of-log error.
log_rmse(np.expm1(y_test), np.expm1(y_pred))

In [None]:
import xgboost as xgb
# XGBoost: many shallow trees with a small learning rate and row/column subsampling.
reg = xgb.XGBRegressor(learning_rate=0.01,
                       n_estimators=3000,
                       max_depth=3,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       n_jobs=-1,  # scikit-learn style name; replaces the deprecated nthread
                       scale_pos_weight=1,
                       reg_alpha=0.00006,
                       random_state=42)  # single seed; the conflicting seed=27 was dropped
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# y_test and y_pred are log1p(SalePrice). Undo that before calling log_rmse,
# which applies the log itself; otherwise the score is a log-of-log error.
log_rmse(np.expm1(y_test), np.expm1(y_pred))

In [None]:
# from sklearn.model_selection import GridSearchCV

# params = {
#     'max_depth': [3,5,7],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.7, .8, 1],
# }

# grid = GridSearchCV(xgb.XGBRegressor(), params, scoring="neg_mean_squared_error", n_jobs=-1, cv=3)
# grid.fit(x_train, y_train)
# y_pred = grid.best_estimator_.predict(x_test)
# log_rmse(y_test, y_pred)

In [None]:
from sklearn.linear_model import Lasso
# L1-regularised linear model on the log1p-transformed target.
las = Lasso(alpha=0.01)
las.fit(x_train, y_train)
y_pred = las.predict(x_test)
# y_test and y_pred are log1p(SalePrice). Undo that before calling log_rmse,
# which applies the log itself; otherwise the score is a log-of-log error.
log_rmse(np.expm1(y_test), np.expm1(y_pred))

In [None]:
# from sklearn.svm import SVR
# sv = SVR()
# sv.fit(x_train, y_train)
# y_pred = sv.predict(x_test)
# log_rmse(y_test, y_pred)

In [None]:
# Model used for the submission below. It is picked by hand (not from the
# scores computed above) and replaces the earlier `best_model = gbr`.
best_model = las

In [None]:
# Build the submission: keep the Ids, predict on the remaining columns and
# invert the log1p applied to the training target.
id = test_df_new['Id']  # name kept as-is because a later cell reads `id`
test_df_x = test_df_new.drop(columns=['Id'])

test_pred = np.expm1(best_model.predict(test_df_x))
df_sub = pd.DataFrame({'Id': id, 'SalePrice': test_pred})

df_sub.to_csv('submission.csv', index=False)

In [None]:
df_sub.head()

In [None]:
# Stops execution here, after submission.csv has been written. The cells below
# read a separate ames-housing dataset.
# NOTE(review): looks like a deliberate "Run All" stop before scratch cells — confirm.
raise SystemExit()

In [None]:
df_sub.head()

In [None]:
original = pd.read_csv("/kaggle/input/ames-housing/AmesHousing.csv")

In [None]:
original.head(1)

In [None]:
ids = id.to_list()

In [None]:
# NOTE(review): .loc selects the rows of `original` whose index label equals
# each test Id. That is only the matching house if AmesHousing.csv rows line up
# with the competition Ids — confirm before trusting the resulting score.
y_orig = original.loc[ids, 'SalePrice'].reset_index(drop=True)

In [None]:
log_rmse(y_orig, np.expm1(best_model.predict(test_df_x)))

In [None]:
y_orig

In [None]:
np.expm1(best_model.predict(test_df_x))