## Importing libraries

In [1]:
import pandas as pd
import numpy as np

## Loading the dataset

In [2]:
# Read the training data and discard the 'Id' column in one expression,
# so re-running this cell always rebuilds `df` from the file.
df = pd.read_csv('/content/train.csv').drop(columns='Id')

# Regression target.
y = df['SalePrice']

## Selecting categorical columns

In [3]:
# Keep the object-dtype columns, then discard every column that contains
# at least one missing value in this frame.
dfo = df.select_dtypes(include='object').dropna(axis='columns')

## Selecting numerical columns

In [4]:
# Dtypes treated as numeric features.
numeric_lst = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Numeric columns of `df`, with the target removed so it is not used as a feature.
dfn = df.select_dtypes(include=numeric_lst).drop(columns='SalePrice')

## Combining categorical & numerical columns

In [5]:
# Place the categorical columns and the numeric columns side by side
# (categorical first) to form the feature frame.
final_df = pd.concat((dfo, dfn), axis='columns')

# Feature matrix: every row and every column of `final_df`.
X = final_df.iloc[:, :]

## Using a pipeline to encode categorical data & scale numerical data

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_features = dfn.columns
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

# Categorical branch: the columns in `dfo` were selected because they had no
# missing values in the training frame, but nothing guarantees that for data
# seen later at predict time. Impute with the most frequent category first so
# a missing value never reaches the encoder; on the training frame this
# imputer leaves the data unchanged. Categories unseen during fit are still
# ignored by the encoder.
categorical_features = dfo.columns
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns (selected by name) to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

## Regression

In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Full model: preprocessing followed by a gradient-boosted regressor.
model_steps = [
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(objective='reg:squarederror')),
]
regressor = Pipeline(steps=model_steps)

# Hold out 20 % of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    final_df, y, test_size=0.2, random_state=41
)

regressor.fit(X_train, y_train)

# Score on the held-out rows using the pipeline's default `score`
# (presumably R^2 for a regressor, not a classification accuracy).
regressor.score(X_test, y_test)

0.9044951761991489

## Cross-validation score (mean and standard deviation)

In [8]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the whole pipeline on the full feature frame.
# NOTE(review): no `scoring` is passed, so these values come from the
# estimator's default scorer (presumably R^2 for a regressor); the
# "Accuracy" label printed below is a misnomer kept only so the recorded
# output stays valid.
accuracies = cross_val_score(regressor, X, y=y, cv=10)

mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

Accuracy: 87.56 %
Standard Deviation: 4.20 %


## Features used by the model

In [9]:
# Column names of the numeric features followed by the categorical ones.
# Joining the two column indexes directly avoids `DataFrame.append`
# (removed in pandas 2.0) and avoids stacking both frames row-wise just to
# read their column names.
dfn.columns.append(dfo.columns)

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'MSZoning', 'Street', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

## Prediction

In [10]:
# Names of every feature column the pipeline was fitted on, as a plain list.
# The original passed the result of `DataFrame.append` (removed in pandas 2.0)
# straight to `usecols`; an explicit list of names is what `read_csv` expects.
feature_columns = list(dfn.columns.append(dfo.columns))

# Re-read only those columns and keep the single row at position 4 as a
# one-row DataFrame, which is the shape `predict` needs.
sample_row = pd.read_csv('/content/train.csv', usecols=feature_columns).iloc[4:5, :]

# Predicted sale price for that row.
regressor.predict(sample_row)[0]

293122.3

In [11]:
# Sample submission without its 'Id' column, leaving only the price column(s).
# NOTE(review): the sample submission presumably holds placeholder prices
# rather than ground truth - confirm before treating it as a reference.
sub_df = pd.read_csv('/content/sample_submission.csv').drop(columns='Id')

# Load only the columns the pipeline was fitted on. The column list is built
# from the two column indexes instead of `DataFrame.append`, which was removed
# in pandas 2.0.
feature_columns = list(dfn.columns.append(dfo.columns))
test_df = pd.read_csv('/content/test.csv', usecols=feature_columns)

# Show the sample-submission values next to the model's predictions.
predicted_df = pd.DataFrame(regressor.predict(test_df), columns=['predicted'])
pd.concat([sub_df, predicted_df], axis=1)

FileNotFoundError: ignored

In [None]:
# Score the fitted pipeline on the test features against the sample submission.
# NOTE(review): this is only meaningful if `sub_df` contains true sale prices;
# presumably it holds placeholder values - confirm before relying on the result.
regressor.score(X=test_df, y=sub_df)