In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
#Importing the train and test data
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
df.head()
housing = df.copy()
housing_test = test_df.copy()

#Dropping Id as it contributes nothing and salesprice which is our target
df = df.drop(["Id","SalePrice"], axis=1)

#Dropping Id from test data
test_df = test_df.drop("Id", axis=1)

In [5]:
#Selecting the top 15 features for our model 
features = housing.corr(numeric_only=True)['SalePrice'].sort_values()
features_15 = features.sort_values(ascending=False).index[1:16]


In [6]:
# Splitting the numerical columns based on skewness
all_num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_skew = [col for col in all_num_cols if df[col].skew() < -1 or df[col].skew() > 1]
num_normal = [col for col in all_num_cols if col not in num_skew]

# Splittting the categorical columns based on no of unique features in them
cat_cols = df.select_dtypes(include='object').nunique().sort_values()
cat_col_ohe = [col for col, val in zip(cat_cols.index, cat_cols.values) if val<=9]
cat_col_ord = [col for col, val in zip(cat_cols.index, cat_cols.values) if val>=9]


In [7]:
#Creating pipelines for filling missing values
def pipeline():
    #pipeline for normal numerical columns
    num_pip = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("scale", StandardScaler())
    ]) 
    
    #pipeline for skewed numeric columns
    num_skew_pip = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("skew", PowerTransformer(method='yeo-johnson', standardize=True)),
        ("scale", StandardScaler())
    ])

    #Pipeline for one hot encoder
    cat_ohe_pip = Pipeline([
        ('imp', SimpleImputer(strategy='most_frequent')),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])

    # Pipeline for ordinal encoder
    cat_ord_pip = Pipeline([
        ('imp', SimpleImputer(strategy='most_frequent')),
        ('ord',  OrdinalEncoder())
    ])
   
    #Full pipeline consisting all ther pipelines
    full_pip = ColumnTransformer([
        ("num_col", num_pip, num_normal),
        ("num_skew", num_skew_pip, num_skew),
        ("ohe", cat_ohe_pip, cat_col_ohe),
        ("ord", cat_ord_pip, cat_col_ord)

    ])
    return full_pip 


In [8]:
#Transforming the data using the pipeline
full_pip = pipeline()
housing_prepared = full_pip.fit_transform(df)




In [9]:
#log Transforming the sales price as it is right skewed and high std
housing_labels = housing["SalePrice"].copy()
housing_price_transformed = np.log1p(housing_labels)

#Dict of models for testing purpouse
models = {"Linear Regression": LinearRegression(),
          "XG Boost": XGBRegressor(),
          "Decision Tree": DecisionTreeRegressor(),
          "Random Forest": RandomForestRegressor()}

# Calcualting RMSE scores for each model
for name, model in models.items():
    score = -cross_val_score(model,
                             housing_prepared,
                             housing_price_transformed,
                             scoring="neg_root_mean_squared_error",
                             cv=10
                               )
    print(f"The RMSE of {name} is {np.array(score).mean()}")

The RMSE of Linear Regression is 174858459.7585016
The RMSE of XG Boost is 0.13865930790582429
The RMSE of Decision Tree is 0.19677117371129688
The RMSE of Random Forest is 0.13846377647788483


In [10]:
# Hypterparameter tuning for the selected model
model = XGBRegressor(gamma=0, num_parallel_tree=2)
clf = GridSearchCV(model, {
    "num_parallel_tree":[1, 2, 3],
    "gamma":[0, 1, 2]
},
cv=5)
clf.fit(housing_prepared, housing_price_transformed)


In [11]:
# Now we will be training the model, preparing our test set and predict the house prices
housing_test_transformed = full_pip.transform(test_df)
model = XGBRegressor(gamma=0, num_parallel_tree=2)

#Training the model
model.fit(housing_prepared, housing_price_transformed)

#Predicted the log of house prices
house_price_log = model.predict(housing_test_transformed)


In [12]:
# Converting the log of sales prices to original values 
house_price_acc = np.exp(house_price_log)
submit = pd.DataFrame({"Id":housing_test["Id"].values,
                        "SalePrice":house_price_acc})

In [13]:
# Dumping the predicted values to a csv file 
submit.to_csv("submission.csv", index=False)
