# House price prediction using advanced regression techniques

In [None]:
#Importing relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training and testing CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/Users/Home/Downloads/ames.csv',
        'C:/Users/Home/Documents/UNCC/Big data/Housing price project/test_org.csv',
    )
)

In [None]:
# Print the column dtypes and non-null counts of the train and test data
for frame in (train, test):
    frame.info()

In [None]:
# Count the missing values in every column of the training data
train.isna().sum()

In [None]:
# Drop the variables with a higher number of null values from both data sets
sparse_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'GarageYrBlt']
train = train.drop(columns=sparse_columns)
test = test.drop(columns=sparse_columns)

In [None]:
# Fill the null values of the training data: float columns get the column
# mean, every other column gets its most frequent value.
def _fill_missing(column):
    """Return *column* with nulls replaced by its mean (floats) or its mode."""
    # is_float_dtype matches every float width, not only float64.
    if pd.api.types.is_float_dtype(column):
        return column.fillna(column.mean())
    counts = column.value_counts()
    # An entirely-null column has no most frequent value; leave it untouched
    # instead of failing with an IndexError on counts.index[0].
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


train = train.apply(_fill_missing)

In [None]:
# Fill the null values of the test data: float columns get the column mean,
# every other column gets its most frequent value. The statistics are
# computed from the test data itself.
# NOTE(review): consider reusing the training-set statistics here — confirm.
def _fill_missing(column):
    """Return *column* with nulls replaced by its mean (floats) or its mode."""
    # is_float_dtype matches every float width, not only float64.
    if pd.api.types.is_float_dtype(column):
        return column.fillna(column.mean())
    counts = column.value_counts()
    # An entirely-null column has no most frequent value; leave it untouched
    # instead of failing with an IndexError on counts.index[0].
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


test = test.apply(_fill_missing)

In [None]:
# Re-check dtypes and non-null counts after the null values were filled
for frame in (train, test):
    frame.info()

In [None]:
# Keep only the categorical (non-numeric) variables of the train and test data
train_cat, test_cat = (
    frame.select_dtypes(exclude=[np.number]) for frame in (train, test)
)

In [None]:
# Convert the categorical variables to binary dummy columns, dropping the
# first level of every variable.
# NOTE(review): the two frames are encoded separately, so drop_first may
# drop a different baseline level in each when their category sets differ —
# confirm this is acceptable or encode them together.
train_cat, test_cat = (
    pd.get_dummies(frame, drop_first=True) for frame in (train_cat, test_cat)
)

In [None]:
# Inspect the dummy-encoded frames
for frame in (train_cat, test_cat):
    frame.info()

In [None]:
# Keep only the dummy columns that are present in both data sets,
# in the order they appear in the training frame
shared_mask = train_cat.columns.isin(test_cat.columns)
common_columns = train_cat.columns[shared_mask].tolist()
# Restrict both frames to those shared columns
train_cat = train_cat.loc[:, common_columns]
test_cat = test_cat.loc[:, common_columns]

In [None]:
# Keep only the numeric variables of the train and test data
train_num, test_num = (
    frame.select_dtypes(include=[np.number]) for frame in (train, test)
)

In [None]:
# Place the numeric columns and the dummy columns side by side to create the
# final dataframes for the train and test data
train = pd.concat([train_num, train_cat], axis='columns')
test = pd.concat([test_num, test_cat], axis='columns')

In [None]:
# Pairwise correlations between all training variables (SalePrice included)
corr = train.corr()
corr.info()
# Leave the matrix as the last expression so the notebook displays it
corr

In [None]:
# Write the correlation matrix (with row labels and header) to a csv file
# for further analysis
corr.to_csv('corr.csv')

In [None]:
# Drop the variables having very low correlations with the SalePrice
low_corr_columns = [
    'Neighborhood_SawyerW',
    'HouseStyle_2.5Unf',
    'RoofStyle_Shed',
    'RoofMatl_Tar&Grv',
    'Foundation_Wood',
    'SaleType_CWD',
    'SaleType_ConLI',
]
train = train.drop(columns=low_corr_columns)
test = test.drop(columns=low_corr_columns)

In [None]:
# Prepare the model inputs. 'Id' was only removed from the test matrix, so a
# training frame that carries an 'Id' column ended up with one more feature
# than the test matrix; it is now removed from the training matrix as well
# when present.
x_train = train.drop(columns=['SalePrice']).drop(columns=['Id'], errors='ignore')
y_train = train['SalePrice']
# Selecting by x_train's columns gives the test matrix the same features in
# the same order and raises a KeyError early if one of them is missing.
x_test = test.drop(columns=['Id'])[x_train.columns]

In [None]:
#Importing XGBoost library
import xgboost as xgb

In [None]:
# Candidate values for the `booster` and `base_score` search parameters
booster = [
    'gbtree',
    'gblinear',
]
base_score = [
    0.25,
    0.5,
    0.75,
    1,
]

In [None]:
# Create an XGBoost regressor with its default settings; it is the base
# estimator handed to the hyper-parameter search.
regressor=xgb.XGBRegressor()

In [None]:
## Hyper Parameter Optimization
# Candidate values for each searched parameter
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]

# Grid of hyperparameters to search; `base_score` is defined in an earlier cell
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)



In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Set up the random search: 50 sampled parameter combinations, each scored
# with 5-fold cross validation on the negative mean absolute error
random_cv = RandomizedSearchCV(
    regressor,
    hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
    verbose=5,
    return_train_score=True,
    random_state=42,
)

In [None]:
# Run the randomized hyper-parameter search on the training data
random_cv.fit(x_train,y_train)

In [None]:
# Display the best estimator found by the random search
random_cv.best_estimator_

In [None]:
# Rebuild the regressor with fixed hyper-parameters (presumably copied from
# the random-search result above — TODO confirm).
# NOTE(review): 'reg:linear', `silent`, `nthread` and `seed` look like
# legacy names in newer XGBoost releases — confirm against the installed
# version before upgrading.
best_params = {
    'base_score': 0.25,
    'booster': 'gbtree',
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 2,
    'min_child_weight': 1,
    'missing': None,
    'n_estimators': 900,
    'n_jobs': 1,
    'nthread': None,
    'objective': 'reg:linear',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': None,
    'silent': True,
    'subsample': 1,
}
regressor = xgb.XGBRegressor(**best_params)

In [None]:
# Fit the XGBoost regressor on the training features and SalePrice target
regressor.fit(x_train,y_train)

In [None]:
# Predict SalePrice for the test features; despite its name, y_test holds
# model predictions, not ground-truth values
y_test = regressor.predict(x_test)

In [None]:
# Display the XGBoost predictions
y_test

In [None]:
# Wrap the XGBoost predictions in a single-column frame and show it
pred_saleprice5 = pd.DataFrame({'SalePrice': y_test})
print(pred_saleprice5)

In [None]:
# Save the XGBoost predictions (header row, no index column)
pred_saleprice5.to_csv('predicted_sp_xgb5.csv', index=False)

In [None]:
#Importing the random forest module
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Create the random forest regressor: 1000 trees, n_jobs=-1 for parallel
# execution, and a fixed random_state
rf_reg = RandomForestRegressor(n_estimators = 1000, n_jobs=-1, random_state = 42)

In [None]:
# Fit the random forest on the training features and SalePrice target
rf_reg.fit(x_train, y_train)

In [None]:
# Predict SalePrice with the random forest; this overwrites the XGBoost
# predictions previously stored in y_test
y_test = rf_reg.predict(x_test)

In [None]:
# Display the random forest predictions
y_test

In [None]:
# Wrap the random forest predictions in a single-column frame and show it
pred_saleprice3 = pd.DataFrame({'SalePrice': y_test})
print(pred_saleprice3)

In [None]:
# Save the random forest predictions (header row, no index column)
pred_saleprice3.to_csv('predicted_sp_rf.csv', index=False)